Browse Source

add missing barrier for transfer dst, fix softmax pack4, fix #932

tags/20190611
nihuini 7 years ago
parent
commit
5d86014d9c
6 changed files with 43 additions and 58 deletions
  1. src/command.cpp (+2, -0)
  2. src/layer/shader/softmax_div_sum_pack4.comp (+7, -19)
  3. src/layer/shader/softmax_exp_sub_max_pack4.comp (+7, -7)
  4. src/layer/shader/softmax_reduce_max_pack4.comp (+7, -10)
  5. src/layer/shader/softmax_reduce_sum_pack4.comp (+7, -10)
  6. src/layer/softmax.cpp (+13, -12)

+ 2
- 0
src/command.cpp View File

@@ -224,6 +224,7 @@ void VkCompute::record_download(const VkMat& m)
void VkCompute::record_clone(const VkMat& src, const VkMat& dst)
{
record_prepare_transfer_barrier(src);
record_prepare_transfer_barrier(dst);

if (vkdev->info.support_VK_KHR_push_descriptor)
return copy_buffer(src.buffer(), src.buffer_offset(), dst.buffer(), dst.buffer_offset(), src.total() * src.elemsize);
@@ -249,6 +250,7 @@ void VkCompute::record_copy_region(const VkMat& src, const VkMat& dst, const VkB
void VkCompute::record_copy_regions(const VkMat& src, const VkMat& dst, const std::vector<VkBufferCopy>& regions)
{
record_prepare_transfer_barrier(src);
record_prepare_transfer_barrier(dst);

if (vkdev->info.support_VK_KHR_push_descriptor)
return copy_buffer_regions(src.buffer(), dst.buffer(), regions);


+ 7
- 19
src/layer/shader/softmax_div_sum_pack4.comp View File

@@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; };
layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; };

layout (push_constant) uniform parameter
{
@@ -56,54 +56,42 @@ void main()

if (p.dims == 1) // axis == 0
{
afp sum = afp(sum_workspace_data[0]);
afpvec4 v = afpvec4(bottom_top_blob_data[gx]);
bottom_top_blob_data[gx] = sfpvec4(v / sum);
bottom_top_blob_data[gx] = sfpvec4(afpvec4(bottom_top_blob_data[gx]) / afpvec4(sum_workspace_data[0]));
return;
}

if (p.dims == 2 && axis == 0)
{
int gi = gy * p.w + gx;
afp sum = afp(sum_workspace_data[gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum);
bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gx]));
return;
}

if (p.dims == 2 && axis == 1)
{
int gi = gy * p.w + gx;
afp sum = afp(sum_workspace_data[gy]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum);
bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy]));
return;
}

if (p.dims == 3 && axis == 0)
{
int gi = gz * p.cstep + gy * p.w + gx;
afp sum = afp(sum_workspace_data[gy * p.w + gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum);
bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gy * p.w + gx]));
return;
}

if (p.dims == 3 && axis == 1)
{
int gi = gz * p.cstep + gy * p.w + gx;
afp sum = afp(sum_workspace_data[gz * p.w + gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum);
bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.w + gx]));
return;
}

if (p.dims == 3 && axis == 2)
{
int gi = gz * p.cstep + gy * p.w + gx;
afp sum = afp(sum_workspace_data[gz * p.h + gy]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum);
bottom_top_blob_data[gi] = sfpvec4(afpvec4(bottom_top_blob_data[gi]) / afpvec4(sum_workspace_data[gz * p.h + gy]));
return;
}
}

+ 7
- 7
src/layer/shader/softmax_exp_sub_max_pack4.comp View File

@@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; };
layout (binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; };

layout (push_constant) uniform parameter
{
@@ -56,42 +56,42 @@ void main()

if (p.dims == 1) // axis == 0
{
bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afp(max_workspace_data[0])));
bottom_top_blob_data[gx] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gx]) - afpvec4(max_workspace_data[0])));
return;
}

if (p.dims == 2 && axis == 0)
{
int gi = gy * p.w + gx;
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gx])));
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gx])));
return;
}

if (p.dims == 2 && axis == 1)
{
int gi = gy * p.w + gx;
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy])));
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy])));
return;
}

if (p.dims == 3 && axis == 0)
{
int gi = gz * p.cstep + gy * p.w + gx;
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gy * p.w + gx])));
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gy * p.w + gx])));
return;
}

if (p.dims == 3 && axis == 1)
{
int gi = gz * p.cstep + gy * p.w + gx;
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.w + gx])));
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.w + gx])));
return;
}

if (p.dims == 3 && axis == 2)
{
int gi = gz * p.cstep + gy * p.w + gx;
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afp(max_workspace_data[gz * p.h + gy])));
bottom_top_blob_data[gi] = sfpvec4(exp(afpvec4(bottom_top_blob_data[gi]) - afpvec4(max_workspace_data[gz * p.h + gy])));
return;
}
}

+ 7
- 10
src/layer/shader/softmax_reduce_max_pack4.comp View File

@@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; };
layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; };

layout (push_constant) uniform parameter
{
@@ -62,7 +62,7 @@ void main()
max_value = max(max_value, afpvec4(bottom_top_blob_data[i]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[0] = sfp(max(max2.r, max2.g));
max_workspace_data[0] = sfpvec4(max(max2.r, max2.g));

return;
}
@@ -76,7 +76,7 @@ void main()
max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[gx] = sfp(max(max2.r, max2.g));
max_workspace_data[gx] = sfpvec4(max(max2.r, max2.g));

return;
}
@@ -89,8 +89,7 @@ void main()
int v_offset = gx * p.w + i;
max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[gx] = sfp(max(max2.r, max2.g));
max_workspace_data[gx] = sfpvec4(max_value);

return;
}
@@ -104,7 +103,7 @@ void main()
max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g));
max_workspace_data[gy * p.w + gx] = sfpvec4(max(max2.r, max2.g));

return;
}
@@ -117,8 +116,7 @@ void main()
int v_offset = gy * p.cstep + i * p.w + gx;
max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[gy * p.w + gx] = sfp(max(max2.r, max2.g));
max_workspace_data[gy * p.w + gx] = sfpvec4(max_value);

return;
}
@@ -131,8 +129,7 @@ void main()
int v_offset = gy * p.cstep + gx * p.w + i;
max_value = max(max_value, afpvec4(bottom_top_blob_data[v_offset]));
}
afpvec2 max2 = max(max_value.rg, max_value.ba);
max_workspace_data[gy * p.h + gx] = sfp(max(max2.r, max2.g));
max_workspace_data[gy * p.h + gx] = sfpvec4(max_value);

return;
}


+ 7
- 10
src/layer/shader/softmax_reduce_sum_pack4.comp View File

@@ -28,7 +28,7 @@ layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; };
layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; };

layout (push_constant) uniform parameter
{
@@ -62,7 +62,7 @@ void main()
sum_value += afpvec4(bottom_top_blob_data[i]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[0] = sfp(sum2.r + sum2.g);
sum_workspace_data[0] = sfpvec4(sum2.r + sum2.g);

return;
}
@@ -76,7 +76,7 @@ void main()
sum_value += afpvec4(bottom_top_blob_data[v_offset]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[gx] = sfp(sum2.r + sum2.g);
sum_workspace_data[gx] = sfpvec4(sum2.r + sum2.g);

return;
}
@@ -89,8 +89,7 @@ void main()
int v_offset = gx * p.w + i;
sum_value += afpvec4(bottom_top_blob_data[v_offset]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[gx] = sfp(sum2.r + sum2.g);
sum_workspace_data[gx] = sfpvec4(sum_value);

return;
}
@@ -104,7 +103,7 @@ void main()
sum_value += afpvec4(bottom_top_blob_data[v_offset]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[ gy * p.w + gx ] = sfp(sum2.r + sum2.g);
sum_workspace_data[ gy * p.w + gx ] = sfpvec4(sum2.r + sum2.g);

return;
}
@@ -117,8 +116,7 @@ void main()
int v_offset = gy * p.cstep + i * p.w + gx;
sum_value += afpvec4(bottom_top_blob_data[v_offset]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[gy * p.w + gx] = sfp(sum2.r + sum2.g);
sum_workspace_data[gy * p.w + gx] = sfpvec4(sum_value);

return;
}
@@ -131,8 +129,7 @@ void main()
int v_offset = gy * p.cstep + gx * p.w + i;
sum_value += afpvec4(bottom_top_blob_data[v_offset]);
}
afpvec2 sum2 = sum_value.rg + sum_value.ba;
sum_workspace_data[gy * p.h + gx] = sfp(sum2.r + sum2.g);
sum_workspace_data[gy * p.h + gx] = sfpvec4(sum_value);

return;
}


+ 13
- 12
src/layer/softmax.cpp View File

@@ -412,6 +412,7 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
size_t elemsize = bottom_top_blob.elemsize;
int packing = bottom_top_blob.packing;

VkMat max_workspace;
@@ -419,33 +420,33 @@ int Softmax::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Optio

if (dims == 1) // axis == 0
{
max_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(1, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(1, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}
else if (dims == 2 && axis == 0)
{
max_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}
else if (dims == 2 && axis == 1)
{
max_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}
else if (dims == 3 && axis == 0)
{
max_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, h, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, h, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}
else if (dims == 3 && axis == 1)
{
max_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(w, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}
else if (dims == 3 && axis == 2)
{
max_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, channels, 4u, opt.workspace_vkallocator, opt.staging_vkallocator);
max_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
sum_workspace.create(h, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
}

// fprintf(stderr, "Softmax::forward_inplace %p\n", bottom_top_blob.buffer());


Loading…
Cancel
Save