Browse Source

set localsize as multiple of subgroup size (#2483)

* fix innerproduct gemm vulkan
pull/6116/head
nihui GitHub 11 months ago
parent
commit
8a2eab1114
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
4 changed files with 100 additions and 4 deletions
  1. +1
    -1
      src/layer/vulkan/shader/innerproduct_gemm_wp4to8.comp
  2. +1
    -1
      src/layer/vulkan/shader/innerproduct_gemm_wp8.comp
  3. +1
    -1
      src/layer/vulkan/shader/innerproduct_gemm_wp8to4.comp
  4. +97
    -1
      src/pipeline.cpp

+ 1
- 1
src/layer/vulkan/shader/innerproduct_gemm_wp4to8.comp View File

@@ -68,7 +68,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= 1)
if (gx * 8 >= psc(outw) || gy >= psc(outh) || gz >= 1)
return;

afpvec8 sum;


+ 1
- 1
src/layer/vulkan/shader/innerproduct_gemm_wp8.comp View File

@@ -68,7 +68,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= 1)
if (gx * 8 >= psc(outw) || gy >= psc(outh) || gz >= 1)
return;

afpvec8 sum;


+ 1
- 1
src/layer/vulkan/shader/innerproduct_gemm_wp8to4.comp View File

@@ -68,7 +68,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= 1)
if (gx * 4 >= psc(outw) || gy >= psc(outh) || gz >= 1)
return;

afpvec4 sum;


+ 97
- 1
src/pipeline.cpp View File

@@ -122,13 +122,109 @@ void Pipeline::set_subgroup_size(uint32_t subgroup_size)
d->subgroup_size = subgroup_size;
}

static int count_trailing_zeros(unsigned int v)
{
int cnt = 0;
while ((v & 1) == 0)
{
cnt++;
v >>= 1;
}
return cnt;
}

// round up v to the next multiple of 2^k
static unsigned int round_up_pow2_mul(unsigned int v, int k)
{
unsigned int m = 1u << k;
return ((v + m - 1) / m) * m;
}

// adjust x, y, z so that new x * y * z is a multiple of size (size must be a power of two), and new x, y, z are no less than the inputs
// new values do not have to be integer multiples of the originals
// minimize the total increment (x'-x)+(y'-y)+(z'-z)
// additional constraint: if original y is 1, prefer not to adjust y; if original z is 1, prefer not to adjust z
static void adjust_xyz(int& x, int& y, int& z, const int subgroup_size)
{
if (x * y * z % subgroup_size == 0)
return;

int target_n = 0;
{
while ((1 << target_n) != subgroup_size)
target_n++;
}

// subgroup shall usually be 4 ~ 128, sanitize the max possible size
target_n = std::min(target_n, 10);

const int tx = count_trailing_zeros((unsigned int)x);
const int ty = count_trailing_zeros((unsigned int)y);
const int tz = count_trailing_zeros((unsigned int)z);
const int tn = tx + ty + tz;

const int need = target_n - tn;

if (z == 1)
{
if (y == 1)
{
// adjust x only
x = round_up_pow2_mul((unsigned int)x, target_n);
}
else if (x == 1)
{
// adjust y only
y = round_up_pow2_mul((unsigned int)y, target_n);
}
else
{
// adjust x and y
y = round_up_pow2_mul((unsigned int)y, ty + need / 2);
x = round_up_pow2_mul((unsigned int)x, tx + need - need / 2);
}
}
else if (y == 1)
{
if (x == 1)
{
// adjust z only
z = round_up_pow2_mul((unsigned int)z, target_n);
}
else
{
// adjust x and z
z = round_up_pow2_mul((unsigned int)z, tz + need / 2);
x = round_up_pow2_mul((unsigned int)x, tx + need - need / 2);
}
}
else if (x == 1)
{
// adjust y and z
z = round_up_pow2_mul((unsigned int)z, tz + need / 2);
y = round_up_pow2_mul((unsigned int)y, ty + need - need / 2);
}
else
{
// adjust x y z
z = round_up_pow2_mul((unsigned int)z, tz + need / 3);
y = round_up_pow2_mul((unsigned int)y, ty + (need - need / 3) / 2);
x = round_up_pow2_mul((unsigned int)x, tx + need - (need - need / 3) / 2);
}
}

void Pipeline::set_local_size_xyz(int w, int h, int c)
{
// dispatch at least one subgroup
// make local size be multiple of subgroup size
// and metal is unhappy with arbitrary local size anyway
adjust_xyz(w, h, c, d->subgroup_size);

d->local_size_x = w;
d->local_size_y = h;
d->local_size_z = c;

// NCNN_LOGE("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
// NCNN_LOGE("local size = %d %d %d", local_size_x, local_size_y, local_size_z);
}

int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations)


Loading…
Cancel
Save