|
|
@@ -14,9 +14,9 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp |
|
|
|
|
|
|
|
|
if (X >= H || Y >= W) return; |
|
|
if (X >= H || Y >= W) return; |
|
|
|
|
|
|
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); |
|
|
sum += exp(t.x); |
|
|
sum += exp(t.x); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
@@ -24,14 +24,15 @@ __kernel void SoftMax_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *outp |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); |
|
|
t = divide_no_check(exp(t), sum); |
|
|
t = divide_no_check(exp(t), sum); |
|
|
__global FLT *output_flt = (__global FLT *)output; |
|
|
__global FLT *output_flt = (__global FLT *)output; |
|
|
output_flt += (X * W + Y) * C + 4 * d; |
|
|
output_flt += (X * W + Y) * C + 4 * d; |
|
|
output_flt[0] = t.x; |
|
|
|
|
|
if (d * 4 + 1 < C) output_flt[1] += t.y; |
|
|
|
|
|
if (d * 4 + 2 < C) output_flt[2] += t.z; |
|
|
|
|
|
if (d * 4 + 3 < C) output_flt[3] += t.w; |
|
|
|
|
|
|
|
|
FLT4 result = TO_FLT4(t); |
|
|
|
|
|
output_flt[0] = result.x; |
|
|
|
|
|
if (d * 4 + 1 < C) output_flt[1] += result.y; |
|
|
|
|
|
if (d * 4 + 2 < C) output_flt[2] += result.z; |
|
|
|
|
|
if (d * 4 + 3 < C) output_flt[3] += result.w; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@@ -45,9 +46,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2 |
|
|
|
|
|
|
|
|
if (X >= H || Y >= W) return; |
|
|
if (X >= H || Y >= W) return; |
|
|
|
|
|
|
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); |
|
|
sum += exp(t.x); |
|
|
sum += exp(t.x); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
@@ -55,9 +56,9 @@ __kernel void SoftMax_NHWC4_IMG(__read_only image2d_t input, __write_only image2 |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y * S + d, X))); |
|
|
t = exp(t) / sum; |
|
|
t = exp(t) / sum; |
|
|
WRITE_IMAGE(output, (int2)(Y * S + d, X), t); |
|
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(Y * S + d, X), TO_FLT4(t)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@@ -71,9 +72,9 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out |
|
|
|
|
|
|
|
|
if (X >= H || Y >= W) return; |
|
|
if (X >= H || Y >= W) return; |
|
|
|
|
|
|
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); |
|
|
sum += exp(t.x); |
|
|
sum += exp(t.x); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
@@ -81,14 +82,15 @@ __kernel void SoftMax_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *out |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); |
|
|
t = divide_no_check(exp(t), sum); |
|
|
t = divide_no_check(exp(t), sum); |
|
|
__global FLT *output_flt = (__global FLT *)output; |
|
|
__global FLT *output_flt = (__global FLT *)output; |
|
|
output_flt += (X * W + Y) * C + 4 * d; |
|
|
output_flt += (X * W + Y) * C + 4 * d; |
|
|
output_flt[0] = t.x; |
|
|
|
|
|
if (d * 4 + 1 < C) output_flt[1] += t.y; |
|
|
|
|
|
if (d * 4 + 2 < C) output_flt[2] += t.z; |
|
|
|
|
|
if (d * 4 + 3 < C) output_flt[3] += t.w; |
|
|
|
|
|
|
|
|
FLT4 result = TO_FLT4(t); |
|
|
|
|
|
output_flt[0] = result.x; |
|
|
|
|
|
if (d * 4 + 1 < C) output_flt[1] += result.y; |
|
|
|
|
|
if (d * 4 + 2 < C) output_flt[2] += result.z; |
|
|
|
|
|
if (d * 4 + 3 < C) output_flt[3] += result.w; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
@@ -102,9 +104,9 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image |
|
|
|
|
|
|
|
|
if (X >= H || Y >= W) return; |
|
|
if (X >= H || Y >= W) return; |
|
|
|
|
|
|
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); |
|
|
sum += exp(t.x); |
|
|
sum += exp(t.x); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 1 < C) sum += exp(t.y); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
if (d * 4 + 2 < C) sum += exp(t.z); |
|
|
@@ -112,51 +114,51 @@ __kernel void SoftMax_NC4HW4_IMG(__read_only image2d_t input, __write_only image |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
for (int d = 0; d < S; ++d) { |
|
|
for (int d = 0; d < S; ++d) { |
|
|
FLT4 t = READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X)); |
|
|
|
|
|
|
|
|
float4 t = convert_float4(READ_IMAGE(input, smp_zero, (int2)(Y, d * H + X))); |
|
|
t = exp(t) / sum; |
|
|
t = exp(t) / sum; |
|
|
WRITE_IMAGE(output, (int2)(Y, d * H + X), t); |
|
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(Y, d * H + X), TO_FLT4(t)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, |
|
|
__kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, |
|
|
const int slices, const int slices_x32) { |
|
|
const int slices, const int slices_x32) { |
|
|
int tid = get_local_id(0); |
|
|
int tid = get_local_id(0); |
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0)); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); |
|
|
|
|
|
sum += dot((float4)(1.0f), exp(src)); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); |
|
|
|
|
|
|
|
|
|
|
|
sum += dot(TO_FLT4(mask), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); |
|
|
|
|
|
sum += dot(convert_float4(mask), exp(src)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__local FLT4 tmp[8]; |
|
|
|
|
|
__local FLT *tmpx1 = (__local FLT *)tmp; |
|
|
|
|
|
|
|
|
__local float4 tmp[8]; |
|
|
|
|
|
__local float *tmpx1 = (__local float *)tmp; |
|
|
tmpx1[tid] = sum; |
|
|
tmpx1[tid] = sum; |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
if (tid == 0) { |
|
|
if (tid == 0) { |
|
|
sum = dot((FLT4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[7]); |
|
|
|
|
|
|
|
|
sum = dot((float4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[7]); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
} |
|
|
} |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
sum = tmpx1[0]; |
|
|
sum = tmpx1[0]; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); |
|
|
|
|
|
|
|
|
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); |
|
|
result = exp(result) * sum; |
|
|
result = exp(result) * sum; |
|
|
output[i] = result; |
|
|
|
|
|
|
|
|
output[i] = TO_FLT4(result); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); |
|
|
|
|
|
result = exp(result) * sum; |
|
|
|
|
|
|
|
|
float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); |
|
|
|
|
|
result_float = exp(result_float) * sum; |
|
|
|
|
|
FLT4 result = TO_FLT4(result_float); |
|
|
__global FLT4 *remain_ptr4 = output; |
|
|
__global FLT4 *remain_ptr4 = output; |
|
|
remain_ptr4 += slices - 1; |
|
|
remain_ptr4 += slices - 1; |
|
|
__global FLT *remain_ptr = (__global FLT *)remain_ptr4; |
|
|
__global FLT *remain_ptr = (__global FLT *)remain_ptr4; |
|
|
@@ -176,80 +178,81 @@ __kernel void SoftMax1x1_NHWC4_BUF(__read_only image2d_t input, __global FLT4 *o |
|
|
__kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, |
|
|
__kernel void SoftMax1x1_NHWC4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, |
|
|
const int slices, const int slices_x32) { |
|
|
const int slices, const int slices_x32) { |
|
|
int tid = get_local_id(0); |
|
|
int tid = get_local_id(0); |
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(i, 0)); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); |
|
|
|
|
|
sum += dot((float4)(1.0f), exp(src)); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(slices - 1, 0))); |
|
|
|
|
|
|
|
|
sum += dot(TO_FLT4(mask), exp(src)); |
|
|
|
|
|
|
|
|
sum += dot(convert_float4(mask), exp(src)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__local FLT4 tmp[8]; |
|
|
|
|
|
__local FLT *tmpx1 = (__local FLT *)tmp; |
|
|
|
|
|
|
|
|
__local float4 tmp[8]; |
|
|
|
|
|
__local float *tmpx1 = (__local float *)tmp; |
|
|
tmpx1[tid] = sum; |
|
|
tmpx1[tid] = sum; |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
if (tid == 0) { |
|
|
if (tid == 0) { |
|
|
sum = dot((FLT4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[7]); |
|
|
|
|
|
|
|
|
sum = dot((float4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[7]); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
} |
|
|
} |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
sum = tmpx1[0]; |
|
|
sum = tmpx1[0]; |
|
|
for (size_t i = tid; i < slices; i += 32) { |
|
|
for (size_t i = tid; i < slices; i += 32) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(i, 0)); |
|
|
|
|
|
|
|
|
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(i, 0))); |
|
|
result = exp(result) * sum; |
|
|
result = exp(result) * sum; |
|
|
WRITE_IMAGE(output, (int2)(i, 0), result); |
|
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(i, 0), TO_FLT4(result)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, |
|
|
__kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 *output, const float4 mask, |
|
|
const int slices, const int slices_x32) { |
|
|
const int slices, const int slices_x32) { |
|
|
int tid = get_local_id(0); |
|
|
int tid = get_local_id(0); |
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); |
|
|
|
|
|
sum += dot((float4)(1.0f), exp(src)); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); |
|
|
|
|
|
|
|
|
sum += dot(TO_FLT4(mask), exp(src)); |
|
|
|
|
|
|
|
|
sum += dot(convert_float4(mask), exp(src)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__local FLT4 tmp[8]; |
|
|
|
|
|
__local FLT *tmpx1 = (__local FLT *)tmp; |
|
|
|
|
|
|
|
|
__local float4 tmp[8]; |
|
|
|
|
|
__local float *tmpx1 = (__local float *)tmp; |
|
|
tmpx1[tid] = sum; |
|
|
tmpx1[tid] = sum; |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
if (tid == 0) { |
|
|
if (tid == 0) { |
|
|
sum = dot((FLT4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[7]); |
|
|
|
|
|
|
|
|
sum = dot((float4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[7]); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
} |
|
|
} |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
sum = tmpx1[0]; |
|
|
sum = tmpx1[0]; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); |
|
|
|
|
|
|
|
|
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); |
|
|
result = exp(result) * sum; |
|
|
result = exp(result) * sum; |
|
|
output[i] = result; |
|
|
|
|
|
|
|
|
output[i] = TO_FLT4(result); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); |
|
|
|
|
|
result = exp(result) * sum; |
|
|
|
|
|
|
|
|
float4 result_float = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); |
|
|
|
|
|
result_float = exp(result_float) * sum; |
|
|
|
|
|
FLT4 result = TO_FLT4(result_float); |
|
|
__global FLT4 *remain_ptr4 = output; |
|
|
__global FLT4 *remain_ptr4 = output; |
|
|
remain_ptr4 += slices - 1; |
|
|
remain_ptr4 += slices - 1; |
|
|
__global FLT *remain_ptr = (__global FLT *)remain_ptr4; |
|
|
__global FLT *remain_ptr = (__global FLT *)remain_ptr4; |
|
|
@@ -269,37 +272,36 @@ __kernel void SoftMax1x1_NC4HW4_BUF(__read_only image2d_t input, __global FLT4 * |
|
|
__kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, |
|
|
__kernel void SoftMax1x1_NC4HW4_IMG(__read_only image2d_t input, __write_only image2d_t output, const float4 mask, |
|
|
const int slices, const int slices_x32) { |
|
|
const int slices, const int slices_x32) { |
|
|
int tid = get_local_id(0); |
|
|
int tid = get_local_id(0); |
|
|
FLT sum = 0.0f; |
|
|
|
|
|
|
|
|
float sum = 0.0f; |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
for (size_t i = tid; i < slices - 1; i += 32) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, i)); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); |
|
|
|
|
|
sum += dot((float4)(1.0f), exp(src)); |
|
|
} |
|
|
} |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
if ((slices - 1) % 32 == tid) { |
|
|
FLT4 src = READ_IMAGE(input, smp_zero, (int2)(0, slices - 1)); |
|
|
|
|
|
|
|
|
|
|
|
sum += dot(TO_FLT4(mask), exp(src)); |
|
|
|
|
|
|
|
|
float4 src = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, slices - 1))); |
|
|
|
|
|
sum += dot(convert_float4(mask), exp(src)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__local FLT4 tmp[8]; |
|
|
|
|
|
__local FLT *tmpx1 = (__local FLT *)tmp; |
|
|
|
|
|
|
|
|
__local float4 tmp[8]; |
|
|
|
|
|
__local float *tmpx1 = (__local float *)tmp; |
|
|
tmpx1[tid] = sum; |
|
|
tmpx1[tid] = sum; |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
if (tid == 0) { |
|
|
if (tid == 0) { |
|
|
sum = dot((FLT4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((FLT4)(1.0f), tmp[7]); |
|
|
|
|
|
|
|
|
sum = dot((float4)(1.0f), tmp[0]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[1]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[2]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[3]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[4]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[5]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[6]); |
|
|
|
|
|
sum += dot((float4)(1.0f), tmp[7]); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
tmpx1[0] = divide_no_check(1.0f, sum); |
|
|
} |
|
|
} |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
sum = tmpx1[0]; |
|
|
sum = tmpx1[0]; |
|
|
for (size_t i = tid; i < slices; i += 32) { |
|
|
for (size_t i = tid; i < slices; i += 32) { |
|
|
FLT4 result = READ_IMAGE(input, smp_zero, (int2)(0, i)); |
|
|
|
|
|
|
|
|
float4 result = convert_float4(READ_IMAGE(input, smp_zero, (int2)(0, i))); |
|
|
result = exp(result) * sum; |
|
|
result = exp(result) * sum; |
|
|
WRITE_IMAGE(output, (int2)(0, i), result); |
|
|
|
|
|
|
|
|
WRITE_IMAGE(output, (int2)(0, i), TO_FLT4(result)); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |