Browse Source

vulkan convolution winograd f63

tags/20190908
nihui 6 years ago
parent
commit
c013bd9b7e
5 changed files with 1370 additions and 24 deletions
  1. +315
    -22
      src/layer/vulkan/convolution_vulkan.cpp
  2. +5
    -2
      src/layer/vulkan/convolution_vulkan.h
  3. +99
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_gemm.comp
  4. +368
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_input.comp
  5. +583
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_output.comp

+ 315
- 22
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -31,8 +31,11 @@ Convolution_vulkan::Convolution_vulkan()
pipeline_convolution_pack4 = 0;
pipeline_convolution_pack4_1x1s1d1 = 0;
pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0;
winograd23_padding = 0;
winograd23_crop = 0;
winograd_padding = 0;
winograd_crop = 0;
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0;
pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0;
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0;
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = 0;
pipeline_convolution_pack4_3x3s1d1_winograd23_gemm = 0;
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output = 0;
@@ -137,8 +140,8 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
if (num_input >= 16 && num_output >= 16)
{
{
winograd23_padding = ncnn::create_layer(ncnn::LayerType::Padding);
winograd23_padding->vkdev = vkdev;
winograd_padding = ncnn::create_layer(ncnn::LayerType::Padding);
winograd_padding->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, -233);
@@ -148,14 +151,14 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
pd.set(4, 0);
pd.set(5, 0.f);

winograd23_padding->load_param(pd);
winograd_padding->load_param(pd);

winograd23_padding->create_pipeline(opt);
winograd_padding->create_pipeline(opt);
}

{
winograd23_crop = ncnn::create_layer(ncnn::LayerType::Crop);
winograd23_crop->vkdev = vkdev;
winograd_crop = ncnn::create_layer(ncnn::LayerType::Crop);
winograd_crop->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, -233);
@@ -165,11 +168,28 @@ int Convolution_vulkan::create_pipeline(const Option& opt)
pd.set(4, 0);
pd.set(5, 0);

winograd23_crop->load_param(pd);
winograd_crop->load_param(pd);

winograd23_crop->create_pipeline(opt);
winograd_crop->create_pipeline(opt);
}
}

if (num_input >= 64 && num_output >= 64)
{
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = new Pipeline(vkdev);
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->set_local_size_xyz(8, 8, 1);
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->create("convolution_pack4_3x3s1d1_winograd63_transform_input", opt, std::vector<vk_specialization_type>(), 2, 7);

pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = new Pipeline(vkdev);
pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->set_local_size_xyz(4, 4, 4);
pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->create("convolution_pack4_3x3s1d1_winograd63_gemm", opt, std::vector<vk_specialization_type>(), 3, 6);

pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = new Pipeline(vkdev);
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->set_local_size_xyz(8, 8, 1);
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->create("convolution_pack4_3x3s1d1_winograd63_transform_output", opt, specializations, 3, 7);
}
else if (num_input >= 16 && num_output >= 16)
{
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = new Pipeline(vkdev);
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->set_local_size_xyz(8, 8, 1);
pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->create("convolution_pack4_3x3s1d1_winograd23_transform_input", opt, std::vector<vk_specialization_type>(), 2, 7);
@@ -270,20 +290,27 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt)
delete pipeline_convolution_pack4_3x3s1d1_lds_8_8_2;
pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0;

if (winograd23_padding)
if (winograd_padding)
{
winograd23_padding->destroy_pipeline(opt);
delete winograd23_padding;
winograd23_padding = 0;
winograd_padding->destroy_pipeline(opt);
delete winograd_padding;
winograd_padding = 0;
}

if (winograd23_crop)
if (winograd_crop)
{
winograd23_crop->destroy_pipeline(opt);
delete winograd23_crop;
winograd23_crop = 0;
winograd_crop->destroy_pipeline(opt);
delete winograd_crop;
winograd_crop = 0;
}

delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input;
delete pipeline_convolution_pack4_3x3s1d1_winograd63_gemm;
delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output;
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0;
pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0;
pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0;

delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input;
delete pipeline_convolution_pack4_3x3s1d1_winograd23_gemm;
delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output;
@@ -398,7 +425,129 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)

cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4, opt);

if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16)
bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
if (is_conv3x3s1d1 && num_input >= 64 && num_output >= 64)
{
// winograd63 transform kernel
Mat weight_data_tm;
weight_data_tm.create(8*8, num_input, num_output);

const float ktm[8][3] = {
{ 1.0f, 0.0f, 0.0f},
{-2.0f/9, -2.0f/9, -2.0f/9},
{-2.0f/9, 2.0f/9, -2.0f/9},
{1.0f/90, 1.0f/45, 2.0f/45},
{1.0f/90, -1.0f/45, 2.0f/45},
{1.0f/45, 1.0f/90, 1.0f/180},
{1.0f/45, -1.0f/90, 1.0f/180},
{ 0.0f, 0.0f, 1.0f}
};

#pragma omp parallel for
for (int p = 0; p<num_output; p++)
{
for (int q = 0; q<num_input; q++)
{
const float* kernel0 = (const float*)weight_data + p*num_input * 9 + q * 9;
float* kernel_tm0 = weight_data_tm.channel(p).row(q);

// transform kernel, transposed
const float* k0 = kernel0;
const float* k1 = kernel0 + 3;
const float* k2 = kernel0 + 6;

// h
float tmp[8][3];
for (int i=0; i<8; i++)
{
tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
}

// v
for (int j=0; j<8; j++)
{
float* tmpp = &tmp[j][0];

for (int i=0; i<8; i++)
{
kernel_tm0[j*8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
}
}
}
}

// src = 64-inch-outch
// dst = 4a-4b-64-inch/4a-outch/4b
Mat weight_data_pack4_tm;
{
weight_data_pack4_tm.create(64, num_input/4, num_output/4, (size_t)4*16, 16);

for (int q=0; q+3<num_output; q+=4)
{
const Mat k0 = weight_data_tm.channel(q);
const Mat k1 = weight_data_tm.channel(q+1);
const Mat k2 = weight_data_tm.channel(q+2);
const Mat k3 = weight_data_tm.channel(q+3);

Mat g0 = weight_data_pack4_tm.channel(q/4);

for (int p=0; p+3<num_input; p+=4)
{
const float* k00 = k0.row(p);
const float* k01 = k0.row(p+1);
const float* k02 = k0.row(p+2);
const float* k03 = k0.row(p+3);

const float* k10 = k1.row(p);
const float* k11 = k1.row(p+1);
const float* k12 = k1.row(p+2);
const float* k13 = k1.row(p+3);

const float* k20 = k2.row(p);
const float* k21 = k2.row(p+1);
const float* k22 = k2.row(p+2);
const float* k23 = k2.row(p+3);

const float* k30 = k3.row(p);
const float* k31 = k3.row(p+1);
const float* k32 = k3.row(p+2);
const float* k33 = k3.row(p+3);

float* g00 = g0.row(p/4);

for (int k=0; k<64; k++)
{
g00[0] = k00[k];
g00[1] = k01[k];
g00[2] = k02[k];
g00[3] = k03[k];

g00[4] = k10[k];
g00[5] = k11[k];
g00[6] = k12[k];
g00[7] = k13[k];

g00[8] = k20[k];
g00[9] = k21[k];
g00[10] = k22[k];
g00[11] = k23[k];

g00[12] = k30[k];
g00[13] = k31[k];
g00[14] = k32[k];
g00[15] = k33[k];

g00 += 16;
}
}
}
}

cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt);
}
else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
{
// winograd23 transform kernel
Mat weight_data_tm;
@@ -766,7 +915,151 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
}

bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16)
if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 64 && num_output >= 64)
{
// winograd63
int outw_bordered = (outw + 5) / 6 * 6;
int outh_bordered = (outh + 5) / 6 * 6;

int w_bordered = outw_bordered + 2;
int h_bordered = outh_bordered + 2;

int block_x = outw_bordered / 6;
int block_y = outh_bordered / 6;

// pad to 6n+2
{
ncnn::Option opt_pad = opt;
opt_pad.blob_vkallocator = opt.workspace_vkallocator;

VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
padding_param_blob.prepare_staging_buffer();
int* padding_params = padding_param_blob.mapped();

padding_params[0] = 0;
padding_params[1] = h_bordered - bottom_blob_bordered.h;
padding_params[2] = 0;
padding_params[3] = w_bordered - bottom_blob_bordered.w;

std::vector<VkMat> padding_inputs(2);
padding_inputs[0] = bottom_blob_bordered;
padding_inputs[1] = padding_param_blob;

std::vector<VkMat> padding_outputs(1);
winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}

// transform input
VkMat bottom_tm_blob;
{
bottom_tm_blob.create(64, block_x * block_y, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator);
if (bottom_tm_blob.empty())
return -100;

std::vector<VkMat> bindings(2);
bindings[0] = bottom_blob_bordered;
bindings[1] = bottom_tm_blob;

std::vector<vk_constant_type> constants(7);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = bottom_tm_blob.cstep;
constants[5].i = block_x;
constants[6].i = block_y;

VkMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input, bindings, constants, dispatcher);
}

// gemm
VkMat top_tm_blob;
{
top_tm_blob.create(64, block_x * block_y, num_output / out_packing, elemsize, out_packing, opt.workspace_vkallocator, opt.staging_vkallocator);
if (top_tm_blob.empty())
return -100;

std::vector<VkMat> bindings(3);
bindings[0] = bottom_tm_blob;
bindings[1] = top_tm_blob;
bindings[2] = weight_data_gpu_pack4_tm;

std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_tm_blob.c;
constants[1].i = bottom_tm_blob.cstep;
constants[2].i = (top_tm_blob.h + 3) / 4;
constants[3].i = top_tm_blob.h;
constants[4].i = top_tm_blob.c;
constants[5].i = top_tm_blob.cstep;

VkMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_gemm, bindings, constants, dispatcher);
}

// transform output
VkMat top_blob_bordered;
{
top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_packing, elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator);
if (top_blob_bordered.empty())
return -100;

std::vector<VkMat> bindings(3);
bindings[0] = top_tm_blob;
bindings[1] = top_blob_bordered;
bindings[2] = bias_term ? bias_data_gpu_pack4 : bindings[1];

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[1].i = top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
constants[4].i = top_blob_bordered.w;
constants[5].i = top_blob_bordered.h;
constants[6].i = top_blob_bordered.cstep;

VkMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = top_blob_bordered.c;

cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output, bindings, constants, dispatcher);
}

// crop top_blob
{
VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator);
crop_param_blob.prepare_staging_buffer();
int* crop_params = crop_param_blob.mapped();

crop_params[0] = 0;
crop_params[1] = 0;
crop_params[2] = 0;
crop_params[3] = outw;
crop_params[4] = outh;
crop_params[5] = num_output;

std::vector<VkMat> crop_inputs(2);
crop_inputs[0] = top_blob_bordered;
crop_inputs[1] = crop_param_blob;

std::vector<VkMat> crop_outputs(1);
winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
}

return 0;
}
else if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16)
{
// winograd23
int outw_bordered = (outw + 1) / 2 * 2;
@@ -797,7 +1090,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
padding_inputs[1] = padding_param_blob;

std::vector<VkMat> padding_outputs(1);
winograd23_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad);
bottom_blob_bordered = padding_outputs[0];
}

@@ -904,7 +1197,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
crop_inputs[1] = crop_param_blob;

std::vector<VkMat> crop_outputs(1);
winograd23_crop->forward(crop_inputs, crop_outputs, cmd, opt);
winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt);
top_blob = crop_outputs[0];
}



+ 5
- 2
src/layer/vulkan/convolution_vulkan.h View File

@@ -49,9 +49,12 @@ public:
Pipeline* pipeline_convolution_pack4_3x3s1d1_lds_8_8_2;

// pack4 winograd23
ncnn::Layer* winograd23_padding;
ncnn::Layer* winograd23_crop;
ncnn::Layer* winograd_padding;
ncnn::Layer* winograd_crop;
VkMat weight_data_gpu_pack4_tm;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_gemm;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm;
Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output;


+ 99
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_gemm.comp View File

@@ -0,0 +1,99 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require
#endif

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; };
#else
layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; };
#endif

layout (push_constant) uniform parameter
{
int c;
int cstep;

int outh_4;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= 64 || gy >= p.outh_4 || gz >= p.outc)
return;

afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
afpvec4 sum2 = afpvec4(0.f);
afpvec4 sum3 = afpvec4(0.f);

int v_offset = gy * 4 * 64 + gx;
int w_offset = gz * p.c * 64 + gx;

for (int z = 0; z < p.c; z++)
{
afpvec4 v0 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 0]);
afpvec4 v1 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 64]);
afpvec4 v2 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 128]);
afpvec4 v3 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 192]);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
sfp2afpvec4(weight_tm_data[w_offset * 4 + 0]),
sfp2afpvec4(weight_tm_data[w_offset * 4 + 1]),
sfp2afpvec4(weight_tm_data[w_offset * 4 + 2]),
sfp2afpvec4(weight_tm_data[w_offset * 4 + 3])
);
#else
afpmat4 k = sfpmat4(weight_tm_data[w_offset]);
#endif

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += p.cstep;
w_offset += 64;
}

int gi = gz * p.outcstep + gy * 4 * 64 + gx;

top_tm_blob_data[gi + 0] = afp2sfpvec4(sum0);
if (gy * 4 + 1 < p.outh) top_tm_blob_data[gi + 64] = afp2sfpvec4(sum1);
if (gy * 4 + 2 < p.outh) top_tm_blob_data[gi + 128] = afp2sfpvec4(sum2);
if (gy * 4 + 3 < p.outh) top_tm_blob_data[gi + 192] = afp2sfpvec4(sum3);
}

+ 368
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_input.comp View File

@@ -0,0 +1,368 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require
#endif

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };

layout (push_constant) uniform parameter
{
int w;
int h;
int c;
int cstep;

int outcstep;

int block_x;
int block_y;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= p.c)
return;

// load 8x8
int v_offset_0 = gz * p.cstep + gy * 6 * p.w + gx * 6;
ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * p.w;
ivec4 n_offset = v_offset_0 + ivec4(4, 5, 6, 7) * p.w;

afpvec4 v00 = sfp2afpvec4(bottom_blob_data[v_offset.r + 0]);
afpvec4 v01 = sfp2afpvec4(bottom_blob_data[v_offset.r + 1]);
afpvec4 v02 = sfp2afpvec4(bottom_blob_data[v_offset.r + 2]);
afpvec4 v03 = sfp2afpvec4(bottom_blob_data[v_offset.r + 3]);
afpvec4 v04 = sfp2afpvec4(bottom_blob_data[v_offset.r + 4]);
afpvec4 v05 = sfp2afpvec4(bottom_blob_data[v_offset.r + 5]);
afpvec4 v06 = sfp2afpvec4(bottom_blob_data[v_offset.r + 6]);
afpvec4 v07 = sfp2afpvec4(bottom_blob_data[v_offset.r + 7]);

afpvec4 v10 = sfp2afpvec4(bottom_blob_data[v_offset.g + 0]);
afpvec4 v11 = sfp2afpvec4(bottom_blob_data[v_offset.g + 1]);
afpvec4 v12 = sfp2afpvec4(bottom_blob_data[v_offset.g + 2]);
afpvec4 v13 = sfp2afpvec4(bottom_blob_data[v_offset.g + 3]);
afpvec4 v14 = sfp2afpvec4(bottom_blob_data[v_offset.g + 4]);
afpvec4 v15 = sfp2afpvec4(bottom_blob_data[v_offset.g + 5]);
afpvec4 v16 = sfp2afpvec4(bottom_blob_data[v_offset.g + 6]);
afpvec4 v17 = sfp2afpvec4(bottom_blob_data[v_offset.g + 7]);

afpvec4 v20 = sfp2afpvec4(bottom_blob_data[v_offset.b + 0]);
afpvec4 v21 = sfp2afpvec4(bottom_blob_data[v_offset.b + 1]);
afpvec4 v22 = sfp2afpvec4(bottom_blob_data[v_offset.b + 2]);
afpvec4 v23 = sfp2afpvec4(bottom_blob_data[v_offset.b + 3]);
afpvec4 v24 = sfp2afpvec4(bottom_blob_data[v_offset.b + 4]);
afpvec4 v25 = sfp2afpvec4(bottom_blob_data[v_offset.b + 5]);
afpvec4 v26 = sfp2afpvec4(bottom_blob_data[v_offset.b + 6]);
afpvec4 v27 = sfp2afpvec4(bottom_blob_data[v_offset.b + 7]);

afpvec4 v30 = sfp2afpvec4(bottom_blob_data[v_offset.a + 0]);
afpvec4 v31 = sfp2afpvec4(bottom_blob_data[v_offset.a + 1]);
afpvec4 v32 = sfp2afpvec4(bottom_blob_data[v_offset.a + 2]);
afpvec4 v33 = sfp2afpvec4(bottom_blob_data[v_offset.a + 3]);
afpvec4 v34 = sfp2afpvec4(bottom_blob_data[v_offset.a + 4]);
afpvec4 v35 = sfp2afpvec4(bottom_blob_data[v_offset.a + 5]);
afpvec4 v36 = sfp2afpvec4(bottom_blob_data[v_offset.a + 6]);
afpvec4 v37 = sfp2afpvec4(bottom_blob_data[v_offset.a + 7]);

afpvec4 v40 = sfp2afpvec4(bottom_blob_data[n_offset.r + 0]);
afpvec4 v41 = sfp2afpvec4(bottom_blob_data[n_offset.r + 1]);
afpvec4 v42 = sfp2afpvec4(bottom_blob_data[n_offset.r + 2]);
afpvec4 v43 = sfp2afpvec4(bottom_blob_data[n_offset.r + 3]);
afpvec4 v44 = sfp2afpvec4(bottom_blob_data[n_offset.r + 4]);
afpvec4 v45 = sfp2afpvec4(bottom_blob_data[n_offset.r + 5]);
afpvec4 v46 = sfp2afpvec4(bottom_blob_data[n_offset.r + 6]);
afpvec4 v47 = sfp2afpvec4(bottom_blob_data[n_offset.r + 7]);

afpvec4 v50 = sfp2afpvec4(bottom_blob_data[n_offset.g + 0]);
afpvec4 v51 = sfp2afpvec4(bottom_blob_data[n_offset.g + 1]);
afpvec4 v52 = sfp2afpvec4(bottom_blob_data[n_offset.g + 2]);
afpvec4 v53 = sfp2afpvec4(bottom_blob_data[n_offset.g + 3]);
afpvec4 v54 = sfp2afpvec4(bottom_blob_data[n_offset.g + 4]);
afpvec4 v55 = sfp2afpvec4(bottom_blob_data[n_offset.g + 5]);
afpvec4 v56 = sfp2afpvec4(bottom_blob_data[n_offset.g + 6]);
afpvec4 v57 = sfp2afpvec4(bottom_blob_data[n_offset.g + 7]);

afpvec4 v60 = sfp2afpvec4(bottom_blob_data[n_offset.b + 0]);
afpvec4 v61 = sfp2afpvec4(bottom_blob_data[n_offset.b + 1]);
afpvec4 v62 = sfp2afpvec4(bottom_blob_data[n_offset.b + 2]);
afpvec4 v63 = sfp2afpvec4(bottom_blob_data[n_offset.b + 3]);
afpvec4 v64 = sfp2afpvec4(bottom_blob_data[n_offset.b + 4]);
afpvec4 v65 = sfp2afpvec4(bottom_blob_data[n_offset.b + 5]);
afpvec4 v66 = sfp2afpvec4(bottom_blob_data[n_offset.b + 6]);
afpvec4 v67 = sfp2afpvec4(bottom_blob_data[n_offset.b + 7]);

afpvec4 v70 = sfp2afpvec4(bottom_blob_data[n_offset.a + 0]);
afpvec4 v71 = sfp2afpvec4(bottom_blob_data[n_offset.a + 1]);
afpvec4 v72 = sfp2afpvec4(bottom_blob_data[n_offset.a + 2]);
afpvec4 v73 = sfp2afpvec4(bottom_blob_data[n_offset.a + 3]);
afpvec4 v74 = sfp2afpvec4(bottom_blob_data[n_offset.a + 4]);
afpvec4 v75 = sfp2afpvec4(bottom_blob_data[n_offset.a + 5]);
afpvec4 v76 = sfp2afpvec4(bottom_blob_data[n_offset.a + 6]);
afpvec4 v77 = sfp2afpvec4(bottom_blob_data[n_offset.a + 7]);

// const float itm[8][8] = {
// {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f},
//
// {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f},
// {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f},
//
// {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f},
// {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f},
//
// {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f},
// {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f},
//
// {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f}
// };

// 0 = r00 - r06 + (r04 - r02) * 5.25
// 7 = r07 - r01 + (r03 - r05) * 5.25

// 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
// 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)

// 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
// 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)

// 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
// 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)

// implicit transpose
afpvec4 m00 = v00 - v06 + (v04 - v02) * afp(5.25f);
afpvec4 m01 = v10 - v16 + (v14 - v12) * afp(5.25f);
afpvec4 m02 = v20 - v26 + (v24 - v22) * afp(5.25f);
afpvec4 m03 = v30 - v36 + (v34 - v32) * afp(5.25f);
afpvec4 m04 = v40 - v46 + (v44 - v42) * afp(5.25f);
afpvec4 m05 = v50 - v56 + (v54 - v52) * afp(5.25f);
afpvec4 m06 = v60 - v66 + (v64 - v62) * afp(5.25f);
afpvec4 m07 = v70 - v76 + (v74 - v72) * afp(5.25f);
afpvec4 m70 = v07 - v01 + (v03 - v05) * afp(5.25f);
afpvec4 m71 = v17 - v11 + (v13 - v15) * afp(5.25f);
afpvec4 m72 = v27 - v21 + (v23 - v25) * afp(5.25f);
afpvec4 m73 = v37 - v31 + (v33 - v35) * afp(5.25f);
afpvec4 m74 = v47 - v41 + (v43 - v45) * afp(5.25f);
afpvec4 m75 = v57 - v51 + (v53 - v55) * afp(5.25f);
afpvec4 m76 = v67 - v61 + (v63 - v65) * afp(5.25f);
afpvec4 m77 = v77 - v71 + (v73 - v75) * afp(5.25f);

afpvec4 m10 = (v02 + v06 - v04 * afp(4.25f)) + (v01 - v03 * afp(4.25f) + v05);
afpvec4 m11 = (v12 + v16 - v14 * afp(4.25f)) + (v11 - v13 * afp(4.25f) + v15);
afpvec4 m12 = (v22 + v26 - v24 * afp(4.25f)) + (v21 - v23 * afp(4.25f) + v25);
afpvec4 m13 = (v32 + v36 - v34 * afp(4.25f)) + (v31 - v33 * afp(4.25f) + v35);
afpvec4 m14 = (v42 + v46 - v44 * afp(4.25f)) + (v41 - v43 * afp(4.25f) + v45);
afpvec4 m15 = (v52 + v56 - v54 * afp(4.25f)) + (v51 - v53 * afp(4.25f) + v55);
afpvec4 m16 = (v62 + v66 - v64 * afp(4.25f)) + (v61 - v63 * afp(4.25f) + v65);
afpvec4 m17 = (v72 + v76 - v74 * afp(4.25f)) + (v71 - v73 * afp(4.25f) + v75);
afpvec4 m20 = (v02 + v06 - v04 * afp(4.25f)) - (v01 - v03 * afp(4.25f) + v05);
afpvec4 m21 = (v12 + v16 - v14 * afp(4.25f)) - (v11 - v13 * afp(4.25f) + v15);
afpvec4 m22 = (v22 + v26 - v24 * afp(4.25f)) - (v21 - v23 * afp(4.25f) + v25);
afpvec4 m23 = (v32 + v36 - v34 * afp(4.25f)) - (v31 - v33 * afp(4.25f) + v35);
afpvec4 m24 = (v42 + v46 - v44 * afp(4.25f)) - (v41 - v43 * afp(4.25f) + v45);
afpvec4 m25 = (v52 + v56 - v54 * afp(4.25f)) - (v51 - v53 * afp(4.25f) + v55);
afpvec4 m26 = (v62 + v66 - v64 * afp(4.25f)) - (v61 - v63 * afp(4.25f) + v65);
afpvec4 m27 = (v72 + v76 - v74 * afp(4.25f)) - (v71 - v73 * afp(4.25f) + v75);

afpvec4 m30 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) + (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f));
afpvec4 m31 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) + (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f));
afpvec4 m32 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) + (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f));
afpvec4 m33 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) + (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f));
afpvec4 m34 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) + (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f));
afpvec4 m35 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) + (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f));
afpvec4 m36 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) + (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f));
afpvec4 m37 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) + (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f));
afpvec4 m40 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) - (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f));
afpvec4 m41 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) - (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f));
afpvec4 m42 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) - (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f));
afpvec4 m43 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) - (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f));
afpvec4 m44 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) - (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f));
afpvec4 m45 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) - (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f));
afpvec4 m46 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) - (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f));
afpvec4 m47 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) - (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f));

afpvec4 m50 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) + (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f));
afpvec4 m51 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) + (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f));
afpvec4 m52 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) + (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f));
afpvec4 m53 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) + (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f));
afpvec4 m54 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) + (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f));
afpvec4 m55 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) + (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f));
afpvec4 m56 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) + (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f));
afpvec4 m57 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) + (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f));
afpvec4 m60 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) - (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f));
afpvec4 m61 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) - (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f));
afpvec4 m62 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) - (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f));
afpvec4 m63 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) - (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f));
afpvec4 m64 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) - (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f));
afpvec4 m65 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) - (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f));
afpvec4 m66 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) - (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f));
afpvec4 m67 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) - (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f));

v00 = m00 - m06 + (m04 - m02) * afp(5.25f);
v10 = m10 - m16 + (m14 - m12) * afp(5.25f);
v20 = m20 - m26 + (m24 - m22) * afp(5.25f);
v30 = m30 - m36 + (m34 - m32) * afp(5.25f);
v40 = m40 - m46 + (m44 - m42) * afp(5.25f);
v50 = m50 - m56 + (m54 - m52) * afp(5.25f);
v60 = m60 - m66 + (m64 - m62) * afp(5.25f);
v70 = m70 - m76 + (m74 - m72) * afp(5.25f);
v07 = m07 - m01 + (m03 - m05) * afp(5.25f);
v17 = m17 - m11 + (m13 - m15) * afp(5.25f);
v27 = m27 - m21 + (m23 - m25) * afp(5.25f);
v37 = m37 - m31 + (m33 - m35) * afp(5.25f);
v47 = m47 - m41 + (m43 - m45) * afp(5.25f);
v57 = m57 - m51 + (m53 - m55) * afp(5.25f);
v67 = m67 - m61 + (m63 - m65) * afp(5.25f);
v77 = m77 - m71 + (m73 - m75) * afp(5.25f);

v01 = (m02 + m06 - m04 * afp(4.25f)) + (m01 - m03 * afp(4.25f) + m05);
v11 = (m12 + m16 - m14 * afp(4.25f)) + (m11 - m13 * afp(4.25f) + m15);
v21 = (m22 + m26 - m24 * afp(4.25f)) + (m21 - m23 * afp(4.25f) + m25);
v31 = (m32 + m36 - m34 * afp(4.25f)) + (m31 - m33 * afp(4.25f) + m35);
v41 = (m42 + m46 - m44 * afp(4.25f)) + (m41 - m43 * afp(4.25f) + m45);
v51 = (m52 + m56 - m54 * afp(4.25f)) + (m51 - m53 * afp(4.25f) + m55);
v61 = (m62 + m66 - m64 * afp(4.25f)) + (m61 - m63 * afp(4.25f) + m65);
v71 = (m72 + m76 - m74 * afp(4.25f)) + (m71 - m73 * afp(4.25f) + m75);
v02 = (m02 + m06 - m04 * afp(4.25f)) - (m01 - m03 * afp(4.25f) + m05);
v12 = (m12 + m16 - m14 * afp(4.25f)) - (m11 - m13 * afp(4.25f) + m15);
v22 = (m22 + m26 - m24 * afp(4.25f)) - (m21 - m23 * afp(4.25f) + m25);
v32 = (m32 + m36 - m34 * afp(4.25f)) - (m31 - m33 * afp(4.25f) + m35);
v42 = (m42 + m46 - m44 * afp(4.25f)) - (m41 - m43 * afp(4.25f) + m45);
v52 = (m52 + m56 - m54 * afp(4.25f)) - (m51 - m53 * afp(4.25f) + m55);
v62 = (m62 + m66 - m64 * afp(4.25f)) - (m61 - m63 * afp(4.25f) + m65);
v72 = (m72 + m76 - m74 * afp(4.25f)) - (m71 - m73 * afp(4.25f) + m75);

v03 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) + (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f));
v13 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) + (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f));
v23 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) + (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f));
v33 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) + (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f));
v43 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) + (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f));
v53 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) + (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f));
v63 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) + (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f));
v73 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) + (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f));
v04 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) - (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f));
v14 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) - (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f));
v24 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) - (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f));
v34 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) - (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f));
v44 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) - (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f));
v54 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) - (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f));
v64 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) - (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f));
v74 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) - (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f));

v05 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) + (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f));
v15 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) + (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f));
v25 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) + (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f));
v35 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) + (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f));
v45 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) + (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f));
v55 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) + (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f));
v65 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) + (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f));
v75 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) + (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f));
v06 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) - (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f));
v16 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) - (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f));
v26 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) - (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f));
v36 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) - (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f));
v46 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) - (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f));
v56 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) - (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f));
v66 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) - (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f));
v76 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) - (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f));

// store 64
int v_tm_offset = gz * p.outcstep + (gy * p.block_x + gx) * 64;

bottom_tm_blob_data[v_tm_offset + 0] = afp2sfpvec4(v00);
bottom_tm_blob_data[v_tm_offset + 1] = afp2sfpvec4(v01);
bottom_tm_blob_data[v_tm_offset + 2] = afp2sfpvec4(v02);
bottom_tm_blob_data[v_tm_offset + 3] = afp2sfpvec4(v03);
bottom_tm_blob_data[v_tm_offset + 4] = afp2sfpvec4(v04);
bottom_tm_blob_data[v_tm_offset + 5] = afp2sfpvec4(v05);
bottom_tm_blob_data[v_tm_offset + 6] = afp2sfpvec4(v06);
bottom_tm_blob_data[v_tm_offset + 7] = afp2sfpvec4(v07);

bottom_tm_blob_data[v_tm_offset + 8] = afp2sfpvec4(v10);
bottom_tm_blob_data[v_tm_offset + 9] = afp2sfpvec4(v11);
bottom_tm_blob_data[v_tm_offset + 10] = afp2sfpvec4(v12);
bottom_tm_blob_data[v_tm_offset + 11] = afp2sfpvec4(v13);
bottom_tm_blob_data[v_tm_offset + 12] = afp2sfpvec4(v14);
bottom_tm_blob_data[v_tm_offset + 13] = afp2sfpvec4(v15);
bottom_tm_blob_data[v_tm_offset + 14] = afp2sfpvec4(v16);
bottom_tm_blob_data[v_tm_offset + 15] = afp2sfpvec4(v17);

bottom_tm_blob_data[v_tm_offset + 16] = afp2sfpvec4(v20);
bottom_tm_blob_data[v_tm_offset + 17] = afp2sfpvec4(v21);
bottom_tm_blob_data[v_tm_offset + 18] = afp2sfpvec4(v22);
bottom_tm_blob_data[v_tm_offset + 19] = afp2sfpvec4(v23);
bottom_tm_blob_data[v_tm_offset + 20] = afp2sfpvec4(v24);
bottom_tm_blob_data[v_tm_offset + 21] = afp2sfpvec4(v25);
bottom_tm_blob_data[v_tm_offset + 22] = afp2sfpvec4(v26);
bottom_tm_blob_data[v_tm_offset + 23] = afp2sfpvec4(v27);

bottom_tm_blob_data[v_tm_offset + 24] = afp2sfpvec4(v30);
bottom_tm_blob_data[v_tm_offset + 25] = afp2sfpvec4(v31);
bottom_tm_blob_data[v_tm_offset + 26] = afp2sfpvec4(v32);
bottom_tm_blob_data[v_tm_offset + 27] = afp2sfpvec4(v33);
bottom_tm_blob_data[v_tm_offset + 28] = afp2sfpvec4(v34);
bottom_tm_blob_data[v_tm_offset + 29] = afp2sfpvec4(v35);
bottom_tm_blob_data[v_tm_offset + 30] = afp2sfpvec4(v36);
bottom_tm_blob_data[v_tm_offset + 31] = afp2sfpvec4(v37);

bottom_tm_blob_data[v_tm_offset + 32] = afp2sfpvec4(v40);
bottom_tm_blob_data[v_tm_offset + 33] = afp2sfpvec4(v41);
bottom_tm_blob_data[v_tm_offset + 34] = afp2sfpvec4(v42);
bottom_tm_blob_data[v_tm_offset + 35] = afp2sfpvec4(v43);
bottom_tm_blob_data[v_tm_offset + 36] = afp2sfpvec4(v44);
bottom_tm_blob_data[v_tm_offset + 37] = afp2sfpvec4(v45);
bottom_tm_blob_data[v_tm_offset + 38] = afp2sfpvec4(v46);
bottom_tm_blob_data[v_tm_offset + 39] = afp2sfpvec4(v47);

bottom_tm_blob_data[v_tm_offset + 40] = afp2sfpvec4(v50);
bottom_tm_blob_data[v_tm_offset + 41] = afp2sfpvec4(v51);
bottom_tm_blob_data[v_tm_offset + 42] = afp2sfpvec4(v52);
bottom_tm_blob_data[v_tm_offset + 43] = afp2sfpvec4(v53);
bottom_tm_blob_data[v_tm_offset + 44] = afp2sfpvec4(v54);
bottom_tm_blob_data[v_tm_offset + 45] = afp2sfpvec4(v55);
bottom_tm_blob_data[v_tm_offset + 46] = afp2sfpvec4(v56);
bottom_tm_blob_data[v_tm_offset + 47] = afp2sfpvec4(v57);

bottom_tm_blob_data[v_tm_offset + 48] = afp2sfpvec4(v60);
bottom_tm_blob_data[v_tm_offset + 49] = afp2sfpvec4(v61);
bottom_tm_blob_data[v_tm_offset + 50] = afp2sfpvec4(v62);
bottom_tm_blob_data[v_tm_offset + 51] = afp2sfpvec4(v63);
bottom_tm_blob_data[v_tm_offset + 52] = afp2sfpvec4(v64);
bottom_tm_blob_data[v_tm_offset + 53] = afp2sfpvec4(v65);
bottom_tm_blob_data[v_tm_offset + 54] = afp2sfpvec4(v66);
bottom_tm_blob_data[v_tm_offset + 55] = afp2sfpvec4(v67);

bottom_tm_blob_data[v_tm_offset + 56] = afp2sfpvec4(v70);
bottom_tm_blob_data[v_tm_offset + 57] = afp2sfpvec4(v71);
bottom_tm_blob_data[v_tm_offset + 58] = afp2sfpvec4(v72);
bottom_tm_blob_data[v_tm_offset + 59] = afp2sfpvec4(v73);
bottom_tm_blob_data[v_tm_offset + 60] = afp2sfpvec4(v74);
bottom_tm_blob_data[v_tm_offset + 61] = afp2sfpvec4(v75);
bottom_tm_blob_data[v_tm_offset + 62] = afp2sfpvec4(v76);
bottom_tm_blob_data[v_tm_offset + 63] = afp2sfpvec4(v77);
}

+ 583
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_output.comp View File

@@ -0,0 +1,583 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require
#endif

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int c;
int cstep;

int block_x;
int block_y;

int outw;
int outh;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= p.c)
return;

// load 64
int v_tm_offset = gz * p.cstep + (gy * p.block_x + gx) * 64;

afpvec4 v00 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 0]);
afpvec4 v01 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 1]);
afpvec4 v02 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 2]);
afpvec4 v03 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 3]);
afpvec4 v04 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 4]);
afpvec4 v05 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 5]);
afpvec4 v06 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 6]);
afpvec4 v07 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 7]);
afpvec4 v10 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 8]);
afpvec4 v11 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 9]);
afpvec4 v12 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 10]);
afpvec4 v13 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 11]);
afpvec4 v14 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 12]);
afpvec4 v15 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 13]);
afpvec4 v16 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 14]);
afpvec4 v17 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 15]);
afpvec4 v20 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 16]);
afpvec4 v21 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 17]);
afpvec4 v22 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 18]);
afpvec4 v23 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 19]);
afpvec4 v24 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 20]);
afpvec4 v25 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 21]);
afpvec4 v26 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 22]);
afpvec4 v27 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 23]);
afpvec4 v30 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 24]);
afpvec4 v31 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 25]);
afpvec4 v32 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 26]);
afpvec4 v33 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 27]);
afpvec4 v34 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 28]);
afpvec4 v35 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 29]);
afpvec4 v36 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 30]);
afpvec4 v37 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 31]);
afpvec4 v40 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 32]);
afpvec4 v41 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 33]);
afpvec4 v42 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 34]);
afpvec4 v43 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 35]);
afpvec4 v44 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 36]);
afpvec4 v45 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 37]);
afpvec4 v46 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 38]);
afpvec4 v47 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 39]);
afpvec4 v50 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 40]);
afpvec4 v51 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 41]);
afpvec4 v52 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 42]);
afpvec4 v53 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 43]);
afpvec4 v54 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 44]);
afpvec4 v55 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 45]);
afpvec4 v56 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 46]);
afpvec4 v57 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 47]);
afpvec4 v60 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 48]);
afpvec4 v61 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 49]);
afpvec4 v62 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 50]);
afpvec4 v63 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 51]);
afpvec4 v64 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 52]);
afpvec4 v65 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 53]);
afpvec4 v66 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 54]);
afpvec4 v67 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 55]);
afpvec4 v70 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 56]);
afpvec4 v71 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 57]);
afpvec4 v72 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 58]);
afpvec4 v73 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 59]);
afpvec4 v74 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 60]);
afpvec4 v75 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 61]);
afpvec4 v76 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 62]);
afpvec4 v77 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 63]);

// const float otm[6][8] = {
// {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
// {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
// {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
// };

// 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
// 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
// 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
// 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
// 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
// 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)

afpvec4 v01_a_v02 = v01 + v02;
afpvec4 v11_a_v12 = v11 + v12;
afpvec4 v21_a_v22 = v21 + v22;
afpvec4 v31_a_v32 = v31 + v32;
afpvec4 v41_a_v42 = v41 + v42;
afpvec4 v51_a_v52 = v51 + v52;
afpvec4 v61_a_v62 = v61 + v62;
afpvec4 v71_a_v72 = v71 + v72;
afpvec4 v01_m_v02 = v01 - v02;
afpvec4 v11_m_v12 = v11 - v12;
afpvec4 v21_m_v22 = v21 - v22;
afpvec4 v31_m_v32 = v31 - v32;
afpvec4 v41_m_v42 = v41 - v42;
afpvec4 v51_m_v52 = v51 - v52;
afpvec4 v61_m_v62 = v61 - v62;
afpvec4 v71_m_v72 = v71 - v72;

afpvec4 v03_a_v04 = v03 + v04;
afpvec4 v13_a_v14 = v13 + v14;
afpvec4 v23_a_v24 = v23 + v24;
afpvec4 v33_a_v34 = v33 + v34;
afpvec4 v43_a_v44 = v43 + v44;
afpvec4 v53_a_v54 = v53 + v54;
afpvec4 v63_a_v64 = v63 + v64;
afpvec4 v73_a_v74 = v73 + v74;
afpvec4 v03_m_v04 = v03 - v04;
afpvec4 v13_m_v14 = v13 - v14;
afpvec4 v23_m_v24 = v23 - v24;
afpvec4 v33_m_v34 = v33 - v34;
afpvec4 v43_m_v44 = v43 - v44;
afpvec4 v53_m_v54 = v53 - v54;
afpvec4 v63_m_v64 = v63 - v64;
afpvec4 v73_m_v74 = v73 - v74;

afpvec4 v05_a_v06 = v05 + v06;
afpvec4 v15_a_v16 = v15 + v16;
afpvec4 v25_a_v26 = v25 + v26;
afpvec4 v35_a_v36 = v35 + v36;
afpvec4 v45_a_v46 = v45 + v46;
afpvec4 v55_a_v56 = v55 + v56;
afpvec4 v65_a_v66 = v65 + v66;
afpvec4 v75_a_v76 = v75 + v76;
afpvec4 v05_m_v06 = v05 - v06;
afpvec4 v15_m_v16 = v15 - v16;
afpvec4 v25_m_v26 = v25 - v26;
afpvec4 v35_m_v36 = v35 - v36;
afpvec4 v45_m_v46 = v45 - v46;
afpvec4 v55_m_v56 = v55 - v56;
afpvec4 v65_m_v66 = v65 - v66;
afpvec4 v75_m_v76 = v75 - v76;

// implicit transpose
afpvec4 m00 = v00 + v01_a_v02 + v03_a_v04 + v05_a_v06 * afp(32.f);
afpvec4 m01 = v10 + v11_a_v12 + v13_a_v14 + v15_a_v16 * afp(32.f);
afpvec4 m02 = v20 + v21_a_v22 + v23_a_v24 + v25_a_v26 * afp(32.f);
afpvec4 m03 = v30 + v31_a_v32 + v33_a_v34 + v35_a_v36 * afp(32.f);
afpvec4 m04 = v40 + v41_a_v42 + v43_a_v44 + v45_a_v46 * afp(32.f);
afpvec4 m05 = v50 + v51_a_v52 + v53_a_v54 + v55_a_v56 * afp(32.f);
afpvec4 m06 = v60 + v61_a_v62 + v63_a_v64 + v65_a_v66 * afp(32.f);
afpvec4 m07 = v70 + v71_a_v72 + v73_a_v74 + v75_a_v76 * afp(32.f);

afpvec4 m10 = v01_m_v02 + v03_m_v04 * afp(2.f) + v05_m_v06 * afp(16.f);
afpvec4 m11 = v11_m_v12 + v13_m_v14 * afp(2.f) + v15_m_v16 * afp(16.f);
afpvec4 m12 = v21_m_v22 + v23_m_v24 * afp(2.f) + v25_m_v26 * afp(16.f);
afpvec4 m13 = v31_m_v32 + v33_m_v34 * afp(2.f) + v35_m_v36 * afp(16.f);
afpvec4 m14 = v41_m_v42 + v43_m_v44 * afp(2.f) + v45_m_v46 * afp(16.f);
afpvec4 m15 = v51_m_v52 + v53_m_v54 * afp(2.f) + v55_m_v56 * afp(16.f);
afpvec4 m16 = v61_m_v62 + v63_m_v64 * afp(2.f) + v65_m_v66 * afp(16.f);
afpvec4 m17 = v71_m_v72 + v73_m_v74 * afp(2.f) + v75_m_v76 * afp(16.f);

afpvec4 m20 = v01_a_v02 + v03_a_v04 * afp(4.f) + v05_a_v06 * afp(8.f);
afpvec4 m21 = v11_a_v12 + v13_a_v14 * afp(4.f) + v15_a_v16 * afp(8.f);
afpvec4 m22 = v21_a_v22 + v23_a_v24 * afp(4.f) + v25_a_v26 * afp(8.f);
afpvec4 m23 = v31_a_v32 + v33_a_v34 * afp(4.f) + v35_a_v36 * afp(8.f);
afpvec4 m24 = v41_a_v42 + v43_a_v44 * afp(4.f) + v45_a_v46 * afp(8.f);
afpvec4 m25 = v51_a_v52 + v53_a_v54 * afp(4.f) + v55_a_v56 * afp(8.f);
afpvec4 m26 = v61_a_v62 + v63_a_v64 * afp(4.f) + v65_a_v66 * afp(8.f);
afpvec4 m27 = v71_a_v72 + v73_a_v74 * afp(4.f) + v75_a_v76 * afp(8.f);

afpvec4 m30 = v01_m_v02 + v03_m_v04 * afp(8.f) + v05_m_v06 * afp(4.f);
afpvec4 m31 = v11_m_v12 + v13_m_v14 * afp(8.f) + v15_m_v16 * afp(4.f);
afpvec4 m32 = v21_m_v22 + v23_m_v24 * afp(8.f) + v25_m_v26 * afp(4.f);
afpvec4 m33 = v31_m_v32 + v33_m_v34 * afp(8.f) + v35_m_v36 * afp(4.f);
afpvec4 m34 = v41_m_v42 + v43_m_v44 * afp(8.f) + v45_m_v46 * afp(4.f);
afpvec4 m35 = v51_m_v52 + v53_m_v54 * afp(8.f) + v55_m_v56 * afp(4.f);
afpvec4 m36 = v61_m_v62 + v63_m_v64 * afp(8.f) + v65_m_v66 * afp(4.f);
afpvec4 m37 = v71_m_v72 + v73_m_v74 * afp(8.f) + v75_m_v76 * afp(4.f);

afpvec4 m40 = v01_a_v02 + v03_a_v04 * afp(16.f) + v05_a_v06 * afp(2.f);
afpvec4 m41 = v11_a_v12 + v13_a_v14 * afp(16.f) + v15_a_v16 * afp(2.f);
afpvec4 m42 = v21_a_v22 + v23_a_v24 * afp(16.f) + v25_a_v26 * afp(2.f);
afpvec4 m43 = v31_a_v32 + v33_a_v34 * afp(16.f) + v35_a_v36 * afp(2.f);
afpvec4 m44 = v41_a_v42 + v43_a_v44 * afp(16.f) + v45_a_v46 * afp(2.f);
afpvec4 m45 = v51_a_v52 + v53_a_v54 * afp(16.f) + v55_a_v56 * afp(2.f);
afpvec4 m46 = v61_a_v62 + v63_a_v64 * afp(16.f) + v65_a_v66 * afp(2.f);
afpvec4 m47 = v71_a_v72 + v73_a_v74 * afp(16.f) + v75_a_v76 * afp(2.f);

afpvec4 m50 = v07 + v01_m_v02 + v03_m_v04 * afp(32.f) + v05_m_v06;
afpvec4 m51 = v17 + v11_m_v12 + v13_m_v14 * afp(32.f) + v15_m_v16;
afpvec4 m52 = v27 + v21_m_v22 + v23_m_v24 * afp(32.f) + v25_m_v26;
afpvec4 m53 = v37 + v31_m_v32 + v33_m_v34 * afp(32.f) + v35_m_v36;
afpvec4 m54 = v47 + v41_m_v42 + v43_m_v44 * afp(32.f) + v45_m_v46;
afpvec4 m55 = v57 + v51_m_v52 + v53_m_v54 * afp(32.f) + v55_m_v56;
afpvec4 m56 = v67 + v61_m_v62 + v63_m_v64 * afp(32.f) + v65_m_v66;
afpvec4 m57 = v77 + v71_m_v72 + v73_m_v74 * afp(32.f) + v75_m_v76;

afpvec4 m01_a_m02 = m01 + m02;
afpvec4 m11_a_m12 = m11 + m12;
afpvec4 m21_a_m22 = m21 + m22;
afpvec4 m31_a_m32 = m31 + m32;
afpvec4 m41_a_m42 = m41 + m42;
afpvec4 m51_a_m52 = m51 + m52;
afpvec4 m01_m_m02 = m01 - m02;
afpvec4 m11_m_m12 = m11 - m12;
afpvec4 m21_m_m22 = m21 - m22;
afpvec4 m31_m_m32 = m31 - m32;
afpvec4 m41_m_m42 = m41 - m42;
afpvec4 m51_m_m52 = m51 - m52;

afpvec4 m03_a_m04 = m03 + m04;
afpvec4 m13_a_m14 = m13 + m14;
afpvec4 m23_a_m24 = m23 + m24;
afpvec4 m33_a_m34 = m33 + m34;
afpvec4 m43_a_m44 = m43 + m44;
afpvec4 m53_a_m54 = m53 + m54;
afpvec4 m03_m_m04 = m03 - m04;
afpvec4 m13_m_m14 = m13 - m14;
afpvec4 m23_m_m24 = m23 - m24;
afpvec4 m33_m_m34 = m33 - m34;
afpvec4 m43_m_m44 = m43 - m44;
afpvec4 m53_m_m54 = m53 - m54;

afpvec4 m05_a_m06 = m05 + m06;
afpvec4 m15_a_m16 = m15 + m16;
afpvec4 m25_a_m26 = m25 + m26;
afpvec4 m35_a_m36 = m35 + m36;
afpvec4 m45_a_m46 = m45 + m46;
afpvec4 m55_a_m56 = m55 + m56;
afpvec4 m05_m_m06 = m05 - m06;
afpvec4 m15_m_m16 = m15 - m16;
afpvec4 m25_m_m26 = m25 - m26;
afpvec4 m35_m_m36 = m35 - m36;
afpvec4 m45_m_m46 = m45 - m46;
afpvec4 m55_m_m56 = m55 - m56;

if (bias_term == 1)
{
const afpvec4 bias_value = sfp2afpvec4(bias_data[gz]);

v00 = bias_value + m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f);
v10 = bias_value + m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f);
v20 = bias_value + m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f);
v30 = bias_value + m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f);
v40 = bias_value + m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f);
v50 = bias_value + m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f);

v01 = bias_value + m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f);
v11 = bias_value + m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f);
v21 = bias_value + m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f);
v31 = bias_value + m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f);
v41 = bias_value + m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f);
v51 = bias_value + m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f);

v02 = bias_value + m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f);
v12 = bias_value + m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f);
v22 = bias_value + m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f);
v32 = bias_value + m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f);
v42 = bias_value + m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f);
v52 = bias_value + m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f);

v03 = bias_value + m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f);
v13 = bias_value + m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f);
v23 = bias_value + m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f);
v33 = bias_value + m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f);
v43 = bias_value + m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f);
v53 = bias_value + m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f);

v04 = bias_value + m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f);
v14 = bias_value + m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f);
v24 = bias_value + m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f);
v34 = bias_value + m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f);
v44 = bias_value + m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f);
v54 = bias_value + m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f);

v05 = bias_value + m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06;
v15 = bias_value + m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16;
v25 = bias_value + m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26;
v35 = bias_value + m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36;
v45 = bias_value + m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46;
v55 = bias_value + m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56;
}
else
{
v00 = m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f);
v10 = m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f);
v20 = m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f);
v30 = m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f);
v40 = m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f);
v50 = m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f);

v01 = m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f);
v11 = m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f);
v21 = m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f);
v31 = m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f);
v41 = m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f);
v51 = m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f);

v02 = m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f);
v12 = m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f);
v22 = m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f);
v32 = m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f);
v42 = m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f);
v52 = m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f);

v03 = m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f);
v13 = m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f);
v23 = m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f);
v33 = m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f);
v43 = m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f);
v53 = m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f);

v04 = m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f);
v14 = m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f);
v24 = m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f);
v34 = m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f);
v44 = m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f);
v54 = m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f);

v05 = m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06;
v15 = m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16;
v25 = m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26;
v35 = m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36;
v45 = m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46;
v55 = m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56;
}

if (activation_type == 1)
{
v00 = max(v00, afp(0.f));
v10 = max(v10, afp(0.f));
v20 = max(v20, afp(0.f));
v30 = max(v30, afp(0.f));
v40 = max(v40, afp(0.f));
v50 = max(v50, afp(0.f));
v01 = max(v01, afp(0.f));
v11 = max(v11, afp(0.f));
v21 = max(v21, afp(0.f));
v31 = max(v31, afp(0.f));
v41 = max(v41, afp(0.f));
v51 = max(v51, afp(0.f));
v02 = max(v02, afp(0.f));
v12 = max(v12, afp(0.f));
v22 = max(v22, afp(0.f));
v32 = max(v32, afp(0.f));
v42 = max(v42, afp(0.f));
v52 = max(v52, afp(0.f));
v03 = max(v03, afp(0.f));
v13 = max(v13, afp(0.f));
v23 = max(v23, afp(0.f));
v33 = max(v33, afp(0.f));
v43 = max(v43, afp(0.f));
v53 = max(v53, afp(0.f));
v04 = max(v04, afp(0.f));
v14 = max(v14, afp(0.f));
v24 = max(v24, afp(0.f));
v34 = max(v34, afp(0.f));
v44 = max(v44, afp(0.f));
v54 = max(v54, afp(0.f));
v05 = max(v05, afp(0.f));
v15 = max(v15, afp(0.f));
v25 = max(v25, afp(0.f));
v35 = max(v35, afp(0.f));
v45 = max(v45, afp(0.f));
v55 = max(v55, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f)));
v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f)));
v20 = mix(v20, v20 * afp(slope), lessThan(v20, afpvec4(0.f)));
v30 = mix(v30, v30 * afp(slope), lessThan(v30, afpvec4(0.f)));
v40 = mix(v40, v40 * afp(slope), lessThan(v40, afpvec4(0.f)));
v50 = mix(v50, v50 * afp(slope), lessThan(v50, afpvec4(0.f)));
v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f)));
v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f)));
v21 = mix(v21, v21 * afp(slope), lessThan(v21, afpvec4(0.f)));
v31 = mix(v31, v31 * afp(slope), lessThan(v31, afpvec4(0.f)));
v41 = mix(v41, v41 * afp(slope), lessThan(v41, afpvec4(0.f)));
v51 = mix(v51, v51 * afp(slope), lessThan(v51, afpvec4(0.f)));
v02 = mix(v02, v02 * afp(slope), lessThan(v02, afpvec4(0.f)));
v12 = mix(v12, v12 * afp(slope), lessThan(v12, afpvec4(0.f)));
v22 = mix(v22, v22 * afp(slope), lessThan(v22, afpvec4(0.f)));
v32 = mix(v32, v32 * afp(slope), lessThan(v32, afpvec4(0.f)));
v42 = mix(v42, v42 * afp(slope), lessThan(v42, afpvec4(0.f)));
v52 = mix(v52, v52 * afp(slope), lessThan(v52, afpvec4(0.f)));
v03 = mix(v03, v03 * afp(slope), lessThan(v03, afpvec4(0.f)));
v13 = mix(v13, v13 * afp(slope), lessThan(v13, afpvec4(0.f)));
v23 = mix(v23, v23 * afp(slope), lessThan(v23, afpvec4(0.f)));
v33 = mix(v33, v33 * afp(slope), lessThan(v33, afpvec4(0.f)));
v43 = mix(v43, v43 * afp(slope), lessThan(v43, afpvec4(0.f)));
v53 = mix(v53, v53 * afp(slope), lessThan(v53, afpvec4(0.f)));
v04 = mix(v04, v04 * afp(slope), lessThan(v04, afpvec4(0.f)));
v14 = mix(v14, v14 * afp(slope), lessThan(v14, afpvec4(0.f)));
v24 = mix(v24, v24 * afp(slope), lessThan(v24, afpvec4(0.f)));
v34 = mix(v34, v34 * afp(slope), lessThan(v34, afpvec4(0.f)));
v44 = mix(v44, v44 * afp(slope), lessThan(v44, afpvec4(0.f)));
v54 = mix(v54, v54 * afp(slope), lessThan(v54, afpvec4(0.f)));
v05 = mix(v05, v05 * afp(slope), lessThan(v05, afpvec4(0.f)));
v15 = mix(v15, v15 * afp(slope), lessThan(v15, afpvec4(0.f)));
v25 = mix(v25, v25 * afp(slope), lessThan(v25, afpvec4(0.f)));
v35 = mix(v35, v35 * afp(slope), lessThan(v35, afpvec4(0.f)));
v45 = mix(v45, v45 * afp(slope), lessThan(v45, afpvec4(0.f)));
v55 = mix(v55, v55 * afp(slope), lessThan(v55, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00 = clamp(v00, const_min, const_max);
v10 = clamp(v10, const_min, const_max);
v20 = clamp(v20, const_min, const_max);
v30 = clamp(v30, const_min, const_max);
v40 = clamp(v40, const_min, const_max);
v50 = clamp(v50, const_min, const_max);
v01 = clamp(v01, const_min, const_max);
v11 = clamp(v11, const_min, const_max);
v21 = clamp(v21, const_min, const_max);
v31 = clamp(v31, const_min, const_max);
v41 = clamp(v41, const_min, const_max);
v51 = clamp(v51, const_min, const_max);
v02 = clamp(v02, const_min, const_max);
v12 = clamp(v12, const_min, const_max);
v22 = clamp(v22, const_min, const_max);
v32 = clamp(v32, const_min, const_max);
v42 = clamp(v42, const_min, const_max);
v52 = clamp(v52, const_min, const_max);
v03 = clamp(v03, const_min, const_max);
v13 = clamp(v13, const_min, const_max);
v23 = clamp(v23, const_min, const_max);
v33 = clamp(v33, const_min, const_max);
v43 = clamp(v43, const_min, const_max);
v53 = clamp(v53, const_min, const_max);
v04 = clamp(v04, const_min, const_max);
v14 = clamp(v14, const_min, const_max);
v24 = clamp(v24, const_min, const_max);
v34 = clamp(v34, const_min, const_max);
v44 = clamp(v44, const_min, const_max);
v54 = clamp(v54, const_min, const_max);
v05 = clamp(v05, const_min, const_max);
v15 = clamp(v15, const_min, const_max);
v25 = clamp(v25, const_min, const_max);
v35 = clamp(v35, const_min, const_max);
v45 = clamp(v45, const_min, const_max);
v55 = clamp(v55, const_min, const_max);
}
if (activation_type == 4)
{
v00 = afp(1.f) / (afp(1.f) + exp(-v00));
v10 = afp(1.f) / (afp(1.f) + exp(-v10));
v20 = afp(1.f) / (afp(1.f) + exp(-v20));
v30 = afp(1.f) / (afp(1.f) + exp(-v30));
v40 = afp(1.f) / (afp(1.f) + exp(-v40));
v50 = afp(1.f) / (afp(1.f) + exp(-v50));
v01 = afp(1.f) / (afp(1.f) + exp(-v01));
v11 = afp(1.f) / (afp(1.f) + exp(-v11));
v21 = afp(1.f) / (afp(1.f) + exp(-v21));
v31 = afp(1.f) / (afp(1.f) + exp(-v31));
v41 = afp(1.f) / (afp(1.f) + exp(-v41));
v51 = afp(1.f) / (afp(1.f) + exp(-v51));
v02 = afp(1.f) / (afp(1.f) + exp(-v02));
v12 = afp(1.f) / (afp(1.f) + exp(-v12));
v22 = afp(1.f) / (afp(1.f) + exp(-v22));
v32 = afp(1.f) / (afp(1.f) + exp(-v32));
v42 = afp(1.f) / (afp(1.f) + exp(-v42));
v52 = afp(1.f) / (afp(1.f) + exp(-v52));
v03 = afp(1.f) / (afp(1.f) + exp(-v03));
v13 = afp(1.f) / (afp(1.f) + exp(-v13));
v23 = afp(1.f) / (afp(1.f) + exp(-v23));
v33 = afp(1.f) / (afp(1.f) + exp(-v33));
v43 = afp(1.f) / (afp(1.f) + exp(-v43));
v53 = afp(1.f) / (afp(1.f) + exp(-v53));
v04 = afp(1.f) / (afp(1.f) + exp(-v04));
v14 = afp(1.f) / (afp(1.f) + exp(-v14));
v24 = afp(1.f) / (afp(1.f) + exp(-v24));
v34 = afp(1.f) / (afp(1.f) + exp(-v34));
v44 = afp(1.f) / (afp(1.f) + exp(-v44));
v54 = afp(1.f) / (afp(1.f) + exp(-v54));
v05 = afp(1.f) / (afp(1.f) + exp(-v05));
v15 = afp(1.f) / (afp(1.f) + exp(-v15));
v25 = afp(1.f) / (afp(1.f) + exp(-v25));
v35 = afp(1.f) / (afp(1.f) + exp(-v35));
v45 = afp(1.f) / (afp(1.f) + exp(-v45));
v55 = afp(1.f) / (afp(1.f) + exp(-v55));
}

// store 6x6
int v_offset_0 = gz * p.outcstep + gy * 6 * p.outw + gx * 6;
int v_offset_1 = v_offset_0 + p.outw;
int v_offset_2 = v_offset_0 + p.outw * 2;
int v_offset_3 = v_offset_0 + p.outw * 3;
int v_offset_4 = v_offset_0 + p.outw * 4;
int v_offset_5 = v_offset_0 + p.outw * 5;

top_blob_data[v_offset_0 + 0] = afp2sfpvec4(v00);
top_blob_data[v_offset_0 + 1] = afp2sfpvec4(v01);
top_blob_data[v_offset_0 + 2] = afp2sfpvec4(v02);
top_blob_data[v_offset_0 + 3] = afp2sfpvec4(v03);
top_blob_data[v_offset_0 + 4] = afp2sfpvec4(v04);
top_blob_data[v_offset_0 + 5] = afp2sfpvec4(v05);
top_blob_data[v_offset_1 + 0] = afp2sfpvec4(v10);
top_blob_data[v_offset_1 + 1] = afp2sfpvec4(v11);
top_blob_data[v_offset_1 + 2] = afp2sfpvec4(v12);
top_blob_data[v_offset_1 + 3] = afp2sfpvec4(v13);
top_blob_data[v_offset_1 + 4] = afp2sfpvec4(v14);
top_blob_data[v_offset_1 + 5] = afp2sfpvec4(v15);
top_blob_data[v_offset_2 + 0] = afp2sfpvec4(v20);
top_blob_data[v_offset_2 + 1] = afp2sfpvec4(v21);
top_blob_data[v_offset_2 + 2] = afp2sfpvec4(v22);
top_blob_data[v_offset_2 + 3] = afp2sfpvec4(v23);
top_blob_data[v_offset_2 + 4] = afp2sfpvec4(v24);
top_blob_data[v_offset_2 + 5] = afp2sfpvec4(v25);
top_blob_data[v_offset_3 + 0] = afp2sfpvec4(v30);
top_blob_data[v_offset_3 + 1] = afp2sfpvec4(v31);
top_blob_data[v_offset_3 + 2] = afp2sfpvec4(v32);
top_blob_data[v_offset_3 + 3] = afp2sfpvec4(v33);
top_blob_data[v_offset_3 + 4] = afp2sfpvec4(v34);
top_blob_data[v_offset_3 + 5] = afp2sfpvec4(v35);
top_blob_data[v_offset_4 + 0] = afp2sfpvec4(v40);
top_blob_data[v_offset_4 + 1] = afp2sfpvec4(v41);
top_blob_data[v_offset_4 + 2] = afp2sfpvec4(v42);
top_blob_data[v_offset_4 + 3] = afp2sfpvec4(v43);
top_blob_data[v_offset_4 + 4] = afp2sfpvec4(v44);
top_blob_data[v_offset_4 + 5] = afp2sfpvec4(v45);
top_blob_data[v_offset_5 + 0] = afp2sfpvec4(v50);
top_blob_data[v_offset_5 + 1] = afp2sfpvec4(v51);
top_blob_data[v_offset_5 + 2] = afp2sfpvec4(v52);
top_blob_data[v_offset_5 + 3] = afp2sfpvec4(v53);
top_blob_data[v_offset_5 + 4] = afp2sfpvec4(v54);
top_blob_data[v_offset_5 + 5] = afp2sfpvec4(v55);
}

Loading…
Cancel
Save