diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index cff0a43f8..bf00b9b01 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -31,8 +31,11 @@ Convolution_vulkan::Convolution_vulkan() pipeline_convolution_pack4 = 0; pipeline_convolution_pack4_1x1s1d1 = 0; pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0; - winograd23_padding = 0; - winograd23_crop = 0; + winograd_padding = 0; + winograd_crop = 0; + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0; + pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0; + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0; pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = 0; pipeline_convolution_pack4_3x3s1d1_winograd23_gemm = 0; pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output = 0; @@ -137,8 +140,8 @@ int Convolution_vulkan::create_pipeline(const Option& opt) if (num_input >= 16 && num_output >= 16) { { - winograd23_padding = ncnn::create_layer(ncnn::LayerType::Padding); - winograd23_padding->vkdev = vkdev; + winograd_padding = ncnn::create_layer(ncnn::LayerType::Padding); + winograd_padding->vkdev = vkdev; ncnn::ParamDict pd; pd.set(0, -233); @@ -148,14 +151,14 @@ int Convolution_vulkan::create_pipeline(const Option& opt) pd.set(4, 0); pd.set(5, 0.f); - winograd23_padding->load_param(pd); + winograd_padding->load_param(pd); - winograd23_padding->create_pipeline(opt); + winograd_padding->create_pipeline(opt); } { - winograd23_crop = ncnn::create_layer(ncnn::LayerType::Crop); - winograd23_crop->vkdev = vkdev; + winograd_crop = ncnn::create_layer(ncnn::LayerType::Crop); + winograd_crop->vkdev = vkdev; ncnn::ParamDict pd; pd.set(0, -233); @@ -165,11 +168,28 @@ int Convolution_vulkan::create_pipeline(const Option& opt) pd.set(4, 0); pd.set(5, 0); - winograd23_crop->load_param(pd); + winograd_crop->load_param(pd); - winograd23_crop->create_pipeline(opt); + winograd_crop->create_pipeline(opt); } + } + + if (num_input >= 64 && num_output >= 64) + { + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = new Pipeline(vkdev); + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->set_local_size_xyz(8, 8, 1); + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->create("convolution_pack4_3x3s1d1_winograd63_transform_input", opt, std::vector(), 2, 7); + + pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = new Pipeline(vkdev); + pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->set_local_size_xyz(4, 4, 4); + pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->create("convolution_pack4_3x3s1d1_winograd63_gemm", opt, std::vector(), 3, 6); + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = new Pipeline(vkdev); + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->set_local_size_xyz(8, 8, 1); + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->create("convolution_pack4_3x3s1d1_winograd63_transform_output", opt, specializations, 3, 7); + } + else if (num_input >= 16 && num_output >= 16) + { pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = new Pipeline(vkdev); pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->set_local_size_xyz(8, 8, 1); pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->create("convolution_pack4_3x3s1d1_winograd23_transform_input", opt, std::vector(), 2, 7); @@ -270,20 +290,27 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) delete pipeline_convolution_pack4_3x3s1d1_lds_8_8_2; pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0; - if (winograd23_padding) + if (winograd_padding) { - winograd23_padding->destroy_pipeline(opt); - delete winograd23_padding; - winograd23_padding = 0; + winograd_padding->destroy_pipeline(opt); + delete winograd_padding; + winograd_padding = 0; } - if (winograd23_crop) + if (winograd_crop) { - winograd23_crop->destroy_pipeline(opt); - delete winograd23_crop; - winograd23_crop = 0; + winograd_crop->destroy_pipeline(opt); + delete winograd_crop; + winograd_crop = 0; } + delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input; + delete pipeline_convolution_pack4_3x3s1d1_winograd63_gemm; + delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output; + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0; + pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0; + pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0; + delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; delete pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; @@ -398,7 +425,129 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4, opt); - if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16) + bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + if (is_conv3x3s1d1 && num_input >= 64 && num_output >= 64) + { + // winograd63 transform kernel + Mat weight_data_tm; + weight_data_tm.create(8*8, num_input, num_output); + + const float ktm[8][3] = { + { 1.0f, 0.0f, 0.0f}, + {-2.0f/9, -2.0f/9, -2.0f/9}, + {-2.0f/9, 2.0f/9, -2.0f/9}, + {1.0f/90, 1.0f/45, 2.0f/45}, + {1.0f/90, -1.0f/45, 2.0f/45}, + {1.0f/45, 1.0f/90, 1.0f/180}, + {1.0f/45, -1.0f/90, 1.0f/180}, + { 0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for + for (int p = 0; p= 16 && num_output >= 16) { // winograd23 transform kernel Mat weight_data_tm; @@ -766,7 +915,151 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; - if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) + if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 64 && num_output >= 64) + { + // winograd63 + int outw_bordered = (outw + 5) / 6 * 6; + int outh_bordered = (outh + 5) / 6 * 6; + + int w_bordered = outw_bordered + 2; + int h_bordered = outh_bordered + 2; + + int block_x = outw_bordered / 6; + int block_y = outh_bordered / 6; + + // pad to 6n+2 + { + ncnn::Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); + padding_param_blob.prepare_staging_buffer(); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = h_bordered - bottom_blob_bordered.h; + padding_params[2] = 0; + padding_params[3] = w_bordered - bottom_blob_bordered.w; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob_bordered; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + + // transform input + VkMat bottom_tm_blob; + { + bottom_tm_blob.create(64, block_x * block_y, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); + if (bottom_tm_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + bindings[1] = bottom_tm_blob; + + std::vector constants(7); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = bottom_blob_bordered.cstep; + constants[4].i = bottom_tm_blob.cstep; + constants[5].i = block_x; + constants[6].i = block_y; + + VkMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = bottom_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input, bindings, constants, dispatcher); + } + + // gemm + VkMat top_tm_blob; + { + top_tm_blob.create(64, block_x * block_y, num_output / out_packing, elemsize, out_packing, opt.workspace_vkallocator, opt.staging_vkallocator); + if (top_tm_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_tm_blob; + bindings[1] = top_tm_blob; + bindings[2] = weight_data_gpu_pack4_tm; + + std::vector constants(6); + constants[0].i = bottom_tm_blob.c; + constants[1].i = bottom_tm_blob.cstep; + constants[2].i = (top_tm_blob.h + 3) / 4; + constants[3].i = top_tm_blob.h; + constants[4].i = top_tm_blob.c; + constants[5].i = top_tm_blob.cstep; + + VkMat dispatcher; + dispatcher.w = top_tm_blob.w; + dispatcher.h = (top_tm_blob.h + 3) / 4; + dispatcher.c = top_tm_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_gemm, bindings, constants, dispatcher); + } + + // transform output + VkMat top_blob_bordered; + { + top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_packing, elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); + if (top_blob_bordered.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = top_tm_blob; + bindings[1] = top_blob_bordered; + bindings[2] = bias_term ? bias_data_gpu_pack4 : bindings[1]; + + std::vector constants(7); + constants[0].i = top_tm_blob.c; + constants[1].i = top_tm_blob.cstep; + constants[2].i = block_x; + constants[3].i = block_y; + constants[4].i = top_blob_bordered.w; + constants[5].i = top_blob_bordered.h; + constants[6].i = top_blob_bordered.cstep; + + VkMat dispatcher; + dispatcher.w = block_x; + dispatcher.h = block_y; + dispatcher.c = top_blob_bordered.c; + + cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output, bindings, constants, dispatcher); + } + + // crop top_blob + { + VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); + crop_param_blob.prepare_staging_buffer(); + int* crop_params = crop_param_blob.mapped(); + + crop_params[0] = 0; + crop_params[1] = 0; + crop_params[2] = 0; + crop_params[3] = outw; + crop_params[4] = outh; + crop_params[5] = num_output; + + std::vector crop_inputs(2); + crop_inputs[0] = top_blob_bordered; + crop_inputs[1] = crop_param_blob; + + std::vector crop_outputs(1); + winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); + top_blob = crop_outputs[0]; + } + + return 0; + } + else if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) { // winograd23 int outw_bordered = (outw + 1) / 2 * 2; @@ -797,7 +1090,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom padding_inputs[1] = padding_param_blob; std::vector padding_outputs(1); - winograd23_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); bottom_blob_bordered = padding_outputs[0]; } @@ -904,7 +1197,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom crop_inputs[1] = crop_param_blob; std::vector crop_outputs(1); - winograd23_crop->forward(crop_inputs, crop_outputs, cmd, opt); + winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); top_blob = crop_outputs[0]; } diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h index 6f4af3c2a..7c2098fe6 100644 --- a/src/layer/vulkan/convolution_vulkan.h +++ b/src/layer/vulkan/convolution_vulkan.h @@ -49,9 +49,12 @@ public: Pipeline* pipeline_convolution_pack4_3x3s1d1_lds_8_8_2; // pack4 winograd23 - ncnn::Layer* winograd23_padding; - ncnn::Layer* winograd23_crop; + ncnn::Layer* winograd_padding; + ncnn::Layer* winograd_crop; VkMat weight_data_gpu_pack4_tm; + Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input; + Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_gemm; + Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output; Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_gemm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_gemm.comp new file mode 100644 index 000000000..968b47145 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_gemm.comp @@ -0,0 +1,99 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_AMD_gpu_shader_half_float: require +#endif + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; +#else +layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outh_4; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= 64 || gy >= p.outh_4 || gz >= p.outc) + return; + + afpvec4 sum0 = afpvec4(0.f); + afpvec4 sum1 = afpvec4(0.f); + afpvec4 sum2 = afpvec4(0.f); + afpvec4 sum3 = afpvec4(0.f); + + int v_offset = gy * 4 * 64 + gx; + int w_offset = gz * p.c * 64 + gx; + + for (int z = 0; z < p.c; z++) + { + afpvec4 v0 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 0]); + afpvec4 v1 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 64]); + afpvec4 v2 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 128]); + afpvec4 v3 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 192]); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + sfp2afpvec4(weight_tm_data[w_offset * 4 + 0]), + sfp2afpvec4(weight_tm_data[w_offset * 4 + 1]), + sfp2afpvec4(weight_tm_data[w_offset * 4 + 2]), + sfp2afpvec4(weight_tm_data[w_offset * 4 + 3]) + ); +#else + afpmat4 k = sfpmat4(weight_tm_data[w_offset]); +#endif + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += p.cstep; + w_offset += 64; + } + + int gi = gz * p.outcstep + gy * 4 * 64 + gx; + + top_tm_blob_data[gi + 0] = afp2sfpvec4(sum0); + if (gy * 4 + 1 < p.outh) top_tm_blob_data[gi + 64] = afp2sfpvec4(sum1); + if (gy * 4 + 2 < p.outh) top_tm_blob_data[gi + 128] = afp2sfpvec4(sum2); + if (gy * 4 + 3 < p.outh) top_tm_blob_data[gi + 192] = afp2sfpvec4(sum3); +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_input.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_input.comp new file mode 100644 index 000000000..883503e10 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_input.comp @@ -0,0 +1,368 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_AMD_gpu_shader_half_float: require +#endif + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outcstep; + + int block_x; + int block_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= p.c) + return; + + // load 8x8 + int v_offset_0 = gz * p.cstep + gy * 6 * p.w + gx * 6; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * p.w; + ivec4 n_offset = v_offset_0 + ivec4(4, 5, 6, 7) * p.w; + + afpvec4 v00 = sfp2afpvec4(bottom_blob_data[v_offset.r + 0]); + afpvec4 v01 = sfp2afpvec4(bottom_blob_data[v_offset.r + 1]); + afpvec4 v02 = sfp2afpvec4(bottom_blob_data[v_offset.r + 2]); + afpvec4 v03 = sfp2afpvec4(bottom_blob_data[v_offset.r + 3]); + afpvec4 v04 = sfp2afpvec4(bottom_blob_data[v_offset.r + 4]); + afpvec4 v05 = sfp2afpvec4(bottom_blob_data[v_offset.r + 5]); + afpvec4 v06 = sfp2afpvec4(bottom_blob_data[v_offset.r + 6]); + afpvec4 v07 = sfp2afpvec4(bottom_blob_data[v_offset.r + 7]); + + afpvec4 v10 = sfp2afpvec4(bottom_blob_data[v_offset.g + 0]); + afpvec4 v11 = sfp2afpvec4(bottom_blob_data[v_offset.g + 1]); + afpvec4 v12 = sfp2afpvec4(bottom_blob_data[v_offset.g + 2]); + afpvec4 v13 = sfp2afpvec4(bottom_blob_data[v_offset.g + 3]); + afpvec4 v14 = sfp2afpvec4(bottom_blob_data[v_offset.g + 4]); + afpvec4 v15 = sfp2afpvec4(bottom_blob_data[v_offset.g + 5]); + afpvec4 v16 = sfp2afpvec4(bottom_blob_data[v_offset.g + 6]); + afpvec4 v17 = sfp2afpvec4(bottom_blob_data[v_offset.g + 7]); + + afpvec4 v20 = sfp2afpvec4(bottom_blob_data[v_offset.b + 0]); + afpvec4 v21 = sfp2afpvec4(bottom_blob_data[v_offset.b + 1]); + afpvec4 v22 = sfp2afpvec4(bottom_blob_data[v_offset.b + 2]); + afpvec4 v23 = sfp2afpvec4(bottom_blob_data[v_offset.b + 3]); + afpvec4 v24 = sfp2afpvec4(bottom_blob_data[v_offset.b + 4]); + afpvec4 v25 = sfp2afpvec4(bottom_blob_data[v_offset.b + 5]); + afpvec4 v26 = sfp2afpvec4(bottom_blob_data[v_offset.b + 6]); + afpvec4 v27 = sfp2afpvec4(bottom_blob_data[v_offset.b + 7]); + + afpvec4 v30 = sfp2afpvec4(bottom_blob_data[v_offset.a + 0]); + afpvec4 v31 = sfp2afpvec4(bottom_blob_data[v_offset.a + 1]); + afpvec4 v32 = sfp2afpvec4(bottom_blob_data[v_offset.a + 2]); + afpvec4 v33 = sfp2afpvec4(bottom_blob_data[v_offset.a + 3]); + afpvec4 v34 = sfp2afpvec4(bottom_blob_data[v_offset.a + 4]); + afpvec4 v35 = sfp2afpvec4(bottom_blob_data[v_offset.a + 5]); + afpvec4 v36 = sfp2afpvec4(bottom_blob_data[v_offset.a + 6]); + afpvec4 v37 = sfp2afpvec4(bottom_blob_data[v_offset.a + 7]); + + afpvec4 v40 = sfp2afpvec4(bottom_blob_data[n_offset.r + 0]); + afpvec4 v41 = sfp2afpvec4(bottom_blob_data[n_offset.r + 1]); + afpvec4 v42 = sfp2afpvec4(bottom_blob_data[n_offset.r + 2]); + afpvec4 v43 = sfp2afpvec4(bottom_blob_data[n_offset.r + 3]); + afpvec4 v44 = sfp2afpvec4(bottom_blob_data[n_offset.r + 4]); + afpvec4 v45 = sfp2afpvec4(bottom_blob_data[n_offset.r + 5]); + afpvec4 v46 = sfp2afpvec4(bottom_blob_data[n_offset.r + 6]); + afpvec4 v47 = sfp2afpvec4(bottom_blob_data[n_offset.r + 7]); + + afpvec4 v50 = sfp2afpvec4(bottom_blob_data[n_offset.g + 0]); + afpvec4 v51 = sfp2afpvec4(bottom_blob_data[n_offset.g + 1]); + afpvec4 v52 = sfp2afpvec4(bottom_blob_data[n_offset.g + 2]); + afpvec4 v53 = sfp2afpvec4(bottom_blob_data[n_offset.g + 3]); + afpvec4 v54 = sfp2afpvec4(bottom_blob_data[n_offset.g + 4]); + afpvec4 v55 = sfp2afpvec4(bottom_blob_data[n_offset.g + 5]); + afpvec4 v56 = sfp2afpvec4(bottom_blob_data[n_offset.g + 6]); + afpvec4 v57 = sfp2afpvec4(bottom_blob_data[n_offset.g + 7]); + + afpvec4 v60 = sfp2afpvec4(bottom_blob_data[n_offset.b + 0]); + afpvec4 v61 = sfp2afpvec4(bottom_blob_data[n_offset.b + 1]); + afpvec4 v62 = sfp2afpvec4(bottom_blob_data[n_offset.b + 2]); + afpvec4 v63 = sfp2afpvec4(bottom_blob_data[n_offset.b + 3]); + afpvec4 v64 = sfp2afpvec4(bottom_blob_data[n_offset.b + 4]); + afpvec4 v65 = sfp2afpvec4(bottom_blob_data[n_offset.b + 5]); + afpvec4 v66 = sfp2afpvec4(bottom_blob_data[n_offset.b + 6]); + afpvec4 v67 = sfp2afpvec4(bottom_blob_data[n_offset.b + 7]); + + afpvec4 v70 = sfp2afpvec4(bottom_blob_data[n_offset.a + 0]); + afpvec4 v71 = sfp2afpvec4(bottom_blob_data[n_offset.a + 1]); + afpvec4 v72 = sfp2afpvec4(bottom_blob_data[n_offset.a + 2]); + afpvec4 v73 = sfp2afpvec4(bottom_blob_data[n_offset.a + 3]); + afpvec4 v74 = sfp2afpvec4(bottom_blob_data[n_offset.a + 4]); + afpvec4 v75 = sfp2afpvec4(bottom_blob_data[n_offset.a + 5]); + afpvec4 v76 = sfp2afpvec4(bottom_blob_data[n_offset.a + 6]); + afpvec4 v77 = sfp2afpvec4(bottom_blob_data[n_offset.a + 7]); + + // const float itm[8][8] = { + // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, + // + // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, + // + // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, + // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, + // + // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, + // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, + // + // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} + // }; + + // 0 = r00 - r06 + (r04 - r02) * 5.25 + // 7 = r07 - r01 + (r03 - r05) * 5.25 + + // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) + // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) + + // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) + // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) + + // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) + // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) + + // implicit transpose + afpvec4 m00 = v00 - v06 + (v04 - v02) * afp(5.25f); + afpvec4 m01 = v10 - v16 + (v14 - v12) * afp(5.25f); + afpvec4 m02 = v20 - v26 + (v24 - v22) * afp(5.25f); + afpvec4 m03 = v30 - v36 + (v34 - v32) * afp(5.25f); + afpvec4 m04 = v40 - v46 + (v44 - v42) * afp(5.25f); + afpvec4 m05 = v50 - v56 + (v54 - v52) * afp(5.25f); + afpvec4 m06 = v60 - v66 + (v64 - v62) * afp(5.25f); + afpvec4 m07 = v70 - v76 + (v74 - v72) * afp(5.25f); + afpvec4 m70 = v07 - v01 + (v03 - v05) * afp(5.25f); + afpvec4 m71 = v17 - v11 + (v13 - v15) * afp(5.25f); + afpvec4 m72 = v27 - v21 + (v23 - v25) * afp(5.25f); + afpvec4 m73 = v37 - v31 + (v33 - v35) * afp(5.25f); + afpvec4 m74 = v47 - v41 + (v43 - v45) * afp(5.25f); + afpvec4 m75 = v57 - v51 + (v53 - v55) * afp(5.25f); + afpvec4 m76 = v67 - v61 + (v63 - v65) * afp(5.25f); + afpvec4 m77 = v77 - v71 + (v73 - v75) * afp(5.25f); + + afpvec4 m10 = (v02 + v06 - v04 * afp(4.25f)) + (v01 - v03 * afp(4.25f) + v05); + afpvec4 m11 = (v12 + v16 - v14 * afp(4.25f)) + (v11 - v13 * afp(4.25f) + v15); + afpvec4 m12 = (v22 + v26 - v24 * afp(4.25f)) + (v21 - v23 * afp(4.25f) + v25); + afpvec4 m13 = (v32 + v36 - v34 * afp(4.25f)) + (v31 - v33 * afp(4.25f) + v35); + afpvec4 m14 = (v42 + v46 - v44 * afp(4.25f)) + (v41 - v43 * afp(4.25f) + v45); + afpvec4 m15 = (v52 + v56 - v54 * afp(4.25f)) + (v51 - v53 * afp(4.25f) + v55); + afpvec4 m16 = (v62 + v66 - v64 * afp(4.25f)) + (v61 - v63 * afp(4.25f) + v65); + afpvec4 m17 = (v72 + v76 - v74 * afp(4.25f)) + (v71 - v73 * afp(4.25f) + v75); + afpvec4 m20 = (v02 + v06 - v04 * afp(4.25f)) - (v01 - v03 * afp(4.25f) + v05); + afpvec4 m21 = (v12 + v16 - v14 * afp(4.25f)) - (v11 - v13 * afp(4.25f) + v15); + afpvec4 m22 = (v22 + v26 - v24 * afp(4.25f)) - (v21 - v23 * afp(4.25f) + v25); + afpvec4 m23 = (v32 + v36 - v34 * afp(4.25f)) - (v31 - v33 * afp(4.25f) + v35); + afpvec4 m24 = (v42 + v46 - v44 * afp(4.25f)) - (v41 - v43 * afp(4.25f) + v45); + afpvec4 m25 = (v52 + v56 - v54 * afp(4.25f)) - (v51 - v53 * afp(4.25f) + v55); + afpvec4 m26 = (v62 + v66 - v64 * afp(4.25f)) - (v61 - v63 * afp(4.25f) + v65); + afpvec4 m27 = (v72 + v76 - v74 * afp(4.25f)) - (v71 - v73 * afp(4.25f) + v75); + + afpvec4 m30 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) + (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f)); + afpvec4 m31 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) + (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f)); + afpvec4 m32 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) + (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f)); + afpvec4 m33 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) + (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f)); + afpvec4 m34 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) + (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f)); + afpvec4 m35 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) + (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f)); + afpvec4 m36 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) + (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f)); + afpvec4 m37 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) + (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f)); + afpvec4 m40 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) - (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f)); + afpvec4 m41 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) - (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f)); + afpvec4 m42 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) - (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f)); + afpvec4 m43 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) - (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f)); + afpvec4 m44 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) - (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f)); + afpvec4 m45 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) - (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f)); + afpvec4 m46 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) - (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f)); + afpvec4 m47 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) - (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f)); + + afpvec4 m50 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) + (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f)); + afpvec4 m51 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) + (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f)); + afpvec4 m52 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) + (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f)); + afpvec4 m53 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) + (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f)); + afpvec4 m54 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) + (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f)); + afpvec4 m55 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) + (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f)); + afpvec4 m56 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) + (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f)); + afpvec4 m57 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) + (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f)); + afpvec4 m60 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) - (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f)); + afpvec4 m61 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) - (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f)); + afpvec4 m62 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) - (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f)); + afpvec4 m63 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) - (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f)); + afpvec4 m64 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) - (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f)); + afpvec4 m65 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) - (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f)); + afpvec4 m66 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) - (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f)); + afpvec4 m67 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) - (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f)); + + v00 = m00 - m06 + (m04 - m02) * afp(5.25f); + v10 = m10 - m16 + (m14 - m12) * afp(5.25f); + v20 = m20 - m26 + (m24 - m22) * afp(5.25f); + v30 = m30 - m36 + (m34 - m32) * afp(5.25f); + v40 = m40 - m46 + (m44 - m42) * afp(5.25f); + v50 = m50 - m56 + (m54 - m52) * afp(5.25f); + v60 = m60 - m66 + (m64 - m62) * afp(5.25f); + v70 = m70 - m76 + (m74 - m72) * afp(5.25f); + v07 = m07 - m01 + (m03 - m05) * afp(5.25f); + v17 = m17 - m11 + (m13 - m15) * afp(5.25f); + v27 = m27 - m21 + (m23 - m25) * afp(5.25f); + v37 = m37 - m31 + (m33 - m35) * afp(5.25f); + v47 = m47 - m41 + (m43 - m45) * afp(5.25f); + v57 = m57 - m51 + (m53 - m55) * afp(5.25f); + v67 = m67 - m61 + (m63 - m65) * afp(5.25f); + v77 = m77 - m71 + (m73 - m75) * afp(5.25f); + + v01 = (m02 + m06 - m04 * afp(4.25f)) + (m01 - m03 * afp(4.25f) + m05); + v11 = (m12 + m16 - m14 * afp(4.25f)) + (m11 - m13 * afp(4.25f) + m15); + v21 = (m22 + m26 - m24 * afp(4.25f)) + (m21 - m23 * afp(4.25f) + m25); + v31 = (m32 + m36 - m34 * afp(4.25f)) + (m31 - m33 * afp(4.25f) + m35); + v41 = (m42 + m46 - m44 * afp(4.25f)) + (m41 - m43 * afp(4.25f) + m45); + v51 = (m52 + m56 - m54 * afp(4.25f)) + (m51 - m53 * afp(4.25f) + m55); + v61 = (m62 + m66 - m64 * afp(4.25f)) + (m61 - m63 * afp(4.25f) + m65); + v71 = (m72 + m76 - m74 * afp(4.25f)) + (m71 - m73 * afp(4.25f) + m75); + v02 = (m02 + m06 - m04 * afp(4.25f)) - (m01 - m03 * afp(4.25f) + m05); + v12 = (m12 + m16 - m14 * afp(4.25f)) - (m11 - m13 * afp(4.25f) + m15); + v22 = (m22 + m26 - m24 * afp(4.25f)) - (m21 - m23 * afp(4.25f) + m25); + v32 = (m32 + m36 - m34 * afp(4.25f)) - (m31 - m33 * afp(4.25f) + m35); + v42 = (m42 + m46 - m44 * afp(4.25f)) - (m41 - m43 * afp(4.25f) + m45); + v52 = (m52 + m56 - m54 * afp(4.25f)) - (m51 - m53 * afp(4.25f) + m55); + v62 = (m62 + m66 - m64 * afp(4.25f)) - (m61 - m63 * afp(4.25f) + m65); + v72 = (m72 + m76 - m74 * afp(4.25f)) - (m71 - m73 * afp(4.25f) + m75); + + v03 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) + (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f)); + v13 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) + (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f)); + v23 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) + (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f)); + v33 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) + (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f)); + v43 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) + (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f)); + v53 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) + (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f)); + v63 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) + (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f)); + v73 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) + (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f)); + v04 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) - (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f)); + v14 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) - (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f)); + v24 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) - (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f)); + v34 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) - (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f)); + v44 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) - (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f)); + v54 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) - (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f)); + v64 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) - (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f)); + v74 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) - (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f)); + + v05 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) + (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f)); + v15 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) + (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f)); + v25 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) + (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f)); + v35 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) + (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f)); + v45 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) + (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f)); + v55 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) + (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f)); + v65 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) + (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f)); + v75 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) + (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f)); + v06 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) - (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f)); + v16 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) - (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f)); + v26 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) - (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f)); + v36 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) - (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f)); + v46 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) - (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f)); + v56 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) - (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f)); + v66 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) - (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f)); + v76 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) - (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f)); + + // store 64 + int v_tm_offset = gz * p.outcstep + (gy * p.block_x + gx) * 64; + + bottom_tm_blob_data[v_tm_offset + 0] = afp2sfpvec4(v00); + bottom_tm_blob_data[v_tm_offset + 1] = afp2sfpvec4(v01); + bottom_tm_blob_data[v_tm_offset + 2] = afp2sfpvec4(v02); + bottom_tm_blob_data[v_tm_offset + 3] = afp2sfpvec4(v03); + bottom_tm_blob_data[v_tm_offset + 4] = afp2sfpvec4(v04); + bottom_tm_blob_data[v_tm_offset + 5] = afp2sfpvec4(v05); + bottom_tm_blob_data[v_tm_offset + 6] = afp2sfpvec4(v06); + bottom_tm_blob_data[v_tm_offset + 7] = afp2sfpvec4(v07); + + bottom_tm_blob_data[v_tm_offset + 8] = afp2sfpvec4(v10); + bottom_tm_blob_data[v_tm_offset + 9] = afp2sfpvec4(v11); + bottom_tm_blob_data[v_tm_offset + 10] = afp2sfpvec4(v12); + bottom_tm_blob_data[v_tm_offset + 11] = afp2sfpvec4(v13); + bottom_tm_blob_data[v_tm_offset + 12] = afp2sfpvec4(v14); + bottom_tm_blob_data[v_tm_offset + 13] = afp2sfpvec4(v15); + bottom_tm_blob_data[v_tm_offset + 14] = afp2sfpvec4(v16); + bottom_tm_blob_data[v_tm_offset + 15] = afp2sfpvec4(v17); + + bottom_tm_blob_data[v_tm_offset + 16] = afp2sfpvec4(v20); + bottom_tm_blob_data[v_tm_offset + 17] = afp2sfpvec4(v21); + bottom_tm_blob_data[v_tm_offset + 18] = afp2sfpvec4(v22); + bottom_tm_blob_data[v_tm_offset + 19] = afp2sfpvec4(v23); + bottom_tm_blob_data[v_tm_offset + 20] = afp2sfpvec4(v24); + bottom_tm_blob_data[v_tm_offset + 21] = afp2sfpvec4(v25); + bottom_tm_blob_data[v_tm_offset + 22] = afp2sfpvec4(v26); + bottom_tm_blob_data[v_tm_offset + 23] = afp2sfpvec4(v27); + + bottom_tm_blob_data[v_tm_offset + 24] = afp2sfpvec4(v30); + bottom_tm_blob_data[v_tm_offset + 25] = afp2sfpvec4(v31); + bottom_tm_blob_data[v_tm_offset + 26] = afp2sfpvec4(v32); + bottom_tm_blob_data[v_tm_offset + 27] = afp2sfpvec4(v33); + bottom_tm_blob_data[v_tm_offset + 28] = afp2sfpvec4(v34); + bottom_tm_blob_data[v_tm_offset + 29] = afp2sfpvec4(v35); + bottom_tm_blob_data[v_tm_offset + 30] = afp2sfpvec4(v36); + bottom_tm_blob_data[v_tm_offset + 31] = afp2sfpvec4(v37); + + bottom_tm_blob_data[v_tm_offset + 32] = afp2sfpvec4(v40); + bottom_tm_blob_data[v_tm_offset + 33] = afp2sfpvec4(v41); + bottom_tm_blob_data[v_tm_offset + 34] = afp2sfpvec4(v42); + bottom_tm_blob_data[v_tm_offset + 35] = afp2sfpvec4(v43); + bottom_tm_blob_data[v_tm_offset + 36] = afp2sfpvec4(v44); + bottom_tm_blob_data[v_tm_offset + 37] = afp2sfpvec4(v45); + bottom_tm_blob_data[v_tm_offset + 38] = afp2sfpvec4(v46); + bottom_tm_blob_data[v_tm_offset + 39] = afp2sfpvec4(v47); + + bottom_tm_blob_data[v_tm_offset + 40] = afp2sfpvec4(v50); + bottom_tm_blob_data[v_tm_offset + 41] = afp2sfpvec4(v51); + bottom_tm_blob_data[v_tm_offset + 42] = afp2sfpvec4(v52); + bottom_tm_blob_data[v_tm_offset + 43] = afp2sfpvec4(v53); + bottom_tm_blob_data[v_tm_offset + 44] = afp2sfpvec4(v54); + bottom_tm_blob_data[v_tm_offset + 45] = afp2sfpvec4(v55); + bottom_tm_blob_data[v_tm_offset + 46] = afp2sfpvec4(v56); + bottom_tm_blob_data[v_tm_offset + 47] = afp2sfpvec4(v57); + + bottom_tm_blob_data[v_tm_offset + 48] = afp2sfpvec4(v60); + bottom_tm_blob_data[v_tm_offset + 49] = afp2sfpvec4(v61); + bottom_tm_blob_data[v_tm_offset + 50] = afp2sfpvec4(v62); + bottom_tm_blob_data[v_tm_offset + 51] = afp2sfpvec4(v63); + bottom_tm_blob_data[v_tm_offset + 52] = afp2sfpvec4(v64); + bottom_tm_blob_data[v_tm_offset + 53] = afp2sfpvec4(v65); + bottom_tm_blob_data[v_tm_offset + 54] = afp2sfpvec4(v66); + bottom_tm_blob_data[v_tm_offset + 55] = afp2sfpvec4(v67); + + bottom_tm_blob_data[v_tm_offset + 56] = afp2sfpvec4(v70); + bottom_tm_blob_data[v_tm_offset + 57] = afp2sfpvec4(v71); + bottom_tm_blob_data[v_tm_offset + 58] = afp2sfpvec4(v72); + bottom_tm_blob_data[v_tm_offset + 59] = afp2sfpvec4(v73); + bottom_tm_blob_data[v_tm_offset + 60] = afp2sfpvec4(v74); + bottom_tm_blob_data[v_tm_offset + 61] = afp2sfpvec4(v75); + bottom_tm_blob_data[v_tm_offset + 62] = afp2sfpvec4(v76); + bottom_tm_blob_data[v_tm_offset + 63] = afp2sfpvec4(v77); +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_output.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_output.comp new file mode 100644 index 000000000..e9ebd773c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd63_transform_output.comp @@ -0,0 +1,583 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_AMD_gpu_shader_half_float: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= p.c) + return; + + // load 64 + int v_tm_offset = gz * p.cstep + (gy * p.block_x + gx) * 64; + + afpvec4 v00 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 0]); + afpvec4 v01 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 1]); + afpvec4 v02 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 2]); + afpvec4 v03 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 3]); + afpvec4 v04 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 4]); + afpvec4 v05 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 5]); + afpvec4 v06 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 6]); + afpvec4 v07 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 7]); + afpvec4 v10 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 8]); + afpvec4 v11 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 9]); + afpvec4 v12 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 10]); + afpvec4 v13 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 11]); + afpvec4 v14 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 12]); + afpvec4 v15 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 13]); + afpvec4 v16 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 14]); + afpvec4 v17 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 15]); + afpvec4 v20 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 16]); + afpvec4 v21 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 17]); + afpvec4 v22 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 18]); + afpvec4 v23 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 19]); + afpvec4 v24 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 20]); + afpvec4 v25 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 21]); + afpvec4 v26 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 22]); + afpvec4 v27 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 23]); + afpvec4 v30 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 24]); + afpvec4 v31 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 25]); + afpvec4 v32 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 26]); + afpvec4 v33 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 27]); + afpvec4 v34 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 28]); + afpvec4 v35 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 29]); + afpvec4 v36 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 30]); + afpvec4 v37 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 31]); + afpvec4 v40 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 32]); + afpvec4 v41 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 33]); + afpvec4 v42 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 34]); + afpvec4 v43 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 35]); + afpvec4 v44 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 36]); + afpvec4 v45 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 37]); + afpvec4 v46 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 38]); + afpvec4 v47 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 39]); + afpvec4 v50 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 40]); + afpvec4 v51 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 41]); + afpvec4 v52 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 42]); + afpvec4 v53 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 43]); + afpvec4 v54 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 44]); + afpvec4 v55 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 45]); + afpvec4 v56 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 46]); + afpvec4 v57 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 47]); + afpvec4 v60 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 48]); + afpvec4 v61 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 49]); + afpvec4 v62 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 50]); + afpvec4 v63 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 51]); + afpvec4 v64 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 52]); + afpvec4 v65 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 53]); + afpvec4 v66 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 54]); + afpvec4 v67 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 55]); + afpvec4 v70 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 56]); + afpvec4 v71 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 57]); + afpvec4 v72 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 58]); + afpvec4 v73 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 59]); + afpvec4 v74 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 60]); + afpvec4 v75 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 61]); + afpvec4 v76 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 62]); + afpvec4 v77 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 63]); + + // const float otm[6][8] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32 + // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 + // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 + // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 + // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 + // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) + + afpvec4 v01_a_v02 = v01 + v02; + afpvec4 v11_a_v12 = v11 + v12; + afpvec4 v21_a_v22 = v21 + v22; + afpvec4 v31_a_v32 = v31 + v32; + afpvec4 v41_a_v42 = v41 + v42; + afpvec4 v51_a_v52 = v51 + v52; + afpvec4 v61_a_v62 = v61 + v62; + afpvec4 v71_a_v72 = v71 + v72; + afpvec4 v01_m_v02 = v01 - v02; + afpvec4 v11_m_v12 = v11 - v12; + afpvec4 v21_m_v22 = v21 - v22; + afpvec4 v31_m_v32 = v31 - v32; + afpvec4 v41_m_v42 = v41 - v42; + afpvec4 v51_m_v52 = v51 - v52; + afpvec4 v61_m_v62 = v61 - v62; + afpvec4 v71_m_v72 = v71 - v72; + + afpvec4 v03_a_v04 = v03 + v04; + afpvec4 v13_a_v14 = v13 + v14; + afpvec4 v23_a_v24 = v23 + v24; + afpvec4 v33_a_v34 = v33 + v34; + afpvec4 v43_a_v44 = v43 + v44; + afpvec4 v53_a_v54 = v53 + v54; + afpvec4 v63_a_v64 = v63 + v64; + afpvec4 v73_a_v74 = v73 + v74; + afpvec4 v03_m_v04 = v03 - v04; + afpvec4 v13_m_v14 = v13 - v14; + afpvec4 v23_m_v24 = v23 - v24; + afpvec4 v33_m_v34 = v33 - v34; + afpvec4 v43_m_v44 = v43 - v44; + afpvec4 v53_m_v54 = v53 - v54; + afpvec4 v63_m_v64 = v63 - v64; + afpvec4 v73_m_v74 = v73 - v74; + + afpvec4 v05_a_v06 = v05 + v06; + afpvec4 v15_a_v16 = v15 + v16; + afpvec4 v25_a_v26 = v25 + v26; + afpvec4 v35_a_v36 = v35 + v36; + afpvec4 v45_a_v46 = v45 + v46; + afpvec4 v55_a_v56 = v55 + v56; + afpvec4 v65_a_v66 = v65 + v66; + afpvec4 v75_a_v76 = v75 + v76; + afpvec4 v05_m_v06 = v05 - v06; + afpvec4 v15_m_v16 = v15 - v16; + afpvec4 v25_m_v26 = v25 - v26; + afpvec4 v35_m_v36 = v35 - v36; + afpvec4 v45_m_v46 = v45 - v46; + afpvec4 v55_m_v56 = v55 - v56; + afpvec4 v65_m_v66 = v65 - v66; + afpvec4 v75_m_v76 = v75 - v76; + + // implicit transpose + afpvec4 m00 = v00 + v01_a_v02 + v03_a_v04 + v05_a_v06 * afp(32.f); + afpvec4 m01 = v10 + v11_a_v12 + v13_a_v14 + v15_a_v16 * afp(32.f); + afpvec4 m02 = v20 + v21_a_v22 + v23_a_v24 + v25_a_v26 * afp(32.f); + afpvec4 m03 = v30 + v31_a_v32 + v33_a_v34 + v35_a_v36 * afp(32.f); + afpvec4 m04 = v40 + v41_a_v42 + v43_a_v44 + v45_a_v46 * afp(32.f); + afpvec4 m05 = v50 + v51_a_v52 + v53_a_v54 + v55_a_v56 * afp(32.f); + afpvec4 m06 = v60 + v61_a_v62 + v63_a_v64 + v65_a_v66 * afp(32.f); + afpvec4 m07 = v70 + v71_a_v72 + v73_a_v74 + v75_a_v76 * afp(32.f); + + afpvec4 m10 = v01_m_v02 + v03_m_v04 * afp(2.f) + v05_m_v06 * afp(16.f); + afpvec4 m11 = v11_m_v12 + v13_m_v14 * afp(2.f) + v15_m_v16 * afp(16.f); + afpvec4 m12 = v21_m_v22 + v23_m_v24 * afp(2.f) + v25_m_v26 * afp(16.f); + afpvec4 m13 = v31_m_v32 + v33_m_v34 * afp(2.f) + v35_m_v36 * afp(16.f); + afpvec4 m14 = v41_m_v42 + v43_m_v44 * afp(2.f) + v45_m_v46 * afp(16.f); + afpvec4 m15 = v51_m_v52 + v53_m_v54 * afp(2.f) + v55_m_v56 * afp(16.f); + afpvec4 m16 = v61_m_v62 + v63_m_v64 * afp(2.f) + v65_m_v66 * afp(16.f); + afpvec4 m17 = v71_m_v72 + v73_m_v74 * afp(2.f) + v75_m_v76 * afp(16.f); + + afpvec4 m20 = v01_a_v02 + v03_a_v04 * afp(4.f) + v05_a_v06 * afp(8.f); + afpvec4 m21 = v11_a_v12 + v13_a_v14 * afp(4.f) + v15_a_v16 * afp(8.f); + afpvec4 m22 = v21_a_v22 + v23_a_v24 * afp(4.f) + v25_a_v26 * afp(8.f); + afpvec4 m23 = v31_a_v32 + v33_a_v34 * afp(4.f) + v35_a_v36 * afp(8.f); + afpvec4 m24 = v41_a_v42 + v43_a_v44 * afp(4.f) + v45_a_v46 * afp(8.f); + afpvec4 m25 = v51_a_v52 + v53_a_v54 * afp(4.f) + v55_a_v56 * afp(8.f); + afpvec4 m26 = v61_a_v62 + v63_a_v64 * afp(4.f) + v65_a_v66 * afp(8.f); + afpvec4 m27 = v71_a_v72 + v73_a_v74 * afp(4.f) + v75_a_v76 * afp(8.f); + + afpvec4 m30 = v01_m_v02 + v03_m_v04 * afp(8.f) + v05_m_v06 * afp(4.f); + afpvec4 m31 = v11_m_v12 + v13_m_v14 * afp(8.f) + v15_m_v16 * afp(4.f); + afpvec4 m32 = v21_m_v22 + v23_m_v24 * afp(8.f) + v25_m_v26 * afp(4.f); + afpvec4 m33 = v31_m_v32 + v33_m_v34 * afp(8.f) + v35_m_v36 * afp(4.f); + afpvec4 m34 = v41_m_v42 + v43_m_v44 * afp(8.f) + v45_m_v46 * afp(4.f); + afpvec4 m35 = v51_m_v52 + v53_m_v54 * afp(8.f) + v55_m_v56 * afp(4.f); + afpvec4 m36 = v61_m_v62 + v63_m_v64 * afp(8.f) + v65_m_v66 * afp(4.f); + afpvec4 m37 = v71_m_v72 + v73_m_v74 * afp(8.f) + v75_m_v76 * afp(4.f); + + afpvec4 m40 = v01_a_v02 + v03_a_v04 * afp(16.f) + v05_a_v06 * afp(2.f); + afpvec4 m41 = v11_a_v12 + v13_a_v14 * afp(16.f) + v15_a_v16 * afp(2.f); + afpvec4 m42 = v21_a_v22 + v23_a_v24 * afp(16.f) + v25_a_v26 * afp(2.f); + afpvec4 m43 = v31_a_v32 + v33_a_v34 * afp(16.f) + v35_a_v36 * afp(2.f); + afpvec4 m44 = v41_a_v42 + v43_a_v44 * afp(16.f) + v45_a_v46 * afp(2.f); + afpvec4 m45 = v51_a_v52 + v53_a_v54 * afp(16.f) + v55_a_v56 * afp(2.f); + afpvec4 m46 = v61_a_v62 + v63_a_v64 * afp(16.f) + v65_a_v66 * afp(2.f); + afpvec4 m47 = v71_a_v72 + v73_a_v74 * afp(16.f) + v75_a_v76 * afp(2.f); + + afpvec4 m50 = v07 + v01_m_v02 + v03_m_v04 * afp(32.f) + v05_m_v06; + afpvec4 m51 = v17 + v11_m_v12 + v13_m_v14 * afp(32.f) + v15_m_v16; + afpvec4 m52 = v27 + v21_m_v22 + v23_m_v24 * afp(32.f) + v25_m_v26; + afpvec4 m53 = v37 + v31_m_v32 + v33_m_v34 * afp(32.f) + v35_m_v36; + afpvec4 m54 = v47 + v41_m_v42 + v43_m_v44 * afp(32.f) + v45_m_v46; + afpvec4 m55 = v57 + v51_m_v52 + v53_m_v54 * afp(32.f) + v55_m_v56; + afpvec4 m56 = v67 + v61_m_v62 + v63_m_v64 * afp(32.f) + v65_m_v66; + afpvec4 m57 = v77 + v71_m_v72 + v73_m_v74 * afp(32.f) + v75_m_v76; + + afpvec4 m01_a_m02 = m01 + m02; + afpvec4 m11_a_m12 = m11 + m12; + afpvec4 m21_a_m22 = m21 + m22; + afpvec4 m31_a_m32 = m31 + m32; + afpvec4 m41_a_m42 = m41 + m42; + afpvec4 m51_a_m52 = m51 + m52; + afpvec4 m01_m_m02 = m01 - m02; + afpvec4 m11_m_m12 = m11 - m12; + afpvec4 m21_m_m22 = m21 - m22; + afpvec4 m31_m_m32 = m31 - m32; + afpvec4 m41_m_m42 = m41 - m42; + afpvec4 m51_m_m52 = m51 - m52; + + afpvec4 m03_a_m04 = m03 + m04; + afpvec4 m13_a_m14 = m13 + m14; + afpvec4 m23_a_m24 = m23 + m24; + afpvec4 m33_a_m34 = m33 + m34; + afpvec4 m43_a_m44 = m43 + m44; + afpvec4 m53_a_m54 = m53 + m54; + afpvec4 m03_m_m04 = m03 - m04; + afpvec4 m13_m_m14 = m13 - m14; + afpvec4 m23_m_m24 = m23 - m24; + afpvec4 m33_m_m34 = m33 - m34; + afpvec4 m43_m_m44 = m43 - m44; + afpvec4 m53_m_m54 = m53 - m54; + + afpvec4 m05_a_m06 = m05 + m06; + afpvec4 m15_a_m16 = m15 + m16; + afpvec4 m25_a_m26 = m25 + m26; + afpvec4 m35_a_m36 = m35 + m36; + afpvec4 m45_a_m46 = m45 + m46; + afpvec4 m55_a_m56 = m55 + m56; + afpvec4 m05_m_m06 = m05 - m06; + afpvec4 m15_m_m16 = m15 - m16; + afpvec4 m25_m_m26 = m25 - m26; + afpvec4 m35_m_m36 = m35 - m36; + afpvec4 m45_m_m46 = m45 - m46; + afpvec4 m55_m_m56 = m55 - m56; + + if (bias_term == 1) + { + const afpvec4 bias_value = sfp2afpvec4(bias_data[gz]); + + v00 = bias_value + m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f); + v10 = bias_value + m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f); + v20 = bias_value + m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f); + v30 = bias_value + m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f); + v40 = bias_value + m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f); + v50 = bias_value + m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f); + + v01 = bias_value + m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f); + v11 = bias_value + m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f); + v21 = bias_value + m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f); + v31 = bias_value + m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f); + v41 = bias_value + m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f); + v51 = bias_value + m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f); + + v02 = bias_value + m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f); + v12 = bias_value + m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f); + v22 = bias_value + m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f); + v32 = bias_value + m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f); + v42 = bias_value + m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f); + v52 = bias_value + m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f); + + v03 = bias_value + m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f); + v13 = bias_value + m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f); + v23 = bias_value + m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f); + v33 = bias_value + m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f); + v43 = bias_value + m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f); + v53 = bias_value + m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f); + + v04 = bias_value + m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f); + v14 = bias_value + m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f); + v24 = bias_value + m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f); + v34 = bias_value + m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f); + v44 = bias_value + m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f); + v54 = bias_value + m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f); + + v05 = bias_value + m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06; + v15 = bias_value + m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16; + v25 = bias_value + m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26; + v35 = bias_value + m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36; + v45 = bias_value + m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46; + v55 = bias_value + m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56; + } + else + { + v00 = m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f); + v10 = m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f); + v20 = m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f); + v30 = m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f); + v40 = m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f); + v50 = m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f); + + v01 = m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f); + v11 = m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f); + v21 = m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f); + v31 = m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f); + v41 = m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f); + v51 = m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f); + + v02 = m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f); + v12 = m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f); + v22 = m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f); + v32 = m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f); + v42 = m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f); + v52 = m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f); + + v03 = m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f); + v13 = m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f); + v23 = m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f); + v33 = m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f); + v43 = m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f); + v53 = m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f); + + v04 = m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f); + v14 = m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f); + v24 = m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f); + v34 = m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f); + v44 = m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f); + v54 = m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f); + + v05 = m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06; + v15 = m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16; + v25 = m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26; + v35 = m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36; + v45 = m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46; + v55 = m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56; + } + + if (activation_type == 1) + { + v00 = max(v00, afp(0.f)); + v10 = max(v10, afp(0.f)); + v20 = max(v20, afp(0.f)); + v30 = max(v30, afp(0.f)); + v40 = max(v40, afp(0.f)); + v50 = max(v50, afp(0.f)); + v01 = max(v01, afp(0.f)); + v11 = max(v11, afp(0.f)); + v21 = max(v21, afp(0.f)); + v31 = max(v31, afp(0.f)); + v41 = max(v41, afp(0.f)); + v51 = max(v51, afp(0.f)); + v02 = max(v02, afp(0.f)); + v12 = max(v12, afp(0.f)); + v22 = max(v22, afp(0.f)); + v32 = max(v32, afp(0.f)); + v42 = max(v42, afp(0.f)); + v52 = max(v52, afp(0.f)); + v03 = max(v03, afp(0.f)); + v13 = max(v13, afp(0.f)); + v23 = max(v23, afp(0.f)); + v33 = max(v33, afp(0.f)); + v43 = max(v43, afp(0.f)); + v53 = max(v53, afp(0.f)); + v04 = max(v04, afp(0.f)); + v14 = max(v14, afp(0.f)); + v24 = max(v24, afp(0.f)); + v34 = max(v34, afp(0.f)); + v44 = max(v44, afp(0.f)); + v54 = max(v54, afp(0.f)); + v05 = max(v05, afp(0.f)); + v15 = max(v15, afp(0.f)); + v25 = max(v25, afp(0.f)); + v35 = max(v35, afp(0.f)); + v45 = max(v45, afp(0.f)); + v55 = max(v55, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f))); + v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f))); + v20 = mix(v20, v20 * afp(slope), lessThan(v20, afpvec4(0.f))); + v30 = mix(v30, v30 * afp(slope), lessThan(v30, afpvec4(0.f))); + v40 = mix(v40, v40 * afp(slope), lessThan(v40, afpvec4(0.f))); + v50 = mix(v50, v50 * afp(slope), lessThan(v50, afpvec4(0.f))); + v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f))); + v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f))); + v21 = mix(v21, v21 * afp(slope), lessThan(v21, afpvec4(0.f))); + v31 = mix(v31, v31 * afp(slope), lessThan(v31, afpvec4(0.f))); + v41 = mix(v41, v41 * afp(slope), lessThan(v41, afpvec4(0.f))); + v51 = mix(v51, v51 * afp(slope), lessThan(v51, afpvec4(0.f))); + v02 = mix(v02, v02 * afp(slope), lessThan(v02, afpvec4(0.f))); + v12 = mix(v12, v12 * afp(slope), lessThan(v12, afpvec4(0.f))); + v22 = mix(v22, v22 * afp(slope), lessThan(v22, afpvec4(0.f))); + v32 = mix(v32, v32 * afp(slope), lessThan(v32, afpvec4(0.f))); + v42 = mix(v42, v42 * afp(slope), lessThan(v42, afpvec4(0.f))); + v52 = mix(v52, v52 * afp(slope), lessThan(v52, afpvec4(0.f))); + v03 = mix(v03, v03 * afp(slope), lessThan(v03, afpvec4(0.f))); + v13 = mix(v13, v13 * afp(slope), lessThan(v13, afpvec4(0.f))); + v23 = mix(v23, v23 * afp(slope), lessThan(v23, afpvec4(0.f))); + v33 = mix(v33, v33 * afp(slope), lessThan(v33, afpvec4(0.f))); + v43 = mix(v43, v43 * afp(slope), lessThan(v43, afpvec4(0.f))); + v53 = mix(v53, v53 * afp(slope), lessThan(v53, afpvec4(0.f))); + v04 = mix(v04, v04 * afp(slope), lessThan(v04, afpvec4(0.f))); + v14 = mix(v14, v14 * afp(slope), lessThan(v14, afpvec4(0.f))); + v24 = mix(v24, v24 * afp(slope), lessThan(v24, afpvec4(0.f))); + v34 = mix(v34, v34 * afp(slope), lessThan(v34, afpvec4(0.f))); + v44 = mix(v44, v44 * afp(slope), lessThan(v44, afpvec4(0.f))); + v54 = mix(v54, v54 * afp(slope), lessThan(v54, afpvec4(0.f))); + v05 = mix(v05, v05 * afp(slope), lessThan(v05, afpvec4(0.f))); + v15 = mix(v15, v15 * afp(slope), lessThan(v15, afpvec4(0.f))); + v25 = mix(v25, v25 * afp(slope), lessThan(v25, afpvec4(0.f))); + v35 = mix(v35, v35 * afp(slope), lessThan(v35, afpvec4(0.f))); + v45 = mix(v45, v45 * afp(slope), lessThan(v45, afpvec4(0.f))); + v55 = mix(v55, v55 * afp(slope), lessThan(v55, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + v00 = clamp(v00, const_min, const_max); + v10 = clamp(v10, const_min, const_max); + v20 = clamp(v20, const_min, const_max); + v30 = clamp(v30, const_min, const_max); + v40 = clamp(v40, const_min, const_max); + v50 = clamp(v50, const_min, const_max); + v01 = clamp(v01, const_min, const_max); + v11 = clamp(v11, const_min, const_max); + v21 = clamp(v21, const_min, const_max); + v31 = clamp(v31, const_min, const_max); + v41 = clamp(v41, const_min, const_max); + v51 = clamp(v51, const_min, const_max); + v02 = clamp(v02, const_min, const_max); + v12 = clamp(v12, const_min, const_max); + v22 = clamp(v22, const_min, const_max); + v32 = clamp(v32, const_min, const_max); + v42 = clamp(v42, const_min, const_max); + v52 = clamp(v52, const_min, const_max); + v03 = clamp(v03, const_min, const_max); + v13 = clamp(v13, const_min, const_max); + v23 = clamp(v23, const_min, const_max); + v33 = clamp(v33, const_min, const_max); + v43 = clamp(v43, const_min, const_max); + v53 = clamp(v53, const_min, const_max); + v04 = clamp(v04, const_min, const_max); + v14 = clamp(v14, const_min, const_max); + v24 = clamp(v24, const_min, const_max); + v34 = clamp(v34, const_min, const_max); + v44 = clamp(v44, const_min, const_max); + v54 = clamp(v54, const_min, const_max); + v05 = clamp(v05, const_min, const_max); + v15 = clamp(v15, const_min, const_max); + v25 = clamp(v25, const_min, const_max); + v35 = clamp(v35, const_min, const_max); + v45 = clamp(v45, const_min, const_max); + v55 = clamp(v55, const_min, const_max); + } + if (activation_type == 4) + { + v00 = afp(1.f) / (afp(1.f) + exp(-v00)); + v10 = afp(1.f) / (afp(1.f) + exp(-v10)); + v20 = afp(1.f) / (afp(1.f) + exp(-v20)); + v30 = afp(1.f) / (afp(1.f) + exp(-v30)); + v40 = afp(1.f) / (afp(1.f) + exp(-v40)); + v50 = afp(1.f) / (afp(1.f) + exp(-v50)); + v01 = afp(1.f) / (afp(1.f) + exp(-v01)); + v11 = afp(1.f) / (afp(1.f) + exp(-v11)); + v21 = afp(1.f) / (afp(1.f) + exp(-v21)); + v31 = afp(1.f) / (afp(1.f) + exp(-v31)); + v41 = afp(1.f) / (afp(1.f) + exp(-v41)); + v51 = afp(1.f) / (afp(1.f) + exp(-v51)); + v02 = afp(1.f) / (afp(1.f) + exp(-v02)); + v12 = afp(1.f) / (afp(1.f) + exp(-v12)); + v22 = afp(1.f) / (afp(1.f) + exp(-v22)); + v32 = afp(1.f) / (afp(1.f) + exp(-v32)); + v42 = afp(1.f) / (afp(1.f) + exp(-v42)); + v52 = afp(1.f) / (afp(1.f) + exp(-v52)); + v03 = afp(1.f) / (afp(1.f) + exp(-v03)); + v13 = afp(1.f) / (afp(1.f) + exp(-v13)); + v23 = afp(1.f) / (afp(1.f) + exp(-v23)); + v33 = afp(1.f) / (afp(1.f) + exp(-v33)); + v43 = afp(1.f) / (afp(1.f) + exp(-v43)); + v53 = afp(1.f) / (afp(1.f) + exp(-v53)); + v04 = afp(1.f) / (afp(1.f) + exp(-v04)); + v14 = afp(1.f) / (afp(1.f) + exp(-v14)); + v24 = afp(1.f) / (afp(1.f) + exp(-v24)); + v34 = afp(1.f) / (afp(1.f) + exp(-v34)); + v44 = afp(1.f) / (afp(1.f) + exp(-v44)); + v54 = afp(1.f) / (afp(1.f) + exp(-v54)); + v05 = afp(1.f) / (afp(1.f) + exp(-v05)); + v15 = afp(1.f) / (afp(1.f) + exp(-v15)); + v25 = afp(1.f) / (afp(1.f) + exp(-v25)); + v35 = afp(1.f) / (afp(1.f) + exp(-v35)); + v45 = afp(1.f) / (afp(1.f) + exp(-v45)); + v55 = afp(1.f) / (afp(1.f) + exp(-v55)); + } + + // store 6x6 + int v_offset_0 = gz * p.outcstep + gy * 6 * p.outw + gx * 6; + int v_offset_1 = v_offset_0 + p.outw; + int v_offset_2 = v_offset_0 + p.outw * 2; + int v_offset_3 = v_offset_0 + p.outw * 3; + int v_offset_4 = v_offset_0 + p.outw * 4; + int v_offset_5 = v_offset_0 + p.outw * 5; + + top_blob_data[v_offset_0 + 0] = afp2sfpvec4(v00); + top_blob_data[v_offset_0 + 1] = afp2sfpvec4(v01); + top_blob_data[v_offset_0 + 2] = afp2sfpvec4(v02); + top_blob_data[v_offset_0 + 3] = afp2sfpvec4(v03); + top_blob_data[v_offset_0 + 4] = afp2sfpvec4(v04); + top_blob_data[v_offset_0 + 5] = afp2sfpvec4(v05); + top_blob_data[v_offset_1 + 0] = afp2sfpvec4(v10); + top_blob_data[v_offset_1 + 1] = afp2sfpvec4(v11); + top_blob_data[v_offset_1 + 2] = afp2sfpvec4(v12); + top_blob_data[v_offset_1 + 3] = afp2sfpvec4(v13); + top_blob_data[v_offset_1 + 4] = afp2sfpvec4(v14); + top_blob_data[v_offset_1 + 5] = afp2sfpvec4(v15); + top_blob_data[v_offset_2 + 0] = afp2sfpvec4(v20); + top_blob_data[v_offset_2 + 1] = afp2sfpvec4(v21); + top_blob_data[v_offset_2 + 2] = afp2sfpvec4(v22); + top_blob_data[v_offset_2 + 3] = afp2sfpvec4(v23); + top_blob_data[v_offset_2 + 4] = afp2sfpvec4(v24); + top_blob_data[v_offset_2 + 5] = afp2sfpvec4(v25); + top_blob_data[v_offset_3 + 0] = afp2sfpvec4(v30); + top_blob_data[v_offset_3 + 1] = afp2sfpvec4(v31); + top_blob_data[v_offset_3 + 2] = afp2sfpvec4(v32); + top_blob_data[v_offset_3 + 3] = afp2sfpvec4(v33); + top_blob_data[v_offset_3 + 4] = afp2sfpvec4(v34); + top_blob_data[v_offset_3 + 5] = afp2sfpvec4(v35); + top_blob_data[v_offset_4 + 0] = afp2sfpvec4(v40); + top_blob_data[v_offset_4 + 1] = afp2sfpvec4(v41); + top_blob_data[v_offset_4 + 2] = afp2sfpvec4(v42); + top_blob_data[v_offset_4 + 3] = afp2sfpvec4(v43); + top_blob_data[v_offset_4 + 4] = afp2sfpvec4(v44); + top_blob_data[v_offset_4 + 5] = afp2sfpvec4(v45); + top_blob_data[v_offset_5 + 0] = afp2sfpvec4(v50); + top_blob_data[v_offset_5 + 1] = afp2sfpvec4(v51); + top_blob_data[v_offset_5 + 2] = afp2sfpvec4(v52); + top_blob_data[v_offset_5 + 3] = afp2sfpvec4(v53); + top_blob_data[v_offset_5 + 4] = afp2sfpvec4(v54); + top_blob_data[v_offset_5 + 5] = afp2sfpvec4(v55); +}