| @@ -31,8 +31,11 @@ Convolution_vulkan::Convolution_vulkan() | |||
| pipeline_convolution_pack4 = 0; | |||
| pipeline_convolution_pack4_1x1s1d1 = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0; | |||
| winograd23_padding = 0; | |||
| winograd23_crop = 0; | |||
| winograd_padding = 0; | |||
| winograd_crop = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_gemm = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output = 0; | |||
| @@ -137,8 +140,8 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| if (num_input >= 16 && num_output >= 16) | |||
| { | |||
| { | |||
| winograd23_padding = ncnn::create_layer(ncnn::LayerType::Padding); | |||
| winograd23_padding->vkdev = vkdev; | |||
| winograd_padding = ncnn::create_layer(ncnn::LayerType::Padding); | |||
| winograd_padding->vkdev = vkdev; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, -233); | |||
| @@ -148,14 +151,14 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| pd.set(4, 0); | |||
| pd.set(5, 0.f); | |||
| winograd23_padding->load_param(pd); | |||
| winograd_padding->load_param(pd); | |||
| winograd23_padding->create_pipeline(opt); | |||
| winograd_padding->create_pipeline(opt); | |||
| } | |||
| { | |||
| winograd23_crop = ncnn::create_layer(ncnn::LayerType::Crop); | |||
| winograd23_crop->vkdev = vkdev; | |||
| winograd_crop = ncnn::create_layer(ncnn::LayerType::Crop); | |||
| winograd_crop->vkdev = vkdev; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, -233); | |||
| @@ -165,11 +168,28 @@ int Convolution_vulkan::create_pipeline(const Option& opt) | |||
| pd.set(4, 0); | |||
| pd.set(5, 0); | |||
| winograd23_crop->load_param(pd); | |||
| winograd_crop->load_param(pd); | |||
| winograd23_crop->create_pipeline(opt); | |||
| winograd_crop->create_pipeline(opt); | |||
| } | |||
| } | |||
| if (num_input >= 64 && num_output >= 64) | |||
| { | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = new Pipeline(vkdev); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->set_local_size_xyz(8, 8, 1); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input->create("convolution_pack4_3x3s1d1_winograd63_transform_input", opt, std::vector<vk_specialization_type>(), 2, 7); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = new Pipeline(vkdev); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->set_local_size_xyz(4, 4, 4); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_gemm->create("convolution_pack4_3x3s1d1_winograd63_gemm", opt, std::vector<vk_specialization_type>(), 3, 6); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = new Pipeline(vkdev); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->set_local_size_xyz(8, 8, 1); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output->create("convolution_pack4_3x3s1d1_winograd63_transform_output", opt, specializations, 3, 7); | |||
| } | |||
| else if (num_input >= 16 && num_output >= 16) | |||
| { | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input = new Pipeline(vkdev); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->set_local_size_xyz(8, 8, 1); | |||
| pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input->create("convolution_pack4_3x3s1d1_winograd23_transform_input", opt, std::vector<vk_specialization_type>(), 2, 7); | |||
| @@ -270,20 +290,27 @@ int Convolution_vulkan::destroy_pipeline(const Option& opt) | |||
| delete pipeline_convolution_pack4_3x3s1d1_lds_8_8_2; | |||
| pipeline_convolution_pack4_3x3s1d1_lds_8_8_2 = 0; | |||
| if (winograd23_padding) | |||
| if (winograd_padding) | |||
| { | |||
| winograd23_padding->destroy_pipeline(opt); | |||
| delete winograd23_padding; | |||
| winograd23_padding = 0; | |||
| winograd_padding->destroy_pipeline(opt); | |||
| delete winograd_padding; | |||
| winograd_padding = 0; | |||
| } | |||
| if (winograd23_crop) | |||
| if (winograd_crop) | |||
| { | |||
| winograd23_crop->destroy_pipeline(opt); | |||
| delete winograd23_crop; | |||
| winograd23_crop = 0; | |||
| winograd_crop->destroy_pipeline(opt); | |||
| delete winograd_crop; | |||
| winograd_crop = 0; | |||
| } | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input; | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd63_gemm; | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_gemm = 0; | |||
| pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output = 0; | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; | |||
| delete pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; | |||
| @@ -398,7 +425,129 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| cmd.record_upload(weight_data_pack4, weight_data_gpu_pack4, opt); | |||
| if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && num_input >= 16 && num_output >= 16) | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| if (is_conv3x3s1d1 && num_input >= 64 && num_output >= 64) | |||
| { | |||
| // winograd63 transform kernel | |||
| Mat weight_data_tm; | |||
| weight_data_tm.create(8*8, num_input, num_output); | |||
| const float ktm[8][3] = { | |||
| { 1.0f, 0.0f, 0.0f}, | |||
| {-2.0f/9, -2.0f/9, -2.0f/9}, | |||
| {-2.0f/9, 2.0f/9, -2.0f/9}, | |||
| {1.0f/90, 1.0f/45, 2.0f/45}, | |||
| {1.0f/90, -1.0f/45, 2.0f/45}, | |||
| {1.0f/45, 1.0f/90, 1.0f/180}, | |||
| {1.0f/45, -1.0f/90, 1.0f/180}, | |||
| { 0.0f, 0.0f, 1.0f} | |||
| }; | |||
| #pragma omp parallel for | |||
| for (int p = 0; p<num_output; p++) | |||
| { | |||
| for (int q = 0; q<num_input; q++) | |||
| { | |||
| const float* kernel0 = (const float*)weight_data + p*num_input * 9 + q * 9; | |||
| float* kernel_tm0 = weight_data_tm.channel(p).row(q); | |||
| // transform kernel, transposed | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 3; | |||
| const float* k2 = kernel0 + 6; | |||
| // h | |||
| float tmp[8][3]; | |||
| for (int i=0; i<8; i++) | |||
| { | |||
| tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; | |||
| tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; | |||
| tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; | |||
| } | |||
| // v | |||
| for (int j=0; j<8; j++) | |||
| { | |||
| float* tmpp = &tmp[j][0]; | |||
| for (int i=0; i<8; i++) | |||
| { | |||
| kernel_tm0[j*8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // src = 64-inch-outch | |||
| // dst = 4a-4b-64-inch/4a-outch/4b | |||
| Mat weight_data_pack4_tm; | |||
| { | |||
| weight_data_pack4_tm.create(64, num_input/4, num_output/4, (size_t)4*16, 16); | |||
| for (int q=0; q+3<num_output; q+=4) | |||
| { | |||
| const Mat k0 = weight_data_tm.channel(q); | |||
| const Mat k1 = weight_data_tm.channel(q+1); | |||
| const Mat k2 = weight_data_tm.channel(q+2); | |||
| const Mat k3 = weight_data_tm.channel(q+3); | |||
| Mat g0 = weight_data_pack4_tm.channel(q/4); | |||
| for (int p=0; p+3<num_input; p+=4) | |||
| { | |||
| const float* k00 = k0.row(p); | |||
| const float* k01 = k0.row(p+1); | |||
| const float* k02 = k0.row(p+2); | |||
| const float* k03 = k0.row(p+3); | |||
| const float* k10 = k1.row(p); | |||
| const float* k11 = k1.row(p+1); | |||
| const float* k12 = k1.row(p+2); | |||
| const float* k13 = k1.row(p+3); | |||
| const float* k20 = k2.row(p); | |||
| const float* k21 = k2.row(p+1); | |||
| const float* k22 = k2.row(p+2); | |||
| const float* k23 = k2.row(p+3); | |||
| const float* k30 = k3.row(p); | |||
| const float* k31 = k3.row(p+1); | |||
| const float* k32 = k3.row(p+2); | |||
| const float* k33 = k3.row(p+3); | |||
| float* g00 = g0.row(p/4); | |||
| for (int k=0; k<64; k++) | |||
| { | |||
| g00[0] = k00[k]; | |||
| g00[1] = k01[k]; | |||
| g00[2] = k02[k]; | |||
| g00[3] = k03[k]; | |||
| g00[4] = k10[k]; | |||
| g00[5] = k11[k]; | |||
| g00[6] = k12[k]; | |||
| g00[7] = k13[k]; | |||
| g00[8] = k20[k]; | |||
| g00[9] = k21[k]; | |||
| g00[10] = k22[k]; | |||
| g00[11] = k23[k]; | |||
| g00[12] = k30[k]; | |||
| g00[13] = k31[k]; | |||
| g00[14] = k32[k]; | |||
| g00[15] = k33[k]; | |||
| g00 += 16; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| cmd.record_upload(weight_data_pack4_tm, weight_data_gpu_pack4_tm, opt); | |||
| } | |||
| else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) | |||
| { | |||
| // winograd23 transform kernel | |||
| Mat weight_data_tm; | |||
| @@ -766,7 +915,151 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| } | |||
| bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; | |||
| if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) | |||
| if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 64 && num_output >= 64) | |||
| { | |||
| // winograd63 | |||
| int outw_bordered = (outw + 5) / 6 * 6; | |||
| int outh_bordered = (outh + 5) / 6 * 6; | |||
| int w_bordered = outw_bordered + 2; | |||
| int h_bordered = outh_bordered + 2; | |||
| int block_x = outw_bordered / 6; | |||
| int block_y = outh_bordered / 6; | |||
| // pad to 6n+2 | |||
| { | |||
| ncnn::Option opt_pad = opt; | |||
| opt_pad.blob_vkallocator = opt.workspace_vkallocator; | |||
| VkMat padding_param_blob(4, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| padding_param_blob.prepare_staging_buffer(); | |||
| int* padding_params = padding_param_blob.mapped(); | |||
| padding_params[0] = 0; | |||
| padding_params[1] = h_bordered - bottom_blob_bordered.h; | |||
| padding_params[2] = 0; | |||
| padding_params[3] = w_bordered - bottom_blob_bordered.w; | |||
| std::vector<VkMat> padding_inputs(2); | |||
| padding_inputs[0] = bottom_blob_bordered; | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkMat> padding_outputs(1); | |||
| winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| // transform input | |||
| VkMat bottom_tm_blob; | |||
| { | |||
| bottom_tm_blob.create(64, block_x * block_y, channels, elemsize, packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| if (bottom_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(2); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = bottom_tm_blob; | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = bottom_tm_blob.cstep; | |||
| constants[5].i = block_x; | |||
| constants[6].i = block_y; | |||
| VkMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = bottom_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input, bindings, constants, dispatcher); | |||
| } | |||
| // gemm | |||
| VkMat top_tm_blob; | |||
| { | |||
| top_tm_blob.create(64, block_x * block_y, num_output / out_packing, elemsize, out_packing, opt.workspace_vkallocator, opt.staging_vkallocator); | |||
| if (top_tm_blob.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(3); | |||
| bindings[0] = bottom_tm_blob; | |||
| bindings[1] = top_tm_blob; | |||
| bindings[2] = weight_data_gpu_pack4_tm; | |||
| std::vector<vk_constant_type> constants(6); | |||
| constants[0].i = bottom_tm_blob.c; | |||
| constants[1].i = bottom_tm_blob.cstep; | |||
| constants[2].i = (top_tm_blob.h + 3) / 4; | |||
| constants[3].i = top_tm_blob.h; | |||
| constants[4].i = top_tm_blob.c; | |||
| constants[5].i = top_tm_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = top_tm_blob.w; | |||
| dispatcher.h = (top_tm_blob.h + 3) / 4; | |||
| dispatcher.c = top_tm_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_gemm, bindings, constants, dispatcher); | |||
| } | |||
| // transform output | |||
| VkMat top_blob_bordered; | |||
| { | |||
| top_blob_bordered.create(outw_bordered, outh_bordered, num_output / out_packing, elemsize, out_packing, opt.blob_vkallocator, opt.staging_vkallocator); | |||
| if (top_blob_bordered.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(3); | |||
| bindings[0] = top_tm_blob; | |||
| bindings[1] = top_blob_bordered; | |||
| bindings[2] = bias_term ? bias_data_gpu_pack4 : bindings[1]; | |||
| std::vector<vk_constant_type> constants(7); | |||
| constants[0].i = top_tm_blob.c; | |||
| constants[1].i = top_tm_blob.cstep; | |||
| constants[2].i = block_x; | |||
| constants[3].i = block_y; | |||
| constants[4].i = top_blob_bordered.w; | |||
| constants[5].i = top_blob_bordered.h; | |||
| constants[6].i = top_blob_bordered.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = block_x; | |||
| dispatcher.h = block_y; | |||
| dispatcher.c = top_blob_bordered.c; | |||
| cmd.record_pipeline(pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output, bindings, constants, dispatcher); | |||
| } | |||
| // crop top_blob | |||
| { | |||
| VkMat crop_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator, opt.staging_vkallocator); | |||
| crop_param_blob.prepare_staging_buffer(); | |||
| int* crop_params = crop_param_blob.mapped(); | |||
| crop_params[0] = 0; | |||
| crop_params[1] = 0; | |||
| crop_params[2] = 0; | |||
| crop_params[3] = outw; | |||
| crop_params[4] = outh; | |||
| crop_params[5] = num_output; | |||
| std::vector<VkMat> crop_inputs(2); | |||
| crop_inputs[0] = top_blob_bordered; | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkMat> crop_outputs(1); | |||
| winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| } | |||
| return 0; | |||
| } | |||
| else if (packing == 4 && out_packing == 4 && is_conv3x3s1d1 && channels * packing >= 16 && num_output >= 16) | |||
| { | |||
| // winograd23 | |||
| int outw_bordered = (outw + 1) / 2 * 2; | |||
| @@ -797,7 +1090,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| padding_inputs[1] = padding_param_blob; | |||
| std::vector<VkMat> padding_outputs(1); | |||
| winograd23_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| winograd_padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); | |||
| bottom_blob_bordered = padding_outputs[0]; | |||
| } | |||
| @@ -904,7 +1197,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| crop_inputs[1] = crop_param_blob; | |||
| std::vector<VkMat> crop_outputs(1); | |||
| winograd23_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| winograd_crop->forward(crop_inputs, crop_outputs, cmd, opt); | |||
| top_blob = crop_outputs[0]; | |||
| } | |||
| @@ -49,9 +49,12 @@ public: | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_lds_8_8_2; | |||
| // pack4 winograd23 | |||
| ncnn::Layer* winograd23_padding; | |||
| ncnn::Layer* winograd23_crop; | |||
| ncnn::Layer* winograd_padding; | |||
| ncnn::Layer* winograd_crop; | |||
| VkMat weight_data_gpu_pack4_tm; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_input; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_gemm; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd63_transform_output; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_input; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_gemm; | |||
| Pipeline* pipeline_convolution_pack4_3x3s1d1_winograd23_transform_output; | |||
| @@ -0,0 +1,99 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_AMD_gpu_shader_half_float: require | |||
| #endif | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; | |||
| #else | |||
| layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; | |||
| #endif | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int c; | |||
| int cstep; | |||
| int outh_4; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= 64 || gy >= p.outh_4 || gz >= p.outc) | |||
| return; | |||
| afpvec4 sum0 = afpvec4(0.f); | |||
| afpvec4 sum1 = afpvec4(0.f); | |||
| afpvec4 sum2 = afpvec4(0.f); | |||
| afpvec4 sum3 = afpvec4(0.f); | |||
| int v_offset = gy * 4 * 64 + gx; | |||
| int w_offset = gz * p.c * 64 + gx; | |||
| for (int z = 0; z < p.c; z++) | |||
| { | |||
| afpvec4 v0 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 0]); | |||
| afpvec4 v1 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 64]); | |||
| afpvec4 v2 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 128]); | |||
| afpvec4 v3 = sfp2afpvec4(bottom_tm_blob_data[v_offset + 192]); | |||
| #if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) | |||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||
| afpmat4 k = afpmat4( | |||
| sfp2afpvec4(weight_tm_data[w_offset * 4 + 0]), | |||
| sfp2afpvec4(weight_tm_data[w_offset * 4 + 1]), | |||
| sfp2afpvec4(weight_tm_data[w_offset * 4 + 2]), | |||
| sfp2afpvec4(weight_tm_data[w_offset * 4 + 3]) | |||
| ); | |||
| #else | |||
| afpmat4 k = sfpmat4(weight_tm_data[w_offset]); | |||
| #endif | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| sum2 += v2 * k; | |||
| sum3 += v3 * k; | |||
| v_offset += p.cstep; | |||
| w_offset += 64; | |||
| } | |||
| int gi = gz * p.outcstep + gy * 4 * 64 + gx; | |||
| top_tm_blob_data[gi + 0] = afp2sfpvec4(sum0); | |||
| if (gy * 4 + 1 < p.outh) top_tm_blob_data[gi + 64] = afp2sfpvec4(sum1); | |||
| if (gy * 4 + 2 < p.outh) top_tm_blob_data[gi + 128] = afp2sfpvec4(sum2); | |||
| if (gy * 4 + 3 < p.outh) top_tm_blob_data[gi + 192] = afp2sfpvec4(sum3); | |||
| } | |||
| @@ -0,0 +1,368 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_AMD_gpu_shader_half_float: require | |||
| #endif | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outcstep; | |||
| int block_x; | |||
| int block_y; | |||
| } p; | |||
| void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= p.block_x || gy >= p.block_y || gz >= p.c) | |||
| return; | |||
| // load 8x8 | |||
| int v_offset_0 = gz * p.cstep + gy * 6 * p.w + gx * 6; | |||
| ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * p.w; | |||
| ivec4 n_offset = v_offset_0 + ivec4(4, 5, 6, 7) * p.w; | |||
| afpvec4 v00 = sfp2afpvec4(bottom_blob_data[v_offset.r + 0]); | |||
| afpvec4 v01 = sfp2afpvec4(bottom_blob_data[v_offset.r + 1]); | |||
| afpvec4 v02 = sfp2afpvec4(bottom_blob_data[v_offset.r + 2]); | |||
| afpvec4 v03 = sfp2afpvec4(bottom_blob_data[v_offset.r + 3]); | |||
| afpvec4 v04 = sfp2afpvec4(bottom_blob_data[v_offset.r + 4]); | |||
| afpvec4 v05 = sfp2afpvec4(bottom_blob_data[v_offset.r + 5]); | |||
| afpvec4 v06 = sfp2afpvec4(bottom_blob_data[v_offset.r + 6]); | |||
| afpvec4 v07 = sfp2afpvec4(bottom_blob_data[v_offset.r + 7]); | |||
| afpvec4 v10 = sfp2afpvec4(bottom_blob_data[v_offset.g + 0]); | |||
| afpvec4 v11 = sfp2afpvec4(bottom_blob_data[v_offset.g + 1]); | |||
| afpvec4 v12 = sfp2afpvec4(bottom_blob_data[v_offset.g + 2]); | |||
| afpvec4 v13 = sfp2afpvec4(bottom_blob_data[v_offset.g + 3]); | |||
| afpvec4 v14 = sfp2afpvec4(bottom_blob_data[v_offset.g + 4]); | |||
| afpvec4 v15 = sfp2afpvec4(bottom_blob_data[v_offset.g + 5]); | |||
| afpvec4 v16 = sfp2afpvec4(bottom_blob_data[v_offset.g + 6]); | |||
| afpvec4 v17 = sfp2afpvec4(bottom_blob_data[v_offset.g + 7]); | |||
| afpvec4 v20 = sfp2afpvec4(bottom_blob_data[v_offset.b + 0]); | |||
| afpvec4 v21 = sfp2afpvec4(bottom_blob_data[v_offset.b + 1]); | |||
| afpvec4 v22 = sfp2afpvec4(bottom_blob_data[v_offset.b + 2]); | |||
| afpvec4 v23 = sfp2afpvec4(bottom_blob_data[v_offset.b + 3]); | |||
| afpvec4 v24 = sfp2afpvec4(bottom_blob_data[v_offset.b + 4]); | |||
| afpvec4 v25 = sfp2afpvec4(bottom_blob_data[v_offset.b + 5]); | |||
| afpvec4 v26 = sfp2afpvec4(bottom_blob_data[v_offset.b + 6]); | |||
| afpvec4 v27 = sfp2afpvec4(bottom_blob_data[v_offset.b + 7]); | |||
| afpvec4 v30 = sfp2afpvec4(bottom_blob_data[v_offset.a + 0]); | |||
| afpvec4 v31 = sfp2afpvec4(bottom_blob_data[v_offset.a + 1]); | |||
| afpvec4 v32 = sfp2afpvec4(bottom_blob_data[v_offset.a + 2]); | |||
| afpvec4 v33 = sfp2afpvec4(bottom_blob_data[v_offset.a + 3]); | |||
| afpvec4 v34 = sfp2afpvec4(bottom_blob_data[v_offset.a + 4]); | |||
| afpvec4 v35 = sfp2afpvec4(bottom_blob_data[v_offset.a + 5]); | |||
| afpvec4 v36 = sfp2afpvec4(bottom_blob_data[v_offset.a + 6]); | |||
| afpvec4 v37 = sfp2afpvec4(bottom_blob_data[v_offset.a + 7]); | |||
| afpvec4 v40 = sfp2afpvec4(bottom_blob_data[n_offset.r + 0]); | |||
| afpvec4 v41 = sfp2afpvec4(bottom_blob_data[n_offset.r + 1]); | |||
| afpvec4 v42 = sfp2afpvec4(bottom_blob_data[n_offset.r + 2]); | |||
| afpvec4 v43 = sfp2afpvec4(bottom_blob_data[n_offset.r + 3]); | |||
| afpvec4 v44 = sfp2afpvec4(bottom_blob_data[n_offset.r + 4]); | |||
| afpvec4 v45 = sfp2afpvec4(bottom_blob_data[n_offset.r + 5]); | |||
| afpvec4 v46 = sfp2afpvec4(bottom_blob_data[n_offset.r + 6]); | |||
| afpvec4 v47 = sfp2afpvec4(bottom_blob_data[n_offset.r + 7]); | |||
| afpvec4 v50 = sfp2afpvec4(bottom_blob_data[n_offset.g + 0]); | |||
| afpvec4 v51 = sfp2afpvec4(bottom_blob_data[n_offset.g + 1]); | |||
| afpvec4 v52 = sfp2afpvec4(bottom_blob_data[n_offset.g + 2]); | |||
| afpvec4 v53 = sfp2afpvec4(bottom_blob_data[n_offset.g + 3]); | |||
| afpvec4 v54 = sfp2afpvec4(bottom_blob_data[n_offset.g + 4]); | |||
| afpvec4 v55 = sfp2afpvec4(bottom_blob_data[n_offset.g + 5]); | |||
| afpvec4 v56 = sfp2afpvec4(bottom_blob_data[n_offset.g + 6]); | |||
| afpvec4 v57 = sfp2afpvec4(bottom_blob_data[n_offset.g + 7]); | |||
| afpvec4 v60 = sfp2afpvec4(bottom_blob_data[n_offset.b + 0]); | |||
| afpvec4 v61 = sfp2afpvec4(bottom_blob_data[n_offset.b + 1]); | |||
| afpvec4 v62 = sfp2afpvec4(bottom_blob_data[n_offset.b + 2]); | |||
| afpvec4 v63 = sfp2afpvec4(bottom_blob_data[n_offset.b + 3]); | |||
| afpvec4 v64 = sfp2afpvec4(bottom_blob_data[n_offset.b + 4]); | |||
| afpvec4 v65 = sfp2afpvec4(bottom_blob_data[n_offset.b + 5]); | |||
| afpvec4 v66 = sfp2afpvec4(bottom_blob_data[n_offset.b + 6]); | |||
| afpvec4 v67 = sfp2afpvec4(bottom_blob_data[n_offset.b + 7]); | |||
| afpvec4 v70 = sfp2afpvec4(bottom_blob_data[n_offset.a + 0]); | |||
| afpvec4 v71 = sfp2afpvec4(bottom_blob_data[n_offset.a + 1]); | |||
| afpvec4 v72 = sfp2afpvec4(bottom_blob_data[n_offset.a + 2]); | |||
| afpvec4 v73 = sfp2afpvec4(bottom_blob_data[n_offset.a + 3]); | |||
| afpvec4 v74 = sfp2afpvec4(bottom_blob_data[n_offset.a + 4]); | |||
| afpvec4 v75 = sfp2afpvec4(bottom_blob_data[n_offset.a + 5]); | |||
| afpvec4 v76 = sfp2afpvec4(bottom_blob_data[n_offset.a + 6]); | |||
| afpvec4 v77 = sfp2afpvec4(bottom_blob_data[n_offset.a + 7]); | |||
| // const float itm[8][8] = { | |||
| // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, | |||
| // | |||
| // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, | |||
| // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, | |||
| // | |||
| // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, | |||
| // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, | |||
| // | |||
| // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, | |||
| // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, | |||
| // | |||
| // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} | |||
| // }; | |||
| // 0 = r00 - r06 + (r04 - r02) * 5.25 | |||
| // 7 = r07 - r01 + (r03 - r05) * 5.25 | |||
| // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) | |||
| // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) | |||
| // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) | |||
| // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) | |||
| // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) | |||
| // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) | |||
| // implicit transpose | |||
| afpvec4 m00 = v00 - v06 + (v04 - v02) * afp(5.25f); | |||
| afpvec4 m01 = v10 - v16 + (v14 - v12) * afp(5.25f); | |||
| afpvec4 m02 = v20 - v26 + (v24 - v22) * afp(5.25f); | |||
| afpvec4 m03 = v30 - v36 + (v34 - v32) * afp(5.25f); | |||
| afpvec4 m04 = v40 - v46 + (v44 - v42) * afp(5.25f); | |||
| afpvec4 m05 = v50 - v56 + (v54 - v52) * afp(5.25f); | |||
| afpvec4 m06 = v60 - v66 + (v64 - v62) * afp(5.25f); | |||
| afpvec4 m07 = v70 - v76 + (v74 - v72) * afp(5.25f); | |||
| afpvec4 m70 = v07 - v01 + (v03 - v05) * afp(5.25f); | |||
| afpvec4 m71 = v17 - v11 + (v13 - v15) * afp(5.25f); | |||
| afpvec4 m72 = v27 - v21 + (v23 - v25) * afp(5.25f); | |||
| afpvec4 m73 = v37 - v31 + (v33 - v35) * afp(5.25f); | |||
| afpvec4 m74 = v47 - v41 + (v43 - v45) * afp(5.25f); | |||
| afpvec4 m75 = v57 - v51 + (v53 - v55) * afp(5.25f); | |||
| afpvec4 m76 = v67 - v61 + (v63 - v65) * afp(5.25f); | |||
| afpvec4 m77 = v77 - v71 + (v73 - v75) * afp(5.25f); | |||
| afpvec4 m10 = (v02 + v06 - v04 * afp(4.25f)) + (v01 - v03 * afp(4.25f) + v05); | |||
| afpvec4 m11 = (v12 + v16 - v14 * afp(4.25f)) + (v11 - v13 * afp(4.25f) + v15); | |||
| afpvec4 m12 = (v22 + v26 - v24 * afp(4.25f)) + (v21 - v23 * afp(4.25f) + v25); | |||
| afpvec4 m13 = (v32 + v36 - v34 * afp(4.25f)) + (v31 - v33 * afp(4.25f) + v35); | |||
| afpvec4 m14 = (v42 + v46 - v44 * afp(4.25f)) + (v41 - v43 * afp(4.25f) + v45); | |||
| afpvec4 m15 = (v52 + v56 - v54 * afp(4.25f)) + (v51 - v53 * afp(4.25f) + v55); | |||
| afpvec4 m16 = (v62 + v66 - v64 * afp(4.25f)) + (v61 - v63 * afp(4.25f) + v65); | |||
| afpvec4 m17 = (v72 + v76 - v74 * afp(4.25f)) + (v71 - v73 * afp(4.25f) + v75); | |||
| afpvec4 m20 = (v02 + v06 - v04 * afp(4.25f)) - (v01 - v03 * afp(4.25f) + v05); | |||
| afpvec4 m21 = (v12 + v16 - v14 * afp(4.25f)) - (v11 - v13 * afp(4.25f) + v15); | |||
| afpvec4 m22 = (v22 + v26 - v24 * afp(4.25f)) - (v21 - v23 * afp(4.25f) + v25); | |||
| afpvec4 m23 = (v32 + v36 - v34 * afp(4.25f)) - (v31 - v33 * afp(4.25f) + v35); | |||
| afpvec4 m24 = (v42 + v46 - v44 * afp(4.25f)) - (v41 - v43 * afp(4.25f) + v45); | |||
| afpvec4 m25 = (v52 + v56 - v54 * afp(4.25f)) - (v51 - v53 * afp(4.25f) + v55); | |||
| afpvec4 m26 = (v62 + v66 - v64 * afp(4.25f)) - (v61 - v63 * afp(4.25f) + v65); | |||
| afpvec4 m27 = (v72 + v76 - v74 * afp(4.25f)) - (v71 - v73 * afp(4.25f) + v75); | |||
| afpvec4 m30 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) + (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f)); | |||
| afpvec4 m31 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) + (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f)); | |||
| afpvec4 m32 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) + (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f)); | |||
| afpvec4 m33 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) + (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f)); | |||
| afpvec4 m34 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) + (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f)); | |||
| afpvec4 m35 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) + (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f)); | |||
| afpvec4 m36 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) + (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f)); | |||
| afpvec4 m37 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) + (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f)); | |||
| afpvec4 m40 = (v06 + v02 * afp(0.25f) - v04 * afp(1.25f)) - (v01 * afp(0.5f) - v03 * afp(2.5f) + v05 * afp(2.f)); | |||
| afpvec4 m41 = (v16 + v12 * afp(0.25f) - v14 * afp(1.25f)) - (v11 * afp(0.5f) - v13 * afp(2.5f) + v15 * afp(2.f)); | |||
| afpvec4 m42 = (v26 + v22 * afp(0.25f) - v24 * afp(1.25f)) - (v21 * afp(0.5f) - v23 * afp(2.5f) + v25 * afp(2.f)); | |||
| afpvec4 m43 = (v36 + v32 * afp(0.25f) - v34 * afp(1.25f)) - (v31 * afp(0.5f) - v33 * afp(2.5f) + v35 * afp(2.f)); | |||
| afpvec4 m44 = (v46 + v42 * afp(0.25f) - v44 * afp(1.25f)) - (v41 * afp(0.5f) - v43 * afp(2.5f) + v45 * afp(2.f)); | |||
| afpvec4 m45 = (v56 + v52 * afp(0.25f) - v54 * afp(1.25f)) - (v51 * afp(0.5f) - v53 * afp(2.5f) + v55 * afp(2.f)); | |||
| afpvec4 m46 = (v66 + v62 * afp(0.25f) - v64 * afp(1.25f)) - (v61 * afp(0.5f) - v63 * afp(2.5f) + v65 * afp(2.f)); | |||
| afpvec4 m47 = (v76 + v72 * afp(0.25f) - v74 * afp(1.25f)) - (v71 * afp(0.5f) - v73 * afp(2.5f) + v75 * afp(2.f)); | |||
| afpvec4 m50 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) + (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f)); | |||
| afpvec4 m51 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) + (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f)); | |||
| afpvec4 m52 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) + (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f)); | |||
| afpvec4 m53 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) + (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f)); | |||
| afpvec4 m54 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) + (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f)); | |||
| afpvec4 m55 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) + (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f)); | |||
| afpvec4 m56 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) + (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f)); | |||
| afpvec4 m57 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) + (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f)); | |||
| afpvec4 m60 = (v06 + (v02 - v04 * afp(1.25f)) * afp(4.f)) - (v01 * afp(2.f) - v03 * afp(2.5f) + v05 * afp(0.5f)); | |||
| afpvec4 m61 = (v16 + (v12 - v14 * afp(1.25f)) * afp(4.f)) - (v11 * afp(2.f) - v13 * afp(2.5f) + v15 * afp(0.5f)); | |||
| afpvec4 m62 = (v26 + (v22 - v24 * afp(1.25f)) * afp(4.f)) - (v21 * afp(2.f) - v23 * afp(2.5f) + v25 * afp(0.5f)); | |||
| afpvec4 m63 = (v36 + (v32 - v34 * afp(1.25f)) * afp(4.f)) - (v31 * afp(2.f) - v33 * afp(2.5f) + v35 * afp(0.5f)); | |||
| afpvec4 m64 = (v46 + (v42 - v44 * afp(1.25f)) * afp(4.f)) - (v41 * afp(2.f) - v43 * afp(2.5f) + v45 * afp(0.5f)); | |||
| afpvec4 m65 = (v56 + (v52 - v54 * afp(1.25f)) * afp(4.f)) - (v51 * afp(2.f) - v53 * afp(2.5f) + v55 * afp(0.5f)); | |||
| afpvec4 m66 = (v66 + (v62 - v64 * afp(1.25f)) * afp(4.f)) - (v61 * afp(2.f) - v63 * afp(2.5f) + v65 * afp(0.5f)); | |||
| afpvec4 m67 = (v76 + (v72 - v74 * afp(1.25f)) * afp(4.f)) - (v71 * afp(2.f) - v73 * afp(2.5f) + v75 * afp(0.5f)); | |||
| v00 = m00 - m06 + (m04 - m02) * afp(5.25f); | |||
| v10 = m10 - m16 + (m14 - m12) * afp(5.25f); | |||
| v20 = m20 - m26 + (m24 - m22) * afp(5.25f); | |||
| v30 = m30 - m36 + (m34 - m32) * afp(5.25f); | |||
| v40 = m40 - m46 + (m44 - m42) * afp(5.25f); | |||
| v50 = m50 - m56 + (m54 - m52) * afp(5.25f); | |||
| v60 = m60 - m66 + (m64 - m62) * afp(5.25f); | |||
| v70 = m70 - m76 + (m74 - m72) * afp(5.25f); | |||
| v07 = m07 - m01 + (m03 - m05) * afp(5.25f); | |||
| v17 = m17 - m11 + (m13 - m15) * afp(5.25f); | |||
| v27 = m27 - m21 + (m23 - m25) * afp(5.25f); | |||
| v37 = m37 - m31 + (m33 - m35) * afp(5.25f); | |||
| v47 = m47 - m41 + (m43 - m45) * afp(5.25f); | |||
| v57 = m57 - m51 + (m53 - m55) * afp(5.25f); | |||
| v67 = m67 - m61 + (m63 - m65) * afp(5.25f); | |||
| v77 = m77 - m71 + (m73 - m75) * afp(5.25f); | |||
| v01 = (m02 + m06 - m04 * afp(4.25f)) + (m01 - m03 * afp(4.25f) + m05); | |||
| v11 = (m12 + m16 - m14 * afp(4.25f)) + (m11 - m13 * afp(4.25f) + m15); | |||
| v21 = (m22 + m26 - m24 * afp(4.25f)) + (m21 - m23 * afp(4.25f) + m25); | |||
| v31 = (m32 + m36 - m34 * afp(4.25f)) + (m31 - m33 * afp(4.25f) + m35); | |||
| v41 = (m42 + m46 - m44 * afp(4.25f)) + (m41 - m43 * afp(4.25f) + m45); | |||
| v51 = (m52 + m56 - m54 * afp(4.25f)) + (m51 - m53 * afp(4.25f) + m55); | |||
| v61 = (m62 + m66 - m64 * afp(4.25f)) + (m61 - m63 * afp(4.25f) + m65); | |||
| v71 = (m72 + m76 - m74 * afp(4.25f)) + (m71 - m73 * afp(4.25f) + m75); | |||
| v02 = (m02 + m06 - m04 * afp(4.25f)) - (m01 - m03 * afp(4.25f) + m05); | |||
| v12 = (m12 + m16 - m14 * afp(4.25f)) - (m11 - m13 * afp(4.25f) + m15); | |||
| v22 = (m22 + m26 - m24 * afp(4.25f)) - (m21 - m23 * afp(4.25f) + m25); | |||
| v32 = (m32 + m36 - m34 * afp(4.25f)) - (m31 - m33 * afp(4.25f) + m35); | |||
| v42 = (m42 + m46 - m44 * afp(4.25f)) - (m41 - m43 * afp(4.25f) + m45); | |||
| v52 = (m52 + m56 - m54 * afp(4.25f)) - (m51 - m53 * afp(4.25f) + m55); | |||
| v62 = (m62 + m66 - m64 * afp(4.25f)) - (m61 - m63 * afp(4.25f) + m65); | |||
| v72 = (m72 + m76 - m74 * afp(4.25f)) - (m71 - m73 * afp(4.25f) + m75); | |||
| v03 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) + (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f)); | |||
| v13 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) + (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f)); | |||
| v23 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) + (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f)); | |||
| v33 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) + (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f)); | |||
| v43 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) + (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f)); | |||
| v53 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) + (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f)); | |||
| v63 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) + (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f)); | |||
| v73 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) + (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f)); | |||
| v04 = (m06 + m02 * afp(0.25f) - m04 * afp(1.25f)) - (m01 * afp(0.5f) - m03 * afp(2.5f) + m05 * afp(2.f)); | |||
| v14 = (m16 + m12 * afp(0.25f) - m14 * afp(1.25f)) - (m11 * afp(0.5f) - m13 * afp(2.5f) + m15 * afp(2.f)); | |||
| v24 = (m26 + m22 * afp(0.25f) - m24 * afp(1.25f)) - (m21 * afp(0.5f) - m23 * afp(2.5f) + m25 * afp(2.f)); | |||
| v34 = (m36 + m32 * afp(0.25f) - m34 * afp(1.25f)) - (m31 * afp(0.5f) - m33 * afp(2.5f) + m35 * afp(2.f)); | |||
| v44 = (m46 + m42 * afp(0.25f) - m44 * afp(1.25f)) - (m41 * afp(0.5f) - m43 * afp(2.5f) + m45 * afp(2.f)); | |||
| v54 = (m56 + m52 * afp(0.25f) - m54 * afp(1.25f)) - (m51 * afp(0.5f) - m53 * afp(2.5f) + m55 * afp(2.f)); | |||
| v64 = (m66 + m62 * afp(0.25f) - m64 * afp(1.25f)) - (m61 * afp(0.5f) - m63 * afp(2.5f) + m65 * afp(2.f)); | |||
| v74 = (m76 + m72 * afp(0.25f) - m74 * afp(1.25f)) - (m71 * afp(0.5f) - m73 * afp(2.5f) + m75 * afp(2.f)); | |||
| v05 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) + (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f)); | |||
| v15 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) + (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f)); | |||
| v25 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) + (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f)); | |||
| v35 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) + (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f)); | |||
| v45 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) + (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f)); | |||
| v55 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) + (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f)); | |||
| v65 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) + (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f)); | |||
| v75 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) + (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f)); | |||
| v06 = (m06 + (m02 - m04 * afp(1.25f)) * afp(4.f)) - (m01 * afp(2.f) - m03 * afp(2.5f) + m05 * afp(0.5f)); | |||
| v16 = (m16 + (m12 - m14 * afp(1.25f)) * afp(4.f)) - (m11 * afp(2.f) - m13 * afp(2.5f) + m15 * afp(0.5f)); | |||
| v26 = (m26 + (m22 - m24 * afp(1.25f)) * afp(4.f)) - (m21 * afp(2.f) - m23 * afp(2.5f) + m25 * afp(0.5f)); | |||
| v36 = (m36 + (m32 - m34 * afp(1.25f)) * afp(4.f)) - (m31 * afp(2.f) - m33 * afp(2.5f) + m35 * afp(0.5f)); | |||
| v46 = (m46 + (m42 - m44 * afp(1.25f)) * afp(4.f)) - (m41 * afp(2.f) - m43 * afp(2.5f) + m45 * afp(0.5f)); | |||
| v56 = (m56 + (m52 - m54 * afp(1.25f)) * afp(4.f)) - (m51 * afp(2.f) - m53 * afp(2.5f) + m55 * afp(0.5f)); | |||
| v66 = (m66 + (m62 - m64 * afp(1.25f)) * afp(4.f)) - (m61 * afp(2.f) - m63 * afp(2.5f) + m65 * afp(0.5f)); | |||
| v76 = (m76 + (m72 - m74 * afp(1.25f)) * afp(4.f)) - (m71 * afp(2.f) - m73 * afp(2.5f) + m75 * afp(0.5f)); | |||
| // store 64 | |||
| int v_tm_offset = gz * p.outcstep + (gy * p.block_x + gx) * 64; | |||
| bottom_tm_blob_data[v_tm_offset + 0] = afp2sfpvec4(v00); | |||
| bottom_tm_blob_data[v_tm_offset + 1] = afp2sfpvec4(v01); | |||
| bottom_tm_blob_data[v_tm_offset + 2] = afp2sfpvec4(v02); | |||
| bottom_tm_blob_data[v_tm_offset + 3] = afp2sfpvec4(v03); | |||
| bottom_tm_blob_data[v_tm_offset + 4] = afp2sfpvec4(v04); | |||
| bottom_tm_blob_data[v_tm_offset + 5] = afp2sfpvec4(v05); | |||
| bottom_tm_blob_data[v_tm_offset + 6] = afp2sfpvec4(v06); | |||
| bottom_tm_blob_data[v_tm_offset + 7] = afp2sfpvec4(v07); | |||
| bottom_tm_blob_data[v_tm_offset + 8] = afp2sfpvec4(v10); | |||
| bottom_tm_blob_data[v_tm_offset + 9] = afp2sfpvec4(v11); | |||
| bottom_tm_blob_data[v_tm_offset + 10] = afp2sfpvec4(v12); | |||
| bottom_tm_blob_data[v_tm_offset + 11] = afp2sfpvec4(v13); | |||
| bottom_tm_blob_data[v_tm_offset + 12] = afp2sfpvec4(v14); | |||
| bottom_tm_blob_data[v_tm_offset + 13] = afp2sfpvec4(v15); | |||
| bottom_tm_blob_data[v_tm_offset + 14] = afp2sfpvec4(v16); | |||
| bottom_tm_blob_data[v_tm_offset + 15] = afp2sfpvec4(v17); | |||
| bottom_tm_blob_data[v_tm_offset + 16] = afp2sfpvec4(v20); | |||
| bottom_tm_blob_data[v_tm_offset + 17] = afp2sfpvec4(v21); | |||
| bottom_tm_blob_data[v_tm_offset + 18] = afp2sfpvec4(v22); | |||
| bottom_tm_blob_data[v_tm_offset + 19] = afp2sfpvec4(v23); | |||
| bottom_tm_blob_data[v_tm_offset + 20] = afp2sfpvec4(v24); | |||
| bottom_tm_blob_data[v_tm_offset + 21] = afp2sfpvec4(v25); | |||
| bottom_tm_blob_data[v_tm_offset + 22] = afp2sfpvec4(v26); | |||
| bottom_tm_blob_data[v_tm_offset + 23] = afp2sfpvec4(v27); | |||
| bottom_tm_blob_data[v_tm_offset + 24] = afp2sfpvec4(v30); | |||
| bottom_tm_blob_data[v_tm_offset + 25] = afp2sfpvec4(v31); | |||
| bottom_tm_blob_data[v_tm_offset + 26] = afp2sfpvec4(v32); | |||
| bottom_tm_blob_data[v_tm_offset + 27] = afp2sfpvec4(v33); | |||
| bottom_tm_blob_data[v_tm_offset + 28] = afp2sfpvec4(v34); | |||
| bottom_tm_blob_data[v_tm_offset + 29] = afp2sfpvec4(v35); | |||
| bottom_tm_blob_data[v_tm_offset + 30] = afp2sfpvec4(v36); | |||
| bottom_tm_blob_data[v_tm_offset + 31] = afp2sfpvec4(v37); | |||
| bottom_tm_blob_data[v_tm_offset + 32] = afp2sfpvec4(v40); | |||
| bottom_tm_blob_data[v_tm_offset + 33] = afp2sfpvec4(v41); | |||
| bottom_tm_blob_data[v_tm_offset + 34] = afp2sfpvec4(v42); | |||
| bottom_tm_blob_data[v_tm_offset + 35] = afp2sfpvec4(v43); | |||
| bottom_tm_blob_data[v_tm_offset + 36] = afp2sfpvec4(v44); | |||
| bottom_tm_blob_data[v_tm_offset + 37] = afp2sfpvec4(v45); | |||
| bottom_tm_blob_data[v_tm_offset + 38] = afp2sfpvec4(v46); | |||
| bottom_tm_blob_data[v_tm_offset + 39] = afp2sfpvec4(v47); | |||
| bottom_tm_blob_data[v_tm_offset + 40] = afp2sfpvec4(v50); | |||
| bottom_tm_blob_data[v_tm_offset + 41] = afp2sfpvec4(v51); | |||
| bottom_tm_blob_data[v_tm_offset + 42] = afp2sfpvec4(v52); | |||
| bottom_tm_blob_data[v_tm_offset + 43] = afp2sfpvec4(v53); | |||
| bottom_tm_blob_data[v_tm_offset + 44] = afp2sfpvec4(v54); | |||
| bottom_tm_blob_data[v_tm_offset + 45] = afp2sfpvec4(v55); | |||
| bottom_tm_blob_data[v_tm_offset + 46] = afp2sfpvec4(v56); | |||
| bottom_tm_blob_data[v_tm_offset + 47] = afp2sfpvec4(v57); | |||
| bottom_tm_blob_data[v_tm_offset + 48] = afp2sfpvec4(v60); | |||
| bottom_tm_blob_data[v_tm_offset + 49] = afp2sfpvec4(v61); | |||
| bottom_tm_blob_data[v_tm_offset + 50] = afp2sfpvec4(v62); | |||
| bottom_tm_blob_data[v_tm_offset + 51] = afp2sfpvec4(v63); | |||
| bottom_tm_blob_data[v_tm_offset + 52] = afp2sfpvec4(v64); | |||
| bottom_tm_blob_data[v_tm_offset + 53] = afp2sfpvec4(v65); | |||
| bottom_tm_blob_data[v_tm_offset + 54] = afp2sfpvec4(v66); | |||
| bottom_tm_blob_data[v_tm_offset + 55] = afp2sfpvec4(v67); | |||
| bottom_tm_blob_data[v_tm_offset + 56] = afp2sfpvec4(v70); | |||
| bottom_tm_blob_data[v_tm_offset + 57] = afp2sfpvec4(v71); | |||
| bottom_tm_blob_data[v_tm_offset + 58] = afp2sfpvec4(v72); | |||
| bottom_tm_blob_data[v_tm_offset + 59] = afp2sfpvec4(v73); | |||
| bottom_tm_blob_data[v_tm_offset + 60] = afp2sfpvec4(v74); | |||
| bottom_tm_blob_data[v_tm_offset + 61] = afp2sfpvec4(v75); | |||
| bottom_tm_blob_data[v_tm_offset + 62] = afp2sfpvec4(v76); | |||
| bottom_tm_blob_data[v_tm_offset + 63] = afp2sfpvec4(v77); | |||
| } | |||
| @@ -0,0 +1,583 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_AMD_gpu_shader_half_float: require | |||
| #endif | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int c; | |||
| int cstep; | |||
| int block_x; | |||
| int block_y; | |||
| int outw; | |||
| int outh; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= p.block_x || gy >= p.block_y || gz >= p.c) | |||
| return; | |||
| // load 64 | |||
| int v_tm_offset = gz * p.cstep + (gy * p.block_x + gx) * 64; | |||
| afpvec4 v00 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 0]); | |||
| afpvec4 v01 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 1]); | |||
| afpvec4 v02 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 2]); | |||
| afpvec4 v03 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 3]); | |||
| afpvec4 v04 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 4]); | |||
| afpvec4 v05 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 5]); | |||
| afpvec4 v06 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 6]); | |||
| afpvec4 v07 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 7]); | |||
| afpvec4 v10 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 8]); | |||
| afpvec4 v11 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 9]); | |||
| afpvec4 v12 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 10]); | |||
| afpvec4 v13 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 11]); | |||
| afpvec4 v14 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 12]); | |||
| afpvec4 v15 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 13]); | |||
| afpvec4 v16 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 14]); | |||
| afpvec4 v17 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 15]); | |||
| afpvec4 v20 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 16]); | |||
| afpvec4 v21 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 17]); | |||
| afpvec4 v22 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 18]); | |||
| afpvec4 v23 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 19]); | |||
| afpvec4 v24 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 20]); | |||
| afpvec4 v25 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 21]); | |||
| afpvec4 v26 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 22]); | |||
| afpvec4 v27 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 23]); | |||
| afpvec4 v30 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 24]); | |||
| afpvec4 v31 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 25]); | |||
| afpvec4 v32 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 26]); | |||
| afpvec4 v33 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 27]); | |||
| afpvec4 v34 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 28]); | |||
| afpvec4 v35 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 29]); | |||
| afpvec4 v36 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 30]); | |||
| afpvec4 v37 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 31]); | |||
| afpvec4 v40 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 32]); | |||
| afpvec4 v41 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 33]); | |||
| afpvec4 v42 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 34]); | |||
| afpvec4 v43 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 35]); | |||
| afpvec4 v44 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 36]); | |||
| afpvec4 v45 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 37]); | |||
| afpvec4 v46 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 38]); | |||
| afpvec4 v47 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 39]); | |||
| afpvec4 v50 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 40]); | |||
| afpvec4 v51 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 41]); | |||
| afpvec4 v52 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 42]); | |||
| afpvec4 v53 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 43]); | |||
| afpvec4 v54 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 44]); | |||
| afpvec4 v55 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 45]); | |||
| afpvec4 v56 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 46]); | |||
| afpvec4 v57 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 47]); | |||
| afpvec4 v60 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 48]); | |||
| afpvec4 v61 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 49]); | |||
| afpvec4 v62 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 50]); | |||
| afpvec4 v63 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 51]); | |||
| afpvec4 v64 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 52]); | |||
| afpvec4 v65 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 53]); | |||
| afpvec4 v66 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 54]); | |||
| afpvec4 v67 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 55]); | |||
| afpvec4 v70 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 56]); | |||
| afpvec4 v71 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 57]); | |||
| afpvec4 v72 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 58]); | |||
| afpvec4 v73 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 59]); | |||
| afpvec4 v74 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 60]); | |||
| afpvec4 v75 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 61]); | |||
| afpvec4 v76 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 62]); | |||
| afpvec4 v77 = sfp2afpvec4(top_tm_blob_data[v_tm_offset + 63]); | |||
| // const float otm[6][8] = { | |||
| // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} | |||
| // }; | |||
| // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32 | |||
| // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 | |||
| // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 | |||
| // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 | |||
| // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 | |||
| // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) | |||
| afpvec4 v01_a_v02 = v01 + v02; | |||
| afpvec4 v11_a_v12 = v11 + v12; | |||
| afpvec4 v21_a_v22 = v21 + v22; | |||
| afpvec4 v31_a_v32 = v31 + v32; | |||
| afpvec4 v41_a_v42 = v41 + v42; | |||
| afpvec4 v51_a_v52 = v51 + v52; | |||
| afpvec4 v61_a_v62 = v61 + v62; | |||
| afpvec4 v71_a_v72 = v71 + v72; | |||
| afpvec4 v01_m_v02 = v01 - v02; | |||
| afpvec4 v11_m_v12 = v11 - v12; | |||
| afpvec4 v21_m_v22 = v21 - v22; | |||
| afpvec4 v31_m_v32 = v31 - v32; | |||
| afpvec4 v41_m_v42 = v41 - v42; | |||
| afpvec4 v51_m_v52 = v51 - v52; | |||
| afpvec4 v61_m_v62 = v61 - v62; | |||
| afpvec4 v71_m_v72 = v71 - v72; | |||
| afpvec4 v03_a_v04 = v03 + v04; | |||
| afpvec4 v13_a_v14 = v13 + v14; | |||
| afpvec4 v23_a_v24 = v23 + v24; | |||
| afpvec4 v33_a_v34 = v33 + v34; | |||
| afpvec4 v43_a_v44 = v43 + v44; | |||
| afpvec4 v53_a_v54 = v53 + v54; | |||
| afpvec4 v63_a_v64 = v63 + v64; | |||
| afpvec4 v73_a_v74 = v73 + v74; | |||
| afpvec4 v03_m_v04 = v03 - v04; | |||
| afpvec4 v13_m_v14 = v13 - v14; | |||
| afpvec4 v23_m_v24 = v23 - v24; | |||
| afpvec4 v33_m_v34 = v33 - v34; | |||
| afpvec4 v43_m_v44 = v43 - v44; | |||
| afpvec4 v53_m_v54 = v53 - v54; | |||
| afpvec4 v63_m_v64 = v63 - v64; | |||
| afpvec4 v73_m_v74 = v73 - v74; | |||
| afpvec4 v05_a_v06 = v05 + v06; | |||
| afpvec4 v15_a_v16 = v15 + v16; | |||
| afpvec4 v25_a_v26 = v25 + v26; | |||
| afpvec4 v35_a_v36 = v35 + v36; | |||
| afpvec4 v45_a_v46 = v45 + v46; | |||
| afpvec4 v55_a_v56 = v55 + v56; | |||
| afpvec4 v65_a_v66 = v65 + v66; | |||
| afpvec4 v75_a_v76 = v75 + v76; | |||
| afpvec4 v05_m_v06 = v05 - v06; | |||
| afpvec4 v15_m_v16 = v15 - v16; | |||
| afpvec4 v25_m_v26 = v25 - v26; | |||
| afpvec4 v35_m_v36 = v35 - v36; | |||
| afpvec4 v45_m_v46 = v45 - v46; | |||
| afpvec4 v55_m_v56 = v55 - v56; | |||
| afpvec4 v65_m_v66 = v65 - v66; | |||
| afpvec4 v75_m_v76 = v75 - v76; | |||
| // implicit transpose | |||
| afpvec4 m00 = v00 + v01_a_v02 + v03_a_v04 + v05_a_v06 * afp(32.f); | |||
| afpvec4 m01 = v10 + v11_a_v12 + v13_a_v14 + v15_a_v16 * afp(32.f); | |||
| afpvec4 m02 = v20 + v21_a_v22 + v23_a_v24 + v25_a_v26 * afp(32.f); | |||
| afpvec4 m03 = v30 + v31_a_v32 + v33_a_v34 + v35_a_v36 * afp(32.f); | |||
| afpvec4 m04 = v40 + v41_a_v42 + v43_a_v44 + v45_a_v46 * afp(32.f); | |||
| afpvec4 m05 = v50 + v51_a_v52 + v53_a_v54 + v55_a_v56 * afp(32.f); | |||
| afpvec4 m06 = v60 + v61_a_v62 + v63_a_v64 + v65_a_v66 * afp(32.f); | |||
| afpvec4 m07 = v70 + v71_a_v72 + v73_a_v74 + v75_a_v76 * afp(32.f); | |||
| afpvec4 m10 = v01_m_v02 + v03_m_v04 * afp(2.f) + v05_m_v06 * afp(16.f); | |||
| afpvec4 m11 = v11_m_v12 + v13_m_v14 * afp(2.f) + v15_m_v16 * afp(16.f); | |||
| afpvec4 m12 = v21_m_v22 + v23_m_v24 * afp(2.f) + v25_m_v26 * afp(16.f); | |||
| afpvec4 m13 = v31_m_v32 + v33_m_v34 * afp(2.f) + v35_m_v36 * afp(16.f); | |||
| afpvec4 m14 = v41_m_v42 + v43_m_v44 * afp(2.f) + v45_m_v46 * afp(16.f); | |||
| afpvec4 m15 = v51_m_v52 + v53_m_v54 * afp(2.f) + v55_m_v56 * afp(16.f); | |||
| afpvec4 m16 = v61_m_v62 + v63_m_v64 * afp(2.f) + v65_m_v66 * afp(16.f); | |||
| afpvec4 m17 = v71_m_v72 + v73_m_v74 * afp(2.f) + v75_m_v76 * afp(16.f); | |||
| afpvec4 m20 = v01_a_v02 + v03_a_v04 * afp(4.f) + v05_a_v06 * afp(8.f); | |||
| afpvec4 m21 = v11_a_v12 + v13_a_v14 * afp(4.f) + v15_a_v16 * afp(8.f); | |||
| afpvec4 m22 = v21_a_v22 + v23_a_v24 * afp(4.f) + v25_a_v26 * afp(8.f); | |||
| afpvec4 m23 = v31_a_v32 + v33_a_v34 * afp(4.f) + v35_a_v36 * afp(8.f); | |||
| afpvec4 m24 = v41_a_v42 + v43_a_v44 * afp(4.f) + v45_a_v46 * afp(8.f); | |||
| afpvec4 m25 = v51_a_v52 + v53_a_v54 * afp(4.f) + v55_a_v56 * afp(8.f); | |||
| afpvec4 m26 = v61_a_v62 + v63_a_v64 * afp(4.f) + v65_a_v66 * afp(8.f); | |||
| afpvec4 m27 = v71_a_v72 + v73_a_v74 * afp(4.f) + v75_a_v76 * afp(8.f); | |||
| afpvec4 m30 = v01_m_v02 + v03_m_v04 * afp(8.f) + v05_m_v06 * afp(4.f); | |||
| afpvec4 m31 = v11_m_v12 + v13_m_v14 * afp(8.f) + v15_m_v16 * afp(4.f); | |||
| afpvec4 m32 = v21_m_v22 + v23_m_v24 * afp(8.f) + v25_m_v26 * afp(4.f); | |||
| afpvec4 m33 = v31_m_v32 + v33_m_v34 * afp(8.f) + v35_m_v36 * afp(4.f); | |||
| afpvec4 m34 = v41_m_v42 + v43_m_v44 * afp(8.f) + v45_m_v46 * afp(4.f); | |||
| afpvec4 m35 = v51_m_v52 + v53_m_v54 * afp(8.f) + v55_m_v56 * afp(4.f); | |||
| afpvec4 m36 = v61_m_v62 + v63_m_v64 * afp(8.f) + v65_m_v66 * afp(4.f); | |||
| afpvec4 m37 = v71_m_v72 + v73_m_v74 * afp(8.f) + v75_m_v76 * afp(4.f); | |||
| afpvec4 m40 = v01_a_v02 + v03_a_v04 * afp(16.f) + v05_a_v06 * afp(2.f); | |||
| afpvec4 m41 = v11_a_v12 + v13_a_v14 * afp(16.f) + v15_a_v16 * afp(2.f); | |||
| afpvec4 m42 = v21_a_v22 + v23_a_v24 * afp(16.f) + v25_a_v26 * afp(2.f); | |||
| afpvec4 m43 = v31_a_v32 + v33_a_v34 * afp(16.f) + v35_a_v36 * afp(2.f); | |||
| afpvec4 m44 = v41_a_v42 + v43_a_v44 * afp(16.f) + v45_a_v46 * afp(2.f); | |||
| afpvec4 m45 = v51_a_v52 + v53_a_v54 * afp(16.f) + v55_a_v56 * afp(2.f); | |||
| afpvec4 m46 = v61_a_v62 + v63_a_v64 * afp(16.f) + v65_a_v66 * afp(2.f); | |||
| afpvec4 m47 = v71_a_v72 + v73_a_v74 * afp(16.f) + v75_a_v76 * afp(2.f); | |||
| afpvec4 m50 = v07 + v01_m_v02 + v03_m_v04 * afp(32.f) + v05_m_v06; | |||
| afpvec4 m51 = v17 + v11_m_v12 + v13_m_v14 * afp(32.f) + v15_m_v16; | |||
| afpvec4 m52 = v27 + v21_m_v22 + v23_m_v24 * afp(32.f) + v25_m_v26; | |||
| afpvec4 m53 = v37 + v31_m_v32 + v33_m_v34 * afp(32.f) + v35_m_v36; | |||
| afpvec4 m54 = v47 + v41_m_v42 + v43_m_v44 * afp(32.f) + v45_m_v46; | |||
| afpvec4 m55 = v57 + v51_m_v52 + v53_m_v54 * afp(32.f) + v55_m_v56; | |||
| afpvec4 m56 = v67 + v61_m_v62 + v63_m_v64 * afp(32.f) + v65_m_v66; | |||
| afpvec4 m57 = v77 + v71_m_v72 + v73_m_v74 * afp(32.f) + v75_m_v76; | |||
| afpvec4 m01_a_m02 = m01 + m02; | |||
| afpvec4 m11_a_m12 = m11 + m12; | |||
| afpvec4 m21_a_m22 = m21 + m22; | |||
| afpvec4 m31_a_m32 = m31 + m32; | |||
| afpvec4 m41_a_m42 = m41 + m42; | |||
| afpvec4 m51_a_m52 = m51 + m52; | |||
| afpvec4 m01_m_m02 = m01 - m02; | |||
| afpvec4 m11_m_m12 = m11 - m12; | |||
| afpvec4 m21_m_m22 = m21 - m22; | |||
| afpvec4 m31_m_m32 = m31 - m32; | |||
| afpvec4 m41_m_m42 = m41 - m42; | |||
| afpvec4 m51_m_m52 = m51 - m52; | |||
| afpvec4 m03_a_m04 = m03 + m04; | |||
| afpvec4 m13_a_m14 = m13 + m14; | |||
| afpvec4 m23_a_m24 = m23 + m24; | |||
| afpvec4 m33_a_m34 = m33 + m34; | |||
| afpvec4 m43_a_m44 = m43 + m44; | |||
| afpvec4 m53_a_m54 = m53 + m54; | |||
| afpvec4 m03_m_m04 = m03 - m04; | |||
| afpvec4 m13_m_m14 = m13 - m14; | |||
| afpvec4 m23_m_m24 = m23 - m24; | |||
| afpvec4 m33_m_m34 = m33 - m34; | |||
| afpvec4 m43_m_m44 = m43 - m44; | |||
| afpvec4 m53_m_m54 = m53 - m54; | |||
| afpvec4 m05_a_m06 = m05 + m06; | |||
| afpvec4 m15_a_m16 = m15 + m16; | |||
| afpvec4 m25_a_m26 = m25 + m26; | |||
| afpvec4 m35_a_m36 = m35 + m36; | |||
| afpvec4 m45_a_m46 = m45 + m46; | |||
| afpvec4 m55_a_m56 = m55 + m56; | |||
| afpvec4 m05_m_m06 = m05 - m06; | |||
| afpvec4 m15_m_m16 = m15 - m16; | |||
| afpvec4 m25_m_m26 = m25 - m26; | |||
| afpvec4 m35_m_m36 = m35 - m36; | |||
| afpvec4 m45_m_m46 = m45 - m46; | |||
| afpvec4 m55_m_m56 = m55 - m56; | |||
| if (bias_term == 1) | |||
| { | |||
| const afpvec4 bias_value = sfp2afpvec4(bias_data[gz]); | |||
| v00 = bias_value + m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f); | |||
| v10 = bias_value + m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f); | |||
| v20 = bias_value + m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f); | |||
| v30 = bias_value + m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f); | |||
| v40 = bias_value + m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f); | |||
| v50 = bias_value + m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f); | |||
| v01 = bias_value + m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f); | |||
| v11 = bias_value + m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f); | |||
| v21 = bias_value + m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f); | |||
| v31 = bias_value + m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f); | |||
| v41 = bias_value + m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f); | |||
| v51 = bias_value + m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f); | |||
| v02 = bias_value + m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f); | |||
| v12 = bias_value + m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f); | |||
| v22 = bias_value + m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f); | |||
| v32 = bias_value + m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f); | |||
| v42 = bias_value + m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f); | |||
| v52 = bias_value + m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f); | |||
| v03 = bias_value + m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f); | |||
| v13 = bias_value + m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f); | |||
| v23 = bias_value + m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f); | |||
| v33 = bias_value + m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f); | |||
| v43 = bias_value + m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f); | |||
| v53 = bias_value + m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f); | |||
| v04 = bias_value + m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f); | |||
| v14 = bias_value + m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f); | |||
| v24 = bias_value + m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f); | |||
| v34 = bias_value + m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f); | |||
| v44 = bias_value + m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f); | |||
| v54 = bias_value + m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f); | |||
| v05 = bias_value + m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06; | |||
| v15 = bias_value + m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16; | |||
| v25 = bias_value + m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26; | |||
| v35 = bias_value + m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36; | |||
| v45 = bias_value + m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46; | |||
| v55 = bias_value + m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56; | |||
| } | |||
| else | |||
| { | |||
| v00 = m00 + m01_a_m02 + m03_a_m04 + m05_a_m06 * afp(32.f); | |||
| v10 = m10 + m11_a_m12 + m13_a_m14 + m15_a_m16 * afp(32.f); | |||
| v20 = m20 + m21_a_m22 + m23_a_m24 + m25_a_m26 * afp(32.f); | |||
| v30 = m30 + m31_a_m32 + m33_a_m34 + m35_a_m36 * afp(32.f); | |||
| v40 = m40 + m41_a_m42 + m43_a_m44 + m45_a_m46 * afp(32.f); | |||
| v50 = m50 + m51_a_m52 + m53_a_m54 + m55_a_m56 * afp(32.f); | |||
| v01 = m01_m_m02 + m03_m_m04 * afp(2.f) + m05_m_m06 * afp(16.f); | |||
| v11 = m11_m_m12 + m13_m_m14 * afp(2.f) + m15_m_m16 * afp(16.f); | |||
| v21 = m21_m_m22 + m23_m_m24 * afp(2.f) + m25_m_m26 * afp(16.f); | |||
| v31 = m31_m_m32 + m33_m_m34 * afp(2.f) + m35_m_m36 * afp(16.f); | |||
| v41 = m41_m_m42 + m43_m_m44 * afp(2.f) + m45_m_m46 * afp(16.f); | |||
| v51 = m51_m_m52 + m53_m_m54 * afp(2.f) + m55_m_m56 * afp(16.f); | |||
| v02 = m01_a_m02 + m03_a_m04 * afp(4.f) + m05_a_m06 * afp(8.f); | |||
| v12 = m11_a_m12 + m13_a_m14 * afp(4.f) + m15_a_m16 * afp(8.f); | |||
| v22 = m21_a_m22 + m23_a_m24 * afp(4.f) + m25_a_m26 * afp(8.f); | |||
| v32 = m31_a_m32 + m33_a_m34 * afp(4.f) + m35_a_m36 * afp(8.f); | |||
| v42 = m41_a_m42 + m43_a_m44 * afp(4.f) + m45_a_m46 * afp(8.f); | |||
| v52 = m51_a_m52 + m53_a_m54 * afp(4.f) + m55_a_m56 * afp(8.f); | |||
| v03 = m01_m_m02 + m03_m_m04 * afp(8.f) + m05_m_m06 * afp(4.f); | |||
| v13 = m11_m_m12 + m13_m_m14 * afp(8.f) + m15_m_m16 * afp(4.f); | |||
| v23 = m21_m_m22 + m23_m_m24 * afp(8.f) + m25_m_m26 * afp(4.f); | |||
| v33 = m31_m_m32 + m33_m_m34 * afp(8.f) + m35_m_m36 * afp(4.f); | |||
| v43 = m41_m_m42 + m43_m_m44 * afp(8.f) + m45_m_m46 * afp(4.f); | |||
| v53 = m51_m_m52 + m53_m_m54 * afp(8.f) + m55_m_m56 * afp(4.f); | |||
| v04 = m01_a_m02 + m03_a_m04 * afp(16.f) + m05_a_m06 * afp(2.f); | |||
| v14 = m11_a_m12 + m13_a_m14 * afp(16.f) + m15_a_m16 * afp(2.f); | |||
| v24 = m21_a_m22 + m23_a_m24 * afp(16.f) + m25_a_m26 * afp(2.f); | |||
| v34 = m31_a_m32 + m33_a_m34 * afp(16.f) + m35_a_m36 * afp(2.f); | |||
| v44 = m41_a_m42 + m43_a_m44 * afp(16.f) + m45_a_m46 * afp(2.f); | |||
| v54 = m51_a_m52 + m53_a_m54 * afp(16.f) + m55_a_m56 * afp(2.f); | |||
| v05 = m07 + m01_m_m02 + m03_m_m04 * afp(32.f) + m05_m_m06; | |||
| v15 = m17 + m11_m_m12 + m13_m_m14 * afp(32.f) + m15_m_m16; | |||
| v25 = m27 + m21_m_m22 + m23_m_m24 * afp(32.f) + m25_m_m26; | |||
| v35 = m37 + m31_m_m32 + m33_m_m34 * afp(32.f) + m35_m_m36; | |||
| v45 = m47 + m41_m_m42 + m43_m_m44 * afp(32.f) + m45_m_m46; | |||
| v55 = m57 + m51_m_m52 + m53_m_m54 * afp(32.f) + m55_m_m56; | |||
| } | |||
| if (activation_type == 1) | |||
| { | |||
| v00 = max(v00, afp(0.f)); | |||
| v10 = max(v10, afp(0.f)); | |||
| v20 = max(v20, afp(0.f)); | |||
| v30 = max(v30, afp(0.f)); | |||
| v40 = max(v40, afp(0.f)); | |||
| v50 = max(v50, afp(0.f)); | |||
| v01 = max(v01, afp(0.f)); | |||
| v11 = max(v11, afp(0.f)); | |||
| v21 = max(v21, afp(0.f)); | |||
| v31 = max(v31, afp(0.f)); | |||
| v41 = max(v41, afp(0.f)); | |||
| v51 = max(v51, afp(0.f)); | |||
| v02 = max(v02, afp(0.f)); | |||
| v12 = max(v12, afp(0.f)); | |||
| v22 = max(v22, afp(0.f)); | |||
| v32 = max(v32, afp(0.f)); | |||
| v42 = max(v42, afp(0.f)); | |||
| v52 = max(v52, afp(0.f)); | |||
| v03 = max(v03, afp(0.f)); | |||
| v13 = max(v13, afp(0.f)); | |||
| v23 = max(v23, afp(0.f)); | |||
| v33 = max(v33, afp(0.f)); | |||
| v43 = max(v43, afp(0.f)); | |||
| v53 = max(v53, afp(0.f)); | |||
| v04 = max(v04, afp(0.f)); | |||
| v14 = max(v14, afp(0.f)); | |||
| v24 = max(v24, afp(0.f)); | |||
| v34 = max(v34, afp(0.f)); | |||
| v44 = max(v44, afp(0.f)); | |||
| v54 = max(v54, afp(0.f)); | |||
| v05 = max(v05, afp(0.f)); | |||
| v15 = max(v15, afp(0.f)); | |||
| v25 = max(v25, afp(0.f)); | |||
| v35 = max(v35, afp(0.f)); | |||
| v45 = max(v45, afp(0.f)); | |||
| v55 = max(v55, afp(0.f)); | |||
| } | |||
| if (activation_type == 2) | |||
| { | |||
| const afp slope = afp(activation_param_0); | |||
| v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f))); | |||
| v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f))); | |||
| v20 = mix(v20, v20 * afp(slope), lessThan(v20, afpvec4(0.f))); | |||
| v30 = mix(v30, v30 * afp(slope), lessThan(v30, afpvec4(0.f))); | |||
| v40 = mix(v40, v40 * afp(slope), lessThan(v40, afpvec4(0.f))); | |||
| v50 = mix(v50, v50 * afp(slope), lessThan(v50, afpvec4(0.f))); | |||
| v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f))); | |||
| v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f))); | |||
| v21 = mix(v21, v21 * afp(slope), lessThan(v21, afpvec4(0.f))); | |||
| v31 = mix(v31, v31 * afp(slope), lessThan(v31, afpvec4(0.f))); | |||
| v41 = mix(v41, v41 * afp(slope), lessThan(v41, afpvec4(0.f))); | |||
| v51 = mix(v51, v51 * afp(slope), lessThan(v51, afpvec4(0.f))); | |||
| v02 = mix(v02, v02 * afp(slope), lessThan(v02, afpvec4(0.f))); | |||
| v12 = mix(v12, v12 * afp(slope), lessThan(v12, afpvec4(0.f))); | |||
| v22 = mix(v22, v22 * afp(slope), lessThan(v22, afpvec4(0.f))); | |||
| v32 = mix(v32, v32 * afp(slope), lessThan(v32, afpvec4(0.f))); | |||
| v42 = mix(v42, v42 * afp(slope), lessThan(v42, afpvec4(0.f))); | |||
| v52 = mix(v52, v52 * afp(slope), lessThan(v52, afpvec4(0.f))); | |||
| v03 = mix(v03, v03 * afp(slope), lessThan(v03, afpvec4(0.f))); | |||
| v13 = mix(v13, v13 * afp(slope), lessThan(v13, afpvec4(0.f))); | |||
| v23 = mix(v23, v23 * afp(slope), lessThan(v23, afpvec4(0.f))); | |||
| v33 = mix(v33, v33 * afp(slope), lessThan(v33, afpvec4(0.f))); | |||
| v43 = mix(v43, v43 * afp(slope), lessThan(v43, afpvec4(0.f))); | |||
| v53 = mix(v53, v53 * afp(slope), lessThan(v53, afpvec4(0.f))); | |||
| v04 = mix(v04, v04 * afp(slope), lessThan(v04, afpvec4(0.f))); | |||
| v14 = mix(v14, v14 * afp(slope), lessThan(v14, afpvec4(0.f))); | |||
| v24 = mix(v24, v24 * afp(slope), lessThan(v24, afpvec4(0.f))); | |||
| v34 = mix(v34, v34 * afp(slope), lessThan(v34, afpvec4(0.f))); | |||
| v44 = mix(v44, v44 * afp(slope), lessThan(v44, afpvec4(0.f))); | |||
| v54 = mix(v54, v54 * afp(slope), lessThan(v54, afpvec4(0.f))); | |||
| v05 = mix(v05, v05 * afp(slope), lessThan(v05, afpvec4(0.f))); | |||
| v15 = mix(v15, v15 * afp(slope), lessThan(v15, afpvec4(0.f))); | |||
| v25 = mix(v25, v25 * afp(slope), lessThan(v25, afpvec4(0.f))); | |||
| v35 = mix(v35, v35 * afp(slope), lessThan(v35, afpvec4(0.f))); | |||
| v45 = mix(v45, v45 * afp(slope), lessThan(v45, afpvec4(0.f))); | |||
| v55 = mix(v55, v55 * afp(slope), lessThan(v55, afpvec4(0.f))); | |||
| } | |||
| if (activation_type == 3) | |||
| { | |||
| const afp const_min = afp(activation_param_0); | |||
| const afp const_max = afp(activation_param_1); | |||
| v00 = clamp(v00, const_min, const_max); | |||
| v10 = clamp(v10, const_min, const_max); | |||
| v20 = clamp(v20, const_min, const_max); | |||
| v30 = clamp(v30, const_min, const_max); | |||
| v40 = clamp(v40, const_min, const_max); | |||
| v50 = clamp(v50, const_min, const_max); | |||
| v01 = clamp(v01, const_min, const_max); | |||
| v11 = clamp(v11, const_min, const_max); | |||
| v21 = clamp(v21, const_min, const_max); | |||
| v31 = clamp(v31, const_min, const_max); | |||
| v41 = clamp(v41, const_min, const_max); | |||
| v51 = clamp(v51, const_min, const_max); | |||
| v02 = clamp(v02, const_min, const_max); | |||
| v12 = clamp(v12, const_min, const_max); | |||
| v22 = clamp(v22, const_min, const_max); | |||
| v32 = clamp(v32, const_min, const_max); | |||
| v42 = clamp(v42, const_min, const_max); | |||
| v52 = clamp(v52, const_min, const_max); | |||
| v03 = clamp(v03, const_min, const_max); | |||
| v13 = clamp(v13, const_min, const_max); | |||
| v23 = clamp(v23, const_min, const_max); | |||
| v33 = clamp(v33, const_min, const_max); | |||
| v43 = clamp(v43, const_min, const_max); | |||
| v53 = clamp(v53, const_min, const_max); | |||
| v04 = clamp(v04, const_min, const_max); | |||
| v14 = clamp(v14, const_min, const_max); | |||
| v24 = clamp(v24, const_min, const_max); | |||
| v34 = clamp(v34, const_min, const_max); | |||
| v44 = clamp(v44, const_min, const_max); | |||
| v54 = clamp(v54, const_min, const_max); | |||
| v05 = clamp(v05, const_min, const_max); | |||
| v15 = clamp(v15, const_min, const_max); | |||
| v25 = clamp(v25, const_min, const_max); | |||
| v35 = clamp(v35, const_min, const_max); | |||
| v45 = clamp(v45, const_min, const_max); | |||
| v55 = clamp(v55, const_min, const_max); | |||
| } | |||
| if (activation_type == 4) | |||
| { | |||
| v00 = afp(1.f) / (afp(1.f) + exp(-v00)); | |||
| v10 = afp(1.f) / (afp(1.f) + exp(-v10)); | |||
| v20 = afp(1.f) / (afp(1.f) + exp(-v20)); | |||
| v30 = afp(1.f) / (afp(1.f) + exp(-v30)); | |||
| v40 = afp(1.f) / (afp(1.f) + exp(-v40)); | |||
| v50 = afp(1.f) / (afp(1.f) + exp(-v50)); | |||
| v01 = afp(1.f) / (afp(1.f) + exp(-v01)); | |||
| v11 = afp(1.f) / (afp(1.f) + exp(-v11)); | |||
| v21 = afp(1.f) / (afp(1.f) + exp(-v21)); | |||
| v31 = afp(1.f) / (afp(1.f) + exp(-v31)); | |||
| v41 = afp(1.f) / (afp(1.f) + exp(-v41)); | |||
| v51 = afp(1.f) / (afp(1.f) + exp(-v51)); | |||
| v02 = afp(1.f) / (afp(1.f) + exp(-v02)); | |||
| v12 = afp(1.f) / (afp(1.f) + exp(-v12)); | |||
| v22 = afp(1.f) / (afp(1.f) + exp(-v22)); | |||
| v32 = afp(1.f) / (afp(1.f) + exp(-v32)); | |||
| v42 = afp(1.f) / (afp(1.f) + exp(-v42)); | |||
| v52 = afp(1.f) / (afp(1.f) + exp(-v52)); | |||
| v03 = afp(1.f) / (afp(1.f) + exp(-v03)); | |||
| v13 = afp(1.f) / (afp(1.f) + exp(-v13)); | |||
| v23 = afp(1.f) / (afp(1.f) + exp(-v23)); | |||
| v33 = afp(1.f) / (afp(1.f) + exp(-v33)); | |||
| v43 = afp(1.f) / (afp(1.f) + exp(-v43)); | |||
| v53 = afp(1.f) / (afp(1.f) + exp(-v53)); | |||
| v04 = afp(1.f) / (afp(1.f) + exp(-v04)); | |||
| v14 = afp(1.f) / (afp(1.f) + exp(-v14)); | |||
| v24 = afp(1.f) / (afp(1.f) + exp(-v24)); | |||
| v34 = afp(1.f) / (afp(1.f) + exp(-v34)); | |||
| v44 = afp(1.f) / (afp(1.f) + exp(-v44)); | |||
| v54 = afp(1.f) / (afp(1.f) + exp(-v54)); | |||
| v05 = afp(1.f) / (afp(1.f) + exp(-v05)); | |||
| v15 = afp(1.f) / (afp(1.f) + exp(-v15)); | |||
| v25 = afp(1.f) / (afp(1.f) + exp(-v25)); | |||
| v35 = afp(1.f) / (afp(1.f) + exp(-v35)); | |||
| v45 = afp(1.f) / (afp(1.f) + exp(-v45)); | |||
| v55 = afp(1.f) / (afp(1.f) + exp(-v55)); | |||
| } | |||
| // store 6x6 | |||
| int v_offset_0 = gz * p.outcstep + gy * 6 * p.outw + gx * 6; | |||
| int v_offset_1 = v_offset_0 + p.outw; | |||
| int v_offset_2 = v_offset_0 + p.outw * 2; | |||
| int v_offset_3 = v_offset_0 + p.outw * 3; | |||
| int v_offset_4 = v_offset_0 + p.outw * 4; | |||
| int v_offset_5 = v_offset_0 + p.outw * 5; | |||
| top_blob_data[v_offset_0 + 0] = afp2sfpvec4(v00); | |||
| top_blob_data[v_offset_0 + 1] = afp2sfpvec4(v01); | |||
| top_blob_data[v_offset_0 + 2] = afp2sfpvec4(v02); | |||
| top_blob_data[v_offset_0 + 3] = afp2sfpvec4(v03); | |||
| top_blob_data[v_offset_0 + 4] = afp2sfpvec4(v04); | |||
| top_blob_data[v_offset_0 + 5] = afp2sfpvec4(v05); | |||
| top_blob_data[v_offset_1 + 0] = afp2sfpvec4(v10); | |||
| top_blob_data[v_offset_1 + 1] = afp2sfpvec4(v11); | |||
| top_blob_data[v_offset_1 + 2] = afp2sfpvec4(v12); | |||
| top_blob_data[v_offset_1 + 3] = afp2sfpvec4(v13); | |||
| top_blob_data[v_offset_1 + 4] = afp2sfpvec4(v14); | |||
| top_blob_data[v_offset_1 + 5] = afp2sfpvec4(v15); | |||
| top_blob_data[v_offset_2 + 0] = afp2sfpvec4(v20); | |||
| top_blob_data[v_offset_2 + 1] = afp2sfpvec4(v21); | |||
| top_blob_data[v_offset_2 + 2] = afp2sfpvec4(v22); | |||
| top_blob_data[v_offset_2 + 3] = afp2sfpvec4(v23); | |||
| top_blob_data[v_offset_2 + 4] = afp2sfpvec4(v24); | |||
| top_blob_data[v_offset_2 + 5] = afp2sfpvec4(v25); | |||
| top_blob_data[v_offset_3 + 0] = afp2sfpvec4(v30); | |||
| top_blob_data[v_offset_3 + 1] = afp2sfpvec4(v31); | |||
| top_blob_data[v_offset_3 + 2] = afp2sfpvec4(v32); | |||
| top_blob_data[v_offset_3 + 3] = afp2sfpvec4(v33); | |||
| top_blob_data[v_offset_3 + 4] = afp2sfpvec4(v34); | |||
| top_blob_data[v_offset_3 + 5] = afp2sfpvec4(v35); | |||
| top_blob_data[v_offset_4 + 0] = afp2sfpvec4(v40); | |||
| top_blob_data[v_offset_4 + 1] = afp2sfpvec4(v41); | |||
| top_blob_data[v_offset_4 + 2] = afp2sfpvec4(v42); | |||
| top_blob_data[v_offset_4 + 3] = afp2sfpvec4(v43); | |||
| top_blob_data[v_offset_4 + 4] = afp2sfpvec4(v44); | |||
| top_blob_data[v_offset_4 + 5] = afp2sfpvec4(v45); | |||
| top_blob_data[v_offset_5 + 0] = afp2sfpvec4(v50); | |||
| top_blob_data[v_offset_5 + 1] = afp2sfpvec4(v51); | |||
| top_blob_data[v_offset_5 + 2] = afp2sfpvec4(v52); | |||
| top_blob_data[v_offset_5 + 3] = afp2sfpvec4(v53); | |||
| top_blob_data[v_offset_5 + 4] = afp2sfpvec4(v54); | |||
| top_blob_data[v_offset_5 + 5] = afp2sfpvec4(v55); | |||
| } | |||