| @@ -172,52 +172,6 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) | |||
| padding->create_pipeline(opt); | |||
| } | |||
| std::vector<vk_specialization_type> specializations(10 + 10); | |||
| specializations[0].i = kernel_w; | |||
| specializations[1].i = kernel_h; | |||
| specializations[2].i = dilation_w; | |||
| specializations[3].i = dilation_h; | |||
| specializations[4].i = stride_w; | |||
| specializations[5].i = stride_h; | |||
| specializations[6].i = bias_term; | |||
| specializations[7].i = activation_type; | |||
| specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[10 + 0].i = shape_bordered_packed.dims; | |||
| specializations[10 + 1].i = shape_bordered_packed.w; | |||
| specializations[10 + 2].i = shape_bordered_packed.h; | |||
| specializations[10 + 3].i = shape_bordered_packed.c; | |||
| specializations[10 + 4].i = shape_bordered_packed.cstep; | |||
| specializations[10 + 5].i = out_shape_packed.dims; | |||
| specializations[10 + 6].i = out_shape_packed.w; | |||
| specializations[10 + 7].i = out_shape_packed.h; | |||
| specializations[10 + 8].i = out_shape_packed.c; | |||
| specializations[10 + 9].i = out_shape_packed.cstep; | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| int shader_type_index = -1; | |||
| if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1; | |||
| if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1; | |||
| if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; | |||
| pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); | |||
| if (opt.use_shader_local_memory) | |||
| { | |||
| pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack)); | |||
| } | |||
| pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations); | |||
| } | |||
| if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) | |||
| { | |||
| // winograd43 | |||
| @@ -477,59 +431,117 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) | |||
| opt.use_image_storage = false; | |||
| } | |||
| std::vector<vk_specialization_type> specializations(10 + 8); | |||
| specializations[0].i = kernel_w; | |||
| specializations[1].i = kernel_h; | |||
| specializations[2].i = dilation_w; | |||
| specializations[3].i = dilation_h; | |||
| specializations[4].i = stride_w; | |||
| specializations[5].i = stride_h; | |||
| specializations[6].i = bias_term; | |||
| specializations[7].i = activation_type; | |||
| specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[10 + 0].i = shape_bordered_packed.w; | |||
| specializations[10 + 1].i = shape_bordered_packed.h; | |||
| specializations[10 + 2].i = shape_bordered_packed.c; | |||
| specializations[10 + 3].i = shape_bordered_packed.cstep; | |||
| specializations[10 + 4].i = out_shape_packed.w; | |||
| specializations[10 + 5].i = out_shape_packed.h; | |||
| specializations[10 + 6].i = out_shape_packed.c; | |||
| specializations[10 + 7].i = out_shape_packed.cstep; | |||
| Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| std::vector<vk_specialization_type> specializations(10 + 8); | |||
| specializations[0].i = kernel_w; | |||
| specializations[1].i = kernel_h; | |||
| specializations[2].i = dilation_w; | |||
| specializations[3].i = dilation_h; | |||
| specializations[4].i = stride_w; | |||
| specializations[5].i = stride_h; | |||
| specializations[6].i = bias_term; | |||
| specializations[7].i = activation_type; | |||
| specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[10 + 0].i = shape_bordered_packed.w; | |||
| specializations[10 + 1].i = shape_bordered_packed.h; | |||
| specializations[10 + 2].i = shape_bordered_packed.c; | |||
| specializations[10 + 3].i = shape_bordered_packed.cstep; | |||
| specializations[10 + 4].i = out_shape_packed.w; | |||
| specializations[10 + 5].i = out_shape_packed.h; | |||
| specializations[10 + 6].i = out_shape_packed.c; | |||
| specializations[10 + 7].i = out_shape_packed.cstep; | |||
| Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h); | |||
| local_size_xyz.h = std::min(4, out_shape_packed.c); | |||
| } | |||
| local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h); | |||
| local_size_xyz.h = std::min(4, out_shape_packed.c); | |||
| } | |||
| int shader_type_index = -1; | |||
| if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm; | |||
| if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm; | |||
| if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm; | |||
| if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm; | |||
| if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm; | |||
| if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm; | |||
| if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm; | |||
| if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; | |||
| if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; | |||
| pipeline_convolution_gemm = new Pipeline(vkdev); | |||
| if (opt.use_shader_local_memory) | |||
| { | |||
| pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); | |||
| } | |||
| pipeline_convolution_gemm->create(shader_type_index, opt, specializations); | |||
| int shader_type_index = -1; | |||
| if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm; | |||
| if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm; | |||
| if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm; | |||
| if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm; | |||
| if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm; | |||
| if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm; | |||
| if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm; | |||
| if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; | |||
| if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; | |||
| pipeline_convolution_gemm = new Pipeline(vkdev); | |||
| if (opt.use_shader_local_memory) | |||
| { | |||
| pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); | |||
| } | |||
| pipeline_convolution_gemm->create(shader_type_index, opt, specializations); | |||
| } | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| std::vector<vk_specialization_type> specializations(4 + 8); | |||
| specializations[0].i = bias_term; | |||
| specializations[1].i = activation_type; | |||
| specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[4 + 0].i = shape_bordered_packed.w; | |||
| specializations[4 + 1].i = shape_bordered_packed.h; | |||
| specializations[4 + 2].i = shape_bordered_packed.c; | |||
| specializations[4 + 3].i = shape_bordered_packed.cstep; | |||
| specializations[4 + 4].i = out_shape_packed.w; | |||
| specializations[4 + 5].i = out_shape_packed.h; | |||
| specializations[4 + 6].i = out_shape_packed.c; | |||
| specializations[4 + 7].i = out_shape_packed.cstep; | |||
| int shader_type_index = -1; | |||
| if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1; | |||
| if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1; | |||
| if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1; | |||
| if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; | |||
| if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; | |||
| pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); | |||
| if (opt.use_shader_local_memory) | |||
| { | |||
| pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1); | |||
| } | |||
| else | |||
| { | |||
| pipeline_convolution_1x1s1d1->set_local_size_xyz(8, std::min(8, num_output / out_elempack), 1); | |||
| } | |||
| pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations); | |||
| } | |||
| else | |||
| { | |||
| std::vector<vk_specialization_type> specializations(10 + 10); | |||
| specializations[0].i = kernel_w; | |||
| specializations[1].i = kernel_h; | |||
| specializations[2].i = dilation_w; | |||
| specializations[3].i = dilation_h; | |||
| specializations[4].i = stride_w; | |||
| specializations[5].i = stride_h; | |||
| specializations[6].i = bias_term; | |||
| specializations[7].i = activation_type; | |||
| specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; | |||
| specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; | |||
| specializations[10 + 0].i = shape_bordered_packed.dims; | |||
| specializations[10 + 1].i = shape_bordered_packed.w; | |||
| specializations[10 + 2].i = shape_bordered_packed.h; | |||
| specializations[10 + 3].i = shape_bordered_packed.c; | |||
| specializations[10 + 4].i = shape_bordered_packed.cstep; | |||
| specializations[10 + 5].i = out_shape_packed.dims; | |||
| specializations[10 + 6].i = out_shape_packed.w; | |||
| specializations[10 + 7].i = out_shape_packed.h; | |||
| specializations[10 + 8].i = out_shape_packed.c; | |||
| specializations[10 + 9].i = out_shape_packed.cstep; | |||
| Mat local_size_xyz(8, 8, std::min(4, (num_output / out_elempack + 1) / 2), (void*)0); | |||
| if (out_shape_packed.dims != 0) | |||
| { | |||
| @@ -1194,34 +1206,63 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) | |||
| { | |||
| // gemm | |||
| { | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu; | |||
| bindings[3] = bias_data_gpu; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu; | |||
| bindings[3] = bias_data_gpu; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu; | |||
| bindings[3] = bias_data_gpu; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = top_blob.cstep; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| @@ -1248,25 +1289,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = top_blob.cstep; | |||
| // record | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = top_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); | |||
| } | |||
| else | |||
| { | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = (top_blob.c + 1) / 2; | |||
| VkMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = (top_blob.c + 1) / 2; | |||
| cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); | |||
| } | |||
| cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| @@ -1567,34 +1595,63 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b | |||
| if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) | |||
| { | |||
| // gemm | |||
| { | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = 0; // bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = 0; // top_blob.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); | |||
| } | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = 0; // bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = 0; // top_blob.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| std::vector<VkImageMat> bindings(4); | |||
| bindings[0] = bottom_blob_bordered; | |||
| bindings[1] = top_blob; | |||
| bindings[2] = weight_data_gpu_image; | |||
| bindings[3] = bias_data_gpu_image; | |||
| std::vector<vk_constant_type> constants(8); | |||
| constants[0].i = bottom_blob_bordered.w; | |||
| constants[1].i = bottom_blob_bordered.h; | |||
| constants[2].i = bottom_blob_bordered.c; | |||
| constants[3].i = 0; // bottom_blob_bordered.cstep; | |||
| constants[4].i = top_blob.w; | |||
| constants[5].i = top_blob.h; | |||
| constants[6].i = top_blob.c; | |||
| constants[7].i = 0; // top_blob.cstep; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = top_blob.c; | |||
| dispatcher.c = 1; | |||
| cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| @@ -1621,25 +1678,12 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b | |||
| constants[8].i = top_blob.c; | |||
| constants[9].i = 0; //top_blob.cstep; | |||
| // record | |||
| if (is_conv1x1s1d1) | |||
| { | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; | |||
| dispatcher.h = 1; | |||
| dispatcher.c = top_blob.c; | |||
| cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); | |||
| } | |||
| else | |||
| { | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = (top_blob.c + 1) / 2; | |||
| VkImageMat dispatcher; | |||
| dispatcher.w = (top_blob.w + 1) / 2; | |||
| dispatcher.h = (top_blob.h + 1) / 2; | |||
| dispatcher.c = (top_blob.c + 1) / 2; | |||
| cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); | |||
| } | |||
| cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); | |||
| return 0; | |||
| } | |||
| @@ -21,29 +21,21 @@ | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -64,13 +56,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -82,16 +72,14 @@ void main() | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx * 4 >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -100,9 +88,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| sum = afpvec4(image3d_ld1(bias_blob, ivec3(gz, 0, 0))); | |||
| sum = afpvec4(image3d_ld1(bias_blob, ivec3(gy, 0, 0))); | |||
| #else | |||
| sum = afpvec4(buffer_ld1(bias_data, gz)); | |||
| sum = afpvec4(buffer_ld1(bias_data, gy)); | |||
| #endif | |||
| } | |||
| else | |||
| @@ -118,7 +106,7 @@ void main() | |||
| for (int z = 0; z < psc(c); z++) | |||
| { | |||
| afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); | |||
| afp k = image3d_ld1(weight_blob, ivec3(0, z, gy)); | |||
| sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z)); | |||
| sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z)); | |||
| @@ -126,7 +114,7 @@ void main() | |||
| sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c); | |||
| int w_offset = gy * psc(c); | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -174,12 +162,12 @@ void main() | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a); | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum.g); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum.b); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum.a); | |||
| #else | |||
| const int gi = gz * psc(outcstep) / 4 + gx; | |||
| const int gi = gy * psc(outcstep) / 4 + gx; | |||
| #if NCNN_fp16_packed | |||
| top_blob_data[gi] = sum; | |||
| @@ -23,29 +23,21 @@ | |||
| #define LOCAL_MEMORY_UNROLL_INCH 8 | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -83,14 +73,13 @@ void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| #if !NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| #endif | |||
| @@ -103,9 +92,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec4 b = buffer_ld4(bias_data, gz); | |||
| afpvec4 b = buffer_ld4(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -133,7 +122,7 @@ void main() | |||
| afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy)); | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| @@ -141,21 +130,21 @@ void main() | |||
| sum3 += v3 * k; | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c); | |||
| int w_offset = gy * psc(c); | |||
| int v_offset = gx; | |||
| #if NCNN_shader_local_memory | |||
| const int lx = int(gl_LocalInvocationID.x); | |||
| const int lz = int(gl_LocalInvocationID.z); | |||
| const int ly = int(gl_LocalInvocationID.y); | |||
| int z = 0; | |||
| for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) | |||
| { | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -163,7 +152,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| } | |||
| } | |||
| @@ -176,7 +165,7 @@ void main() | |||
| afp v2 = lfp2afp(tmp_v[lx][z4][2]); | |||
| afp v3 = lfp2afp(tmp_v[lx][z4][3]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| @@ -194,11 +183,11 @@ void main() | |||
| { | |||
| const int remain = psc(c) - z; | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -206,7 +195,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| } | |||
| } | |||
| @@ -219,7 +208,7 @@ void main() | |||
| afp v2 = lfp2afp(tmp_v[lx][z4][2]); | |||
| afp v3 = lfp2afp(tmp_v[lx][z4][3]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); | |||
| sum0 += v0 * k; | |||
| sum1 += v1 * k; | |||
| @@ -297,17 +286,17 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| #if NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st4(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); | |||
| @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -99,9 +84,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec8 b = buffer_ld8(bias_data, gz); | |||
| afpvec8 b = buffer_ld8(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -129,7 +114,7 @@ void main() | |||
| afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy)); | |||
| // sum += v * k; | |||
| sum0[0] += v0 * k[0]; | |||
| @@ -145,7 +130,7 @@ void main() | |||
| sum3[1] += v3 * k[1]; | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c); | |||
| int w_offset = gy * psc(c); | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -248,12 +233,12 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st8(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); | |||
| @@ -23,29 +23,21 @@ | |||
| #define LOCAL_MEMORY_UNROLL_INCH 8 | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -83,14 +73,13 @@ void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| #if !NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| #endif | |||
| @@ -103,9 +92,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec4 b = buffer_ld4(bias_data, gz); | |||
| afpvec4 b = buffer_ld4(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -134,10 +123,10 @@ void main() | |||
| afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpmat4 k = afpmat4( | |||
| image3d_ld4(weight_blob, ivec3(0, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(1, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(2, z, gz)), | |||
| image3d_ld4(weight_blob, ivec3(3, z, gz)) | |||
| image3d_ld4(weight_blob, ivec3(0, z, gy)), | |||
| image3d_ld4(weight_blob, ivec3(1, z, gy)), | |||
| image3d_ld4(weight_blob, ivec3(2, z, gy)), | |||
| image3d_ld4(weight_blob, ivec3(3, z, gy)) | |||
| ); | |||
| sum0 += v0 * k; | |||
| @@ -146,21 +135,21 @@ void main() | |||
| sum3 += v3 * k; | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c) * 4; | |||
| int w_offset = gy * psc(c) * 4; | |||
| int v_offset = gx; | |||
| #if NCNN_shader_local_memory | |||
| const int lx = int(gl_LocalInvocationID.x); | |||
| const int lz = int(gl_LocalInvocationID.z); | |||
| const int ly = int(gl_LocalInvocationID.y); | |||
| int z = 0; | |||
| for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) | |||
| { | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -168,7 +157,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); | |||
| tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); | |||
| } | |||
| } | |||
| @@ -181,10 +170,10 @@ void main() | |||
| afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); | |||
| afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); | |||
| afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); | |||
| afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); | |||
| afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); | |||
| afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); | |||
| afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); | |||
| afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); | |||
| afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); | |||
| afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); | |||
| afpmat4 k = afpmat4(k0, k1, k2, k3); | |||
| @@ -204,11 +193,11 @@ void main() | |||
| { | |||
| const int remain = psc(c) - z; | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -216,7 +205,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); | |||
| tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); | |||
| } | |||
| } | |||
| @@ -229,10 +218,10 @@ void main() | |||
| afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); | |||
| afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); | |||
| afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); | |||
| afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); | |||
| afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); | |||
| afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); | |||
| afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); | |||
| afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); | |||
| afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); | |||
| afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); | |||
| afpmat4 k = afpmat4(k0, k1, k2, k3); | |||
| @@ -317,17 +306,17 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| #if NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st4(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); | |||
| @@ -23,29 +23,21 @@ | |||
| #define LOCAL_MEMORY_UNROLL_INCH 8 | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -83,14 +73,13 @@ void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| #if !NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| #endif | |||
| @@ -103,9 +92,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); | |||
| afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afp b = buffer_ld1(bias_data, gz); | |||
| afp b = buffer_ld1(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -133,7 +122,7 @@ void main() | |||
| afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz)); | |||
| afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy)); | |||
| sum0 += dot(v0, k); | |||
| sum1 += dot(v1, k); | |||
| @@ -141,21 +130,21 @@ void main() | |||
| sum3 += dot(v3, k); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c); | |||
| int w_offset = gy * psc(c); | |||
| int v_offset = gx; | |||
| #if NCNN_shader_local_memory | |||
| const int lx = int(gl_LocalInvocationID.x); | |||
| const int lz = int(gl_LocalInvocationID.z); | |||
| const int ly = int(gl_LocalInvocationID.y); | |||
| int z = 0; | |||
| for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) | |||
| { | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -163,7 +152,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) | |||
| { | |||
| tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| } | |||
| } | |||
| @@ -176,7 +165,7 @@ void main() | |||
| afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); | |||
| afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); | |||
| sum0 += dot(v0, k); | |||
| sum1 += dot(v1, k); | |||
| @@ -194,11 +183,11 @@ void main() | |||
| { | |||
| const int remain = psc(c) - z; | |||
| if (lz < 4) | |||
| if (ly < 4) | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); | |||
| tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); | |||
| } | |||
| } | |||
| @@ -206,7 +195,7 @@ void main() | |||
| { | |||
| for (int z4 = 0; z4 < remain; z4++) | |||
| { | |||
| tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); | |||
| } | |||
| } | |||
| @@ -219,7 +208,7 @@ void main() | |||
| afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); | |||
| afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); | |||
| afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); | |||
| sum0 += dot(v0, k); | |||
| sum1 += dot(v1, k); | |||
| @@ -297,17 +286,17 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| #if NCNN_shader_local_memory | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st1(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1); | |||
| @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -99,9 +84,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec8 b = buffer_ld8(bias_data, gz); | |||
| afpvec8 b = buffer_ld8(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -129,14 +114,14 @@ void main() | |||
| afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gz)); | |||
| afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gz)); | |||
| afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gz)); | |||
| afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gz)); | |||
| afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gz)); | |||
| afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gz)); | |||
| afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gz)); | |||
| afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gz)); | |||
| afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gy)); | |||
| afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gy)); | |||
| afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gy)); | |||
| afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gy)); | |||
| afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gy)); | |||
| afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gy)); | |||
| afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gy)); | |||
| afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gy)); | |||
| // sum += v * k; | |||
| sum0[0].r += dot(v0, k0); | |||
| @@ -176,7 +161,7 @@ void main() | |||
| sum3[1].a += dot(v3, k7); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c) * 8; | |||
| int w_offset = gy * psc(c) * 8; | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -310,12 +295,12 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st8(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); | |||
| @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
// Specialization constants for this shader, declared with GLSL
// `layout (constant_id = N)`. The literal after `=` is the default value.
//
// NOTE(review): two alternative layouts are interleaved in this run. They reuse
// the same constant_ids and identifiers (bias_term appears at id 6 and at id 0,
// shape_constant_id_offset is defined as 10 and as 4), so they cannot both be
// part of one shader. This looks like the before/after sides of a diff shown
// without +/- markers -- confirm which layout is current before relying on it.
//
// Layout A, ids 0-9: kernel size, dilation, stride, then bias/activation settings.
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
// Layout B, ids 0-3: only the bias/activation settings; no kernel, dilation or stride ids.
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
// Layout A shape constants start at id 10: five un-prefixed fields (dims, w, h, c, cstep).
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
// Layout B shape constants start at id 4: four un-prefixed fields (no dims).
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
// Layout A out-prefixed fields at offset + 5..9 (ids 15-19 with offset 10).
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
// Layout B out-prefixed fields at offset + 4..7 (ids 8-11 with offset 4; no outdims).
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -99,9 +84,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec8 b = buffer_ld8(bias_data, gz); | |||
| afpvec8 b = buffer_ld8(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -129,14 +114,14 @@ void main() | |||
| afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); | |||
| afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); | |||
| afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); | |||
| afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); | |||
| afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy)); | |||
| afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gy)); | |||
| afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gy)); | |||
| afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gy)); | |||
| afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gy)); | |||
| // sum += v * k | |||
| sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); | |||
| @@ -176,7 +161,7 @@ void main() | |||
| sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c) * 8; | |||
| int w_offset = gy * psc(c) * 8; | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -310,12 +295,12 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st8(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); | |||
| @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
// Specialization constants for this shader, declared with GLSL
// `layout (constant_id = N)`. The literal after `=` is the default value.
//
// NOTE(review): two alternative layouts follow one another here and reuse the
// same constant_ids and identifiers, so they cannot both belong to one shader.
// This looks like the old and new sides of a diff shown without +/- markers --
// confirm which layout is current before relying on it.
//
// First layout: ids 0-9 hold kernel size, dilation, stride, then bias/activation
// settings; shape constants start at id 10 (five un-prefixed fields followed by
// five out-prefixed fields, ids 10-19).
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
// Second layout: ids 0-3 hold only the bias/activation settings; shape constants
// start at id 4 with no dims/outdims entries (w, h, c, cstep at ids 4-7 and
// outw, outh, outc, outcstep at ids 8-11).
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -99,9 +84,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); | |||
| afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afp b = buffer_ld1(bias_data, gz); | |||
| afp b = buffer_ld1(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -129,7 +114,7 @@ void main() | |||
| afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz)); | |||
| afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy)); | |||
| // sum += dot(v, k); | |||
| sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); | |||
| @@ -138,7 +123,7 @@ void main() | |||
| sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c); | |||
| int w_offset = gy * psc(c); | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -210,12 +195,12 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st1(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1); | |||
| @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
// Specialization constants for this shader, declared with GLSL
// `layout (constant_id = N)`. The literal after `=` is the default value.
//
// NOTE(review): the run below contains two alternative layouts back to back.
// Both start at constant_id 0 and redeclare the same identifiers, which a single
// shader could not do; presumably these are the removed and added sides of a
// diff with the +/- markers lost -- verify which one is in effect.
//
// 20-constant layout: ids 0-5 kernel/dilation/stride, id 6 bias_term,
// id 7 activation_type, ids 8-9 activation params, then ten shape constants
// from id 10 (dims, w, h, c, cstep, outdims, outw, outh, outc, outcstep).
| layout (constant_id = 0) const int kernel_w = 1; | |||
| layout (constant_id = 1) const int kernel_h = 1; | |||
| layout (constant_id = 2) const int dilation_w = 1; | |||
| layout (constant_id = 3) const int dilation_h = 1; | |||
| layout (constant_id = 4) const int stride_w = 1; | |||
| layout (constant_id = 5) const int stride_h = 1; | |||
| layout (constant_id = 6) const int bias_term = 0; | |||
| layout (constant_id = 7) const int activation_type = 0; | |||
| layout (constant_id = 8) const float activation_param_0 = 0; | |||
| layout (constant_id = 9) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 10 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; | |||
// 12-constant layout: id 0 bias_term, id 1 activation_type, ids 2-3 activation
// params, then eight shape constants from id 4 (w, h, c, cstep, outw, outh,
// outc, outcstep); the kernel/dilation/stride and dims/outdims entries are absent.
| layout (constant_id = 0) const int bias_term = 0; | |||
| layout (constant_id = 1) const int activation_type = 0; | |||
| layout (constant_id = 2) const float activation_param_0 = 0; | |||
| layout (constant_id = 3) const float activation_param_1 = 0; | |||
| #define shape_constant_id_offset 4 | |||
| layout (constant_id = shape_constant_id_offset + 0) const int w = 0; | |||
| layout (constant_id = shape_constant_id_offset + 1) const int h = 0; | |||
| layout (constant_id = shape_constant_id_offset + 2) const int c = 0; | |||
| layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; | |||
| layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; | |||
| layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; | |||
| layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; | |||
| layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; | |||
| #if NCNN_image_shader | |||
| layout (binding = 0) uniform unfp sampler3D bottom_blob; | |||
| @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter | |||
| void main() | |||
| { | |||
| #if NCNN_image_shader | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) | |||
| #if NCNN_image_shader | |||
| if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) | |||
| return; | |||
| #else | |||
| int gx = int(gl_GlobalInvocationID.x) * 4; | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) | |||
| if (gx >= psc(outcstep) || gy >= psc(outc)) | |||
| return; | |||
| #endif | |||
| @@ -99,9 +84,9 @@ void main() | |||
| if (bias_term == 1) | |||
| { | |||
| #if NCNN_image_shader | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); | |||
| afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); | |||
| #else | |||
| afpvec4 b = buffer_ld4(bias_data, gz); | |||
| afpvec4 b = buffer_ld4(bias_data, gy); | |||
| #endif | |||
| sum0 = b; | |||
| sum1 = b; | |||
| @@ -129,10 +114,10 @@ void main() | |||
| afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); | |||
| afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); | |||
| afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy)); | |||
| afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy)); | |||
| afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy)); | |||
| afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy)); | |||
| // sum += v * k; | |||
| sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); | |||
| @@ -156,7 +141,7 @@ void main() | |||
| sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); | |||
| } | |||
| #else | |||
| int w_offset = gz * psc(c) * 4; | |||
| int w_offset = gy * psc(c) * 4; | |||
| int v_offset = gx; | |||
| for (int z = 0; z < psc(c); z++) | |||
| @@ -246,12 +231,12 @@ void main() | |||
| } | |||
| #if NCNN_image_shader | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); | |||
| image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); | |||
| image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); | |||
| image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); | |||
| image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); | |||
| #else | |||
| int gi = gz * psc(outcstep) + gx; | |||
| int gi = gy * psc(outcstep) + gx; | |||
| buffer_st4(top_blob_data, gi + 0, sum0); | |||
| if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); | |||