GitOrigin-RevId: 932e7689e8
tags/v1.8.2
@@ -17,6 +17,8 @@ genrule(
         CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type tensorop884 $(@D)
         CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type simt $(@D)
         CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D)
+        CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D)
+        CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D)
     """,
     tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
     visibility = ["//visibility:public"],
@@ -317,7 +317,7 @@ class EmitDeconvInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Deconvolution =
+using Convolution =
     typename cutlass::conv::device::Deconvolution<
         ${element_src},
         ${layout_src},
@@ -415,6 +415,103 @@ using Deconvolution =
         return SubstituteTemplate(self.template, values)
 
 
+class EmitConvolutionBackwardFilterInstance:
+    def __init__(self):
+        self.template = """
+// kernel instance "${operation_name}" generated by cutlass generator
+using Convolution =
+    typename cutlass::conv::device::ConvolutionBackwardFilter<
+        ${element_src},
+        ${layout_src},
+        ${element_diff},
+        ${layout_diff},
+        ${element_grad},
+        ${layout_grad},
+        ${element_accumulator},
+        ${conv_type},
+        ${opcode_class},
+        ${arch},
+        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+        ${epilogue_functor}<
+            ${element_grad},
+            ${epilogue_vector_length},
+            ${element_accumulator},
+            ${element_epilogue}
+        >,
+        ${swizzling_functor},
+        ${stages},
+        ${alignment_src},
+        ${alignment_diff},
+        ${special_optimization},
+        ${math_operator},
+        ${implicit_gemm_mode}>;
+"""
+
+    def emit(self, operation):
+        warp_shape = [
+            int(
+                operation.tile_description.threadblock_shape[idx]
+                / operation.tile_description.warp_count[idx]
+            )
+            for idx in range(3)
+        ]
+        epilogue_vector_length = int(
+            min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128)
+            / DataTypeSize[operation.dst.element]
+        )
+        values = {
+            "operation_name": operation.procedural_name(),
+            "conv_type": ConvTypeTag[operation.conv_type],
+            "element_src": DataTypeTag[operation.src.element],
+            "layout_src": LayoutTag[operation.src.layout],
+            "element_diff": DataTypeTag[operation.flt.element],
+            "layout_diff": LayoutTag[operation.flt.layout],
+            "element_grad": DataTypeTag[operation.dst.element],
+            "layout_grad": LayoutTag[operation.dst.layout],
+            "element_accumulator": DataTypeTag[operation.accumulator_type()],
+            "opcode_class": OpcodeClassTag[
+                operation.tile_description.math_instruction.opcode_class
+            ],
+            "arch": "cutlass::arch::Sm%d" % operation.arch,
+            "threadblock_shape_m": str(operation.tile_description.threadblock_shape[0]),
+            "threadblock_shape_n": str(operation.tile_description.threadblock_shape[1]),
+            "threadblock_shape_k": str(operation.tile_description.threadblock_shape[2]),
+            "warp_shape_m": str(warp_shape[0]),
+            "warp_shape_n": str(warp_shape[1]),
+            "warp_shape_k": str(warp_shape[2]),
+            "instruction_shape_m": str(
+                operation.tile_description.math_instruction.instruction_shape[0]
+            ),
+            "instruction_shape_n": str(
+                operation.tile_description.math_instruction.instruction_shape[1]
+            ),
+            "instruction_shape_k": str(
+                operation.tile_description.math_instruction.instruction_shape[2]
+            ),
+            "epilogue_vector_length": str(epilogue_vector_length),
+            "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor],
+            "element_epilogue": str(DataTypeTag[operation.element_epilogue]),
+            "swizzling_functor": SwizzlingFunctorTag[operation.swizzling_functor],
+            "stages": str(operation.tile_description.stages),
+            "alignment_src": str(operation.src.alignment),
+            "alignment_diff": str(operation.flt.alignment),
+            "special_optimization": SpecialOptimizeDescTag[
+                operation.special_optimization
+            ],
+            "math_operator": MathOperationTag[
+                operation.tile_description.math_instruction.math_operation
+            ],
+            "implicit_gemm_mode": ImplicitGemmModeTag[operation.implicit_gemm_mode],
+        }
+        return SubstituteTemplate(self.template, values)
+
+
 ###################################################################################################
 #
 # Generator functions for all layouts
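The epilogue vector length computed in emit() follows the same rule as the other emitters: the epilogue vectorizes over at most 128 bits of the destination element type. A minimal standalone sketch of the rule (hypothetical helper names; DataTypeSize is in bits and dst alignment in elements, as in the generator):

    DATA_TYPE_SIZE = {"f16": 16, "f32": 32}  # bits per element

    def epilogue_vector_length(dst_alignment_elems, dst_dtype):
        bits = DATA_TYPE_SIZE[dst_dtype]
        return int(min(dst_alignment_elems * bits, 128) / bits)

    assert epilogue_vector_length(8, "f16") == 8  # eight halves fill 128 bits
    assert epilogue_vector_length(1, "f32") == 1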
@@ -500,6 +597,7 @@ def GenerateConv2d(
         epilogues = [
             EpilogueFunctor.BiasAddLinearCombination,
             EpilogueFunctor.BiasAddLinearCombinationRelu,
+            EpilogueFunctor.LinearCombination,
         ]
         if conv_type == ConvType.Convolution:
             epilogues.append(EpilogueFunctor.BiasAddLinearCombinationHSwish)
@@ -544,11 +642,15 @@ def GenerateConv2d(
     def filter_epilogue_with_conv_kind(
         epilogue: EpilogueFunctor, conv_kind: ConvKind
     ) -> bool:
-        return (
-            (conv_kind == ConvKind.Dgrad or conv_kind == ConvKind.Wgrad)
-            and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp
-            and epilogue != EpilogueFunctor.BiasAddLinearCombination
-        )
+        if conv_kind == ConvKind.Fprop:
+            return epilogue == EpilogueFunctor.LinearCombination
+        elif conv_kind == ConvKind.Dgrad:
+            return (
+                epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp
+                and epilogue != EpilogueFunctor.BiasAddLinearCombination
+            )
+        elif conv_kind == ConvKind.Wgrad:
+            return epilogue != EpilogueFunctor.LinearCombination
 
     # loop over all tile descriptions
     for tile in tile_descriptions:
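filter_epilogue_with_conv_kind returns True for epilogues to skip, so the rewrite leaves each kind with exactly one epilogue set: fprop drops the bias-free LinearCombination, dgrad keeps only the two bias-add variants, and wgrad keeps only LinearCombination (a filter gradient has no bias term). A self-contained sketch of the resulting behavior, with strings standing in for the EpilogueFunctor enum:

    def keep(epilogue, conv_kind):
        if conv_kind == "Fprop":
            return epilogue != "LinearCombination"
        if conv_kind == "Dgrad":
            return epilogue in ("BiasAddLinearCombinationClamp",
                                "BiasAddLinearCombination")
        return epilogue == "LinearCombination"  # Wgrad

    epilogues = ["BiasAddLinearCombination", "BiasAddLinearCombinationRelu",
                 "LinearCombination"]
    print([e for e in epilogues if keep(e, "Wgrad")])  # ['LinearCombination']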
@@ -557,7 +659,7 @@ def GenerateConv2d(
         bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type)
-        flt_align = get_flt_align(tile)
+        flt_align = flt_align if conv_kind == ConvKind.Wgrad else get_flt_align(tile)
         dst_align = get_dst_align(tile, dst_layout)
@@ -771,11 +873,14 @@ class EmitConvSingleKernelWrapper:
         if self.operation.conv_kind == ConvKind.Fprop:
             self.instance_emitter = EmitConv2dInstance()
-            self.convolution_name = "Convolution"
-        else:
-            assert self.operation.conv_kind == ConvKind.Dgrad
+            self.convolution_name = "ConvolutionOperation"
+        elif self.operation.conv_kind == ConvKind.Dgrad:
             self.instance_emitter = EmitDeconvInstance()
-            self.convolution_name = "Deconvolution"
+            self.convolution_name = "ConvolutionOperation"
+        else:
+            assert self.operation.conv_kind == ConvKind.Wgrad
+            self.instance_emitter = EmitConvolutionBackwardFilterInstance()
+            self.convolution_name = "ConvolutionBackwardFilterOperation"
 
         self.header_template = """
 #if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
@@ -800,7 +905,7 @@ namespace cutlass {
 namespace library {
 
 void initialize_${operation_name}(Manifest &manifest) {
-    manifest.append(new ConvolutionOperation<${convolution_name}>(
+    manifest.append(new ${convolution_name}<Convolution>(
         "${operation_name}"
     ));
 }
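With convolution_name now holding the operation wrapper class rather than the instance alias, the substituted manifest line for a wgrad kernel instantiates the new wrapper on the Convolution alias emitted above. A worked example of the substitution, using Python's string.Template as a stand-in for SubstituteTemplate:

    from string import Template

    line = Template("manifest.append(new ${convolution_name}<Convolution>(")
    print(line.substitute(convolution_name="ConvolutionBackwardFilterOperation"))
    # manifest.append(new ConvolutionBackwardFilterOperation<Convolution>(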
@@ -3,8 +3,9 @@ from generator import (
     GenerateGemvOperations,
     GenerateConv2dOperations,
     GenerateDeconvOperations,
     GenerateDwconv2dFpropOperations,
     GenerateDwconv2dDgradOperations,
+    GenerateDwconv2dWgradOperations,
 )
@@ -28,7 +29,7 @@ def write_op_list(f, gen_op, gen_type):
     elif gen_op == "dwconv2d_dgrad":
         operations = GenerateDwconv2dDgradOperations(GenArg(gen_op, gen_type))
     elif gen_op == "dwconv2d_wgrad":
-        pass
+        operations = GenerateDwconv2dWgradOperations(GenArg(gen_op, gen_type))
     for op in operations:
         f.write('    "%s.cu",\n' % op.procedural_name())
     if gen_op != "gemv":
@@ -52,4 +53,6 @@ if __name__ == "__main__":
         write_op_list(f, "dwconv2d_fprop", "tensorop884")
         write_op_list(f, "dwconv2d_dgrad", "simt")
         write_op_list(f, "dwconv2d_dgrad", "tensorop884")
+        write_op_list(f, "dwconv2d_wgrad", "simt")
+        write_op_list(f, "dwconv2d_wgrad", "tensorop884")
         f.write("]")
@@ -1115,7 +1115,10 @@ def GenerateDwconv2d_Simt(args, conv_kind):
     dst_types = [DataType.f32]
 
-    alignment_constraints = [128, 32]
+    if conv_kind == ConvKind.Wgrad:
+        alignment_constraints = [32]
+    else:
+        alignment_constraints = [128, 32]
 
     operations = []
     for math_inst in math_instructions:
@@ -1244,7 +1247,9 @@ def GenerateDwconv2d_Simt(args, conv_kind):
                     32,
                     32,
                     SpecialOptimizeDesc.NoneSpecialOpt,
-                    ImplicitGemmMode.GemmTN,
+                    ImplicitGemmMode.GemmNT
+                    if conv_kind == ConvKind.Wgrad
+                    else ImplicitGemmMode.GemmTN,
                 )
 
     return operations
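Both SIMT changes serve wgrad: the alignment constraint narrows to 32 bits (one f32 element, matching the align1x1 kernel names listed further down), and the implicit-GEMM operand order flips from TN to NT. A compact sketch of the selection, with strings standing in for the generator's enums:

    def simt_dwconv_config(conv_kind):
        alignment_constraints = [32] if conv_kind == "Wgrad" else [128, 32]  # bits
        gemm_mode = "GemmNT" if conv_kind == "Wgrad" else "GemmTN"
        return alignment_constraints, gemm_mode

    print(simt_dwconv_config("Wgrad"))  # ([32], 'GemmNT')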
@@ -1277,11 +1282,14 @@ def GenerateDwconv2d_TensorOp_884(args, conv_kind):
     dst_layouts = [LayoutType.TensorNCHW]
 
-    dst_types = [DataType.f16]
+    if conv_kind == ConvKind.Wgrad:
+        dst_types = [DataType.f32]
+    else:
+        dst_types = [DataType.f16]
 
     alignment_constraints = [128, 32, 16]
 
     cuda_major = 10
-    cuda_minor = 2
+    cuda_minor = 1
 
     operations = []
     for math_inst in math_instructions:
@@ -1295,24 +1303,48 @@ def GenerateDwconv2d_TensorOp_884(args, conv_kind):
     for layout in layouts:
         for dst_type, dst_layout in zip(dst_types, dst_layouts):
             for alignment_src in alignment_constraints:
-                operations += GenerateConv2d(
-                    ConvType.DepthwiseConvolution,
-                    conv_kind,
-                    tile_descriptions,
-                    layout[0],
-                    layout[1],
-                    dst_layout,
-                    dst_type,
-                    min_cc,
-                    alignment_src,
-                    16,
-                    16,
-                    SpecialOptimizeDesc.NoneSpecialOpt,
-                    ImplicitGemmMode.GemmTN,
-                    False,
-                    cuda_major,
-                    cuda_minor,
-                )
+                if conv_kind == ConvKind.Wgrad:
+                    # skip io16xc16
+                    if math_inst.element_accumulator == DataType.f16:
+                        continue
+                    for alignment_diff in alignment_constraints:
+                        operations += GenerateConv2d(
+                            ConvType.DepthwiseConvolution,
+                            conv_kind,
+                            tile_descriptions,
+                            layout[0],
+                            layout[1],
+                            dst_layout,
+                            dst_type,
+                            min_cc,
+                            alignment_src,
+                            alignment_diff,
+                            32,  # always f32 output
+                            SpecialOptimizeDesc.NoneSpecialOpt,
+                            ImplicitGemmMode.GemmNT,
+                            False,
+                            cuda_major,
+                            cuda_minor,
+                        )
+                else:
+                    operations += GenerateConv2d(
+                        ConvType.DepthwiseConvolution,
+                        conv_kind,
+                        tile_descriptions,
+                        layout[0],
+                        layout[1],
+                        dst_layout,
+                        dst_type,
+                        min_cc,
+                        alignment_src,
+                        16,
+                        16,
+                        SpecialOptimizeDesc.NoneSpecialOpt,
+                        ImplicitGemmMode.GemmTN,
+                        False,
+                        cuda_major,
+                        cuda_minor,
+                    )
 
     return operations
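For tensorop wgrad the generator now iterates the alignment constraints independently for src and diff, which is where the nine alignNxM variants in the kernel list below come from: the constraints 128/32/16 bits correspond to 8/2/1 f16 elements. A standalone sketch of the enumeration:

    bits = [128, 32, 16]
    elems = [b // 16 for b in bits]  # f16 is 16 bits wide -> [8, 2, 1]
    combos = [(s, d) for s in elems for d in elems]
    print(combos)  # nine (src, diff) pairs: (8, 8), (8, 2), ... (1, 1)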
@@ -1501,7 +1533,7 @@ def GeneratesGemm_TensorOp_884(args):
         # 1
     ]
     cuda_major = 10
-    cuda_minor = 2
+    cuda_minor = 1
 
     operations = []
     for math_inst in math_instructions:
@@ -1595,6 +1627,17 @@ def GenerateDwconv2dDgradOperations(args):
     return GenerateDwconv2d_TensorOp_884(args, ConvKind.Dgrad)
 
 
+def GenerateDwconv2dWgradOperations(args):
+    if args.type == "simt":
+        return GenerateDwconv2d_Simt(args, ConvKind.Wgrad)
+    else:
+        assert args.type == "tensorop884", (
+            "operation dwconv2d wgrad only supports "
+            "simt, tensorop884. (got:{})".format(args.type)
+        )
+        return GenerateDwconv2d_TensorOp_884(args, ConvKind.Wgrad)
+
+
 def GenerateGemmOperations(args):
     if args.type == "tensorop884":
         return GeneratesGemm_TensorOp_884(args)
@@ -1668,8 +1711,9 @@ if __name__ == "__main__":
         operations = GenerateDwconv2dFpropOperations(args)
     elif args.operations == "dwconv2d_dgrad":
         operations = GenerateDwconv2dDgradOperations(args)
-    elif args.operations == "dwconv2d_wgrad":
-        pass
+    else:
+        assert args.operations == "dwconv2d_wgrad", "invalid operation"
+        operations = GenerateDwconv2dWgradOperations(args)
 
     if (
         args.operations == "conv2d"
@@ -483,6 +483,7 @@ EpilogueFunctorTag = {
 #
 
 ShortEpilogueNames = {
+    EpilogueFunctor.LinearCombination: "id",
     EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: "hswish",
     EpilogueFunctor.BiasAddLinearCombinationReluClamp: "relu",
     EpilogueFunctor.BiasAddLinearCombinationClamp: "id",
@@ -1382,4 +1382,60 @@ cutlass_gen_list = [
     "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
     "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
     "all_dwconv2d_dgrad_tensorop884_operations.cu",
-]
+    "cutlass_simt_sdwwgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
+    "cutlass_simt_sdwwgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
+    "all_dwconv2d_wgrad_simt_operations.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x8.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x2.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
+    "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
+    "all_dwconv2d_wgrad_tensorop884_operations.cu",
+]
@@ -185,6 +185,8 @@ if(MGE_WITH_CUDA)
   gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES)
   gen_cutlass_kimpl(dwconv2d_dgrad simt CUTLASS_SOURCES)
   gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES)
+  gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES)
+  gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES)
   list(APPEND SOURCES ${CUTLASS_SOURCES})
   list(APPEND SOURCES ${CUSOURCES})
 endif()
@@ -317,7 +317,7 @@ void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() {
     for (auto&& algo : f32_implicit_bmm) {
         all_algos.push_back(&algo);
     }
-#if CUDA_VERSION >= 10020
+#if CUDA_VERSION >= 10010
     /// preferred algo
     f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
     f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
@@ -50,6 +50,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
     RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
     // check if channelwise convolution
     RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
     const auto* op = get_cutlass_conv_op(
             args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
     RETURN_IF_FALSE(op != nullptr);
@@ -50,6 +50,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
     RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
     // check if channelwise convolution
     RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
     const auto* op = get_cutlass_conv_op(
             args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
     RETURN_IF_FALSE(op != nullptr);
@@ -146,15 +146,17 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
             args.filter_meta.stride[0] != 1 ||
             args.filter_meta.stride[1] != 1 || hw_size < 512;
     //! choose for large kernel cases
-    size_t fh = args.filter_meta.spatial[2], fw = args.filter_meta.spatial[3];
+    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
     size_t hi = src[2], wi = src[3];
     const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
     //! avoid bad case in cudnn, check dnn chanwise impl first
     if (is_chanwise) {
         if (prefer_dnn_lk_implbmm) {
+#if CUDA_VERSION >= 10020
             if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
                         args, positive_attr, negative_attr, workspace_limit_in_bytes))
                 return &sm_algo_pack.f16_implicit_bmm[0];
+#endif
             if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
                         args, positive_attr, negative_attr, workspace_limit_in_bytes))
                 return &sm_algo_pack.f32_implicit_bmm[0];
@@ -72,7 +72,7 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_dwconv_algos() {
             all_algos.push_back(&algo);
         }
     }
-#if CUDA_VERSION >= 10020
+#if CUDA_VERSION >= 10010
     {
         using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
         /// preferred algo
@@ -24,8 +24,10 @@ const void* ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm:
     int alignment_diff = 0;
     int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
     for (int candidate : {16, 4, 2}) {
-        if (wo % candidate == 0)
+        if (wo % candidate == 0) {
             alignment_diff = candidate;
+            break;
+        }
     }
     alignment_diff /= args.diff_layout->dtype.size(1);
     NumericTypeID accumulator_dtype =
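The added braces and break fix a real bug: the candidates are ordered largest-first, and without the break the loop kept overwriting alignment_diff, ending at the smallest matching divisor rather than the largest. A Python sketch of the corrected behavior:

    def best_alignment(width_bytes, candidates=(16, 4, 2)):
        for candidate in candidates:
            if width_bytes % candidate == 0:
                return candidate  # first (largest) match, as with the new break
        return 0

    assert best_alignment(32) == 16  # the old loop fell through to 2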
@@ -85,6 +87,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_ava
     RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
     // check if channelwise convolution
     RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
     const auto* op = get_available_op(args);
     RETURN_IF_FALSE(op != nullptr);
     return true;
@@ -24,8 +24,10 @@ const void* ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
     int alignment_diff = 0;
     int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
     for (int candidate : {16, 4}) {
-        if (wo % candidate == 0)
+        if (wo % candidate == 0) {
             alignment_diff = candidate;
+            break;
+        }
     }
     alignment_diff /= args.diff_layout->dtype.size(1);
     ConvolutionKey key{
@@ -81,6 +83,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai
     RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
     // check if channelwise convolution
     RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
     const auto* op = get_available_op(args);
     RETURN_IF_FALSE(op != nullptr);
     return true;
@@ -25,6 +25,7 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() {
     for (auto&& i : cudnn) {
        all_algos.push_back(&i);
     }
+    fill_dwconv_algos();
     all_algos.push_back(&matmul);
     all_algos.push_back(&group);
@@ -48,6 +49,39 @@ ConvolutionBackwardFilterImpl::AlgoCUDNN* ConvolutionBackwardFilterImpl::AlgoPac
             "can not find cudnn bwd_filter algorithm %d", static_cast<int>(algo)));
 }
 
+void ConvolutionBackwardFilterImpl::AlgoPack::fill_dwconv_algos() {
+    {
+        using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam;
+        /// preferred algo
+        implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2});
+        implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2});
+        for (auto&& algo : implbmm_nchw_fma) {
+            all_algos.push_back(&algo);
+        }
+    }
+#if CUDA_VERSION >= 10010
+    {
+        using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
+        /// preferred algo
+        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
+        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
+        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
+        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
+        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
+        for (auto&& algo : implbmm_nchw_hmma) {
+            all_algos.push_back(&algo);
+        }
+    }
+#endif
+}
+
 ConvolutionBackwardFilterImpl::AlgoPack ConvolutionBackwardFilterImpl::sm_algo_pack;
 
 ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
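A quick cross-check (a standalone sketch, not repository code): the nine FMA tile shapes registered here are exactly the nine SIMT wgrad kernels in the cutlass_gen_list above, and the five HMMA shapes match the five tensorop threadblock shapes that recur under each alignment pair:

    fma_tiles = {(64, 128, 8), (128, 128, 8), (128, 64, 8), (128, 32, 8),
                 (32, 128, 8), (64, 64, 8), (32, 64, 8), (32, 32, 8), (64, 32, 8)}
    hmma_tiles = {(64, 128, 32), (128, 128, 32), (128, 256, 32), (128, 64, 32),
                  (64, 64, 32)}
    assert len(fma_tiles) == 9 and len(hmma_tiles) == 5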
@@ -37,6 +37,8 @@ public:
         CUDA_CHANWISE,
         CUDA_BFLOAT16,
         CUDA_GROUP_CONV_GENERAL,
+        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
+        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
     };
     using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
@@ -210,9 +212,86 @@ private:
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };
 
+class ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
+        : public AlgoBase {
+public:
+    struct AlgoParam {
+        int threadblock_m;
+        int threadblock_n;
+        int threadblock_k;
+        int warp_m;
+        int warp_n;
+        int warp_k;
+        int stage;
+        std::string to_string() {
+            return ssprintf(
+                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
+                    threadblock_k, warp_m, warp_n, warp_k, stage);
+        }
+    };
+
+    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
+            : m_algo_param{algo_param},
+              m_name{ssprintf(
+                      "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
+                      m_algo_param.to_string().c_str())} {}
+
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
+    void exec(const ExecArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)
+
+private:
+    const void* get_available_op(const SizeArgs& args) const;
+    AlgoParam m_algo_param;
+    std::string m_name;
+};
+
+class ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
+        : public AlgoBase {
+public:
+    /// add instruction shape as member of algo param, because f16 tensor core has 2
+    /// different matrix shapes (i.e. mma.884 and mma.1688)
+    struct AlgoParam {
+        int threadblock_m;
+        int threadblock_n;
+        int threadblock_k;
+        int warp_m;
+        int warp_n;
+        int warp_k;
+        int instruction_m;
+        int instruction_n;
+        int instruction_k;
+        int stage;
+        std::string to_string() {
+            return ssprintf(
+                    "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
+                    threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
+                    instruction_n, instruction_k, stage);
+        }
+    };
+
+    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
+            : m_algo_param{algo_param},
+              m_name{ssprintf(
+                      "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
+                      m_algo_param.to_string().c_str())} {}
+
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)
+
+private:
+    const void* get_available_op(const SizeArgs& args) const;
+    AlgoParam m_algo_param;
+    std::string m_name;
+};
+
 class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj {
     // defined in cudnn.cpp
     void fill_cudnn_algos();
+    void fill_dwconv_algos();
 
     AlgoBase::Mapper m_all_algos_map;
 
@@ -224,6 +303,8 @@ public:
     AlgoChanwise chanwise;
     AlgoGroupConvGeneral group;
     AlgoBFloat16 bfloat16;
+    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
+    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;
 
     std::vector<AlgoBase*>
             //! all algorithms
@@ -0,0 +1,172 @@
+/**
+ * \file
+ * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "src/cuda/convolution/backward_filter/algo.h"
+#include "src/cuda/cutlass/singleton.h"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace cutlass::library;
+
+const void* ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
+        get_available_op(const SizeArgs& args) const {
+    auto get_alignment = [](const TensorLayout& layout) {
+        int alignment = 0;
+        int width = layout.dtype.size(layout[3]);
+        for (int candidate : {16, 4, 2}) {
+            if (width % candidate == 0) {
+                alignment = candidate;
+                break;
+            }
+        }
+        alignment /= layout.dtype.size(1);
+        return alignment;
+    };
+    int alignment_src = get_alignment(*args.src_layout);
+    int alignment_diff = get_alignment(*args.diff_layout);
+    megdnn_assert(alignment_src >= 1 && alignment_diff >= 1);
+    NumericTypeID accumulator_dtype =
+            args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT
+                    ? NumericTypeID::kF16
+                    : NumericTypeID::kF32;
+    ConvolutionKey key{
+            cutlass::conv::Operator::kWgrad,
+            NumericTypeID::kF16,        // src tensor data type
+            LayoutTypeID::kTensorNCHW,  // src tensor layout
+            NumericTypeID::kF16,        // diff tensor data type
+            LayoutTypeID::kTensorNCHW,  // diff tensor layout
+            NumericTypeID::kF32,        // grad tensor data type
+            LayoutTypeID::kTensorNCHW,  // grad tensor layout
+            NumericTypeID::kF32,        // dummy argument, not used.
+            LayoutTypeID::kTensorNCHW,  // dummy argument, not used
+            accumulator_dtype,
+            cutlass::conv::ConvType::kDepthwiseConvolution,
+            m_algo_param.threadblock_m,
+            m_algo_param.threadblock_n,
+            m_algo_param.threadblock_k,
+            m_algo_param.warp_m,
+            m_algo_param.warp_n,
+            m_algo_param.warp_k,
+            m_algo_param.instruction_m,
+            m_algo_param.instruction_n,
+            m_algo_param.instruction_k,
+            cutlass::epilogue::EpilogueType::kLinearCombination,  // no bias
+            m_algo_param.stage,
+            cutlass::conv::SpecialOptimizeDesc::NONE,
+            alignment_src,
+            alignment_diff,
+            true};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
+bool ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
+        is_available(const SizeArgs& args) const {
+#define RETURN_IF_FALSE(stmt_) \
+    if (!(stmt_))              \
+        return false;
+    RETURN_IF_FALSE(is_compute_capability_required(7, 0));
+    RETURN_IF_FALSE(
+            args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
+            args.grad_layout->is_contiguous());
+    using Param = param::Convolution;
+    using Format = Param::Format;
+    using Sparse = Param::Sparse;
+    using Mode = Param::Mode;
+    using ComputeMode = Param::ComputeMode;
+    auto&& param = args.opr->param();
+    auto&& fm = args.grad_filter_meta;
+    RETURN_IF_FALSE(param.compute_mode == ComputeMode::FLOAT32);
+    RETURN_IF_FALSE(
+            param.format == Format::NCHW &&
+            args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
+            args.diff_layout->dtype.enumv() == DTypeEnum::Float16 &&
+            args.grad_layout->dtype.enumv() == DTypeEnum::Float16);
+    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
+    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
+    // check if channelwise convolution
+    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
+    const auto* op = get_available_op(args);
+    RETURN_IF_FALSE(op != nullptr);
+    return true;
+#undef RETURN_IF_FALSE
+}
+
+size_t ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
+        get_workspace_in_bytes(const SizeArgs& args) const {
+    auto layout = *args.grad_layout;
+    // modify data type
+    layout.modify_dtype_inplace(dtype::Float32());
+    return layout.span().dist_byte();
+}
+
+void ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
+        const ExecArgs& args) const {
+    auto&& param = args.opr->param();
+    auto&& fm = args.grad_filter_meta;
+    int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
+    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
+        wo = args.diff_layout->operator[](3);
+    int co = fm.group, ci = co, groups = co;
+    int fh = fm.spatial[0], fw = fm.spatial[1];
+    int sh = fm.stride[0], sw = fm.stride[1];
+    int ph = fm.padding[0], pw = fm.padding[1];
+    int dh = param.dilate_h, dw = param.dilate_w;
+    // check if channelwise convolution
+    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
+    auto&& stream = cuda_stream(args.opr->handle());
+
+    float alpha = 1.f;
+    float beta = 0.f;
+
+    const Operation* op = (const Operation*)get_available_op(args);
+
+    cutlass::conv::Conv2dProblemSize problem_size{
+            n,  hi, wi, ci, co, fh, fw, ho,
+            wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
+            1,       // split k slices, always 1
+            groups,  // groups
+    };
+
+    cutlass::library::ConvolutionArguments conv_args{
+            problem_size,
+            args.src_tensor->raw_ptr(),
+            args.diff_tensor->raw_ptr(),
+            nullptr,
+            nullptr,
+            args.workspace.raw_ptr,
+            &alpha,
+            &beta,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr};
+
+    cutlass_check(op->run(&conv_args, nullptr, stream));
+
+    after_kernel_launch();
+
+    auto&& typecvt = args.opr->handle()->create_operator<TypeCvt>();
+    auto f32_grad_layout = *args.grad_layout;
+    // modify data type
+    f32_grad_layout.modify_dtype_inplace(dtype::Float32());
+    TensorND src{args.workspace.raw_ptr, f32_grad_layout},
+            dst{args.grad_tensor->raw_ptr(), *args.grad_layout};
+    typecvt->exec(src, dst);
+}
+
+// vim: syntax=cpp.doxygen
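Note that the f16 wgrad path above never writes f16 directly: the kernel stores the filter gradient into an f32 workspace (hence the non-zero get_workspace_in_bytes), and a trailing TypeCvt converts it into the caller's f16 grad tensor. A NumPy sketch of that data flow (descriptive names, not the real MegDNN API):

    import numpy as np

    def wgrad_f16(run_kernel, grad_shape):
        workspace = np.empty(grad_shape, dtype=np.float32)  # f32 scratch buffer
        run_kernel(out=workspace)             # kernel writes the f32 filter gradient
        return workspace.astype(np.float16)   # typecvt->exec(src, dst)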
@@ -0,0 +1,135 @@
+/**
+ * \file
+ * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "src/cuda/convolution/backward_filter/algo.h"
+#include "src/cuda/cutlass/singleton.h"
+#include "src/cuda/utils.h"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace cutlass::library;
+
+const void* ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
+        get_available_op(const SizeArgs& args) const {
+    ConvolutionKey key{
+            cutlass::conv::Operator::kWgrad,
+            NumericTypeID::kF32,        // src tensor data type
+            LayoutTypeID::kTensorNCHW,  // src tensor layout
+            NumericTypeID::kF32,        // diff tensor data type
+            LayoutTypeID::kTensorNCHW,  // diff tensor layout
+            NumericTypeID::kF32,        // grad tensor data type
+            LayoutTypeID::kTensorNCHW,  // grad tensor layout
+            NumericTypeID::kF32,        // dummy argument, not used.
+            LayoutTypeID::kTensorNCHW,  // dummy argument, not used
+            NumericTypeID::kF32,
+            cutlass::conv::ConvType::kDepthwiseConvolution,
+            m_algo_param.threadblock_m,
+            m_algo_param.threadblock_n,
+            m_algo_param.threadblock_k,
+            m_algo_param.warp_m,
+            m_algo_param.warp_n,
+            m_algo_param.warp_k,
+            1,
+            1,
+            1,
+            cutlass::epilogue::EpilogueType::kLinearCombination,  // no bias
+            m_algo_param.stage,
+            cutlass::conv::SpecialOptimizeDesc::NONE,
+            1,
+            1,
+            true};
+    return (void*)Singleton::get().operation_table.find_op(key);
+}
+
+bool ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
+        const SizeArgs& args) const {
+#define RETURN_IF_FALSE(stmt_) \
+    if (!(stmt_))              \
+        return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
+    RETURN_IF_FALSE(
+            args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
+            args.grad_layout->is_contiguous());
+    using Param = param::Convolution;
+    using Format = Param::Format;
+    using Sparse = Param::Sparse;
+    using Mode = Param::Mode;
+    auto&& param = args.opr->param();
+    auto&& fm = args.grad_filter_meta;
+    RETURN_IF_FALSE(
+            param.format == Format::NCHW &&
+            args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
+            args.diff_layout->dtype.enumv() == DTypeEnum::Float32 &&
+            args.grad_layout->dtype.enumv() == DTypeEnum::Float32);
+    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
+    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
+    // check if channelwise convolution
+    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
+    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
+    const auto* op = get_available_op(args);
+    RETURN_IF_FALSE(op != nullptr);
+    return true;
+#undef RETURN_IF_FALSE
+}
+
+void ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
+        const ExecArgs& args) const {
+    auto&& param = args.opr->param();
+    auto&& fm = args.grad_filter_meta;
+    int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
+    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
+        wo = args.diff_layout->operator[](3);
+    int co = fm.group, ci = co, groups = co;
+    int fh = fm.spatial[0], fw = fm.spatial[1];
+    int sh = fm.stride[0], sw = fm.stride[1];
+    int ph = fm.padding[0], pw = fm.padding[1];
+    int dh = param.dilate_h, dw = param.dilate_w;
+    // check if channelwise convolution
+    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
+    auto&& stream = cuda_stream(args.opr->handle());
+
+    float alpha = 1.f;
+    float beta = 0.f;
+
+    const Operation* op = (const Operation*)get_available_op(args);
+
+    cutlass::conv::Conv2dProblemSize problem_size{
+            n,  hi, wi, ci, co, fh, fw, ho,
+            wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
+            1,       // split k slices, always 1
+            groups,  // groups
+    };
+
+    cutlass::library::ConvolutionArguments conv_args{
+            problem_size,
+            args.src_tensor->raw_ptr(),
+            args.diff_tensor->raw_ptr(),
+            nullptr,
+            nullptr,
+            args.grad_tensor->raw_ptr(),
+            &alpha,
+            &beta,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr,
+            nullptr};
+
+    cutlass_check(op->run(&conv_args, nullptr, stream));
+
+    after_kernel_launch();
+}
+
+// vim: syntax=cpp.doxygen
@@ -116,15 +116,18 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
     AlgoBase::SizeArgs args(this, filter, diff, grad);
 
     //! choose for large kernel cases
-    size_t fh = args.filter_meta.spatial[2], fw = args.filter_meta.spatial[3];
+    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
     size_t ho = diff[2], wo = diff[3];
     const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
                                        ho <= 2 * fh && wo <= 2 * fw;
     if (prefer_dnn_lk_implbmm) {
-        if (sm_algo_pack.implbmm_nchw_hmma.is_available_attribute(
+#if CUDA_VERSION >= 10020
+        if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
                     args, positive_attr, negative_attr, workspace_limit_in_bytes))
             return &sm_algo_pack.implbmm_nchw_hmma[0];
-        if (sm_algo_pack.implbmm_nchw_fma.is_available_attribute(args, positive_attr, negative_attr, workspace_limit_in_bytes))
+#endif
+        if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
+                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
             return &sm_algo_pack.implbmm_nchw_fma[0];
     }
@@ -255,6 +258,23 @@ ConvolutionBackwardFilterImpl::Algorithm* ConvolutionBackwardFilterImpl::
         const AlgoAttribute& negative_attr) {
     AlgoBase::SizeArgs args(this, src, diff, grad);
 
+    //! choose for large kernel cases
+    size_t fh = args.grad_filter_meta.spatial[0], fw = args.grad_filter_meta.spatial[1];
+    size_t ho = diff[2], wo = diff[3];
+    const bool prefer_dnn_lk_implbmm =
+            args.grad_filter_meta.format == Param::Format::NCHW && ho <= 2 * fh &&
+            wo <= 2 * fw;
+    if (prefer_dnn_lk_implbmm) {
+#if CUDA_VERSION >= 10020
+        if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
+                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
+            return &sm_algo_pack.implbmm_nchw_hmma[0];
+#endif
+        if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
+                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
+            return &sm_algo_pack.implbmm_nchw_fma[0];
+    }
+
     if (args.grad_filter_meta.group > 1 &&
         sm_algo_pack.chanwise.is_available_attribute(
                 args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
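This heuristic mirrors the one already used for backward data: prefer the implicit batched-GEMM algorithms when the output is at most twice the filter in each spatial dimension, i.e. the large-kernel depthwise regime these kernels target. A sketch of the predicate:

    def prefer_implbmm(ho, wo, fh, fw, fmt="NCHW"):
        return fmt == "NCHW" and ho <= 2 * fh and wo <= 2 * fw

    assert prefer_implbmm(ho=31, wo=31, fh=31, fw=31)      # 31x31 depthwise kernel
    assert not prefer_implbmm(ho=112, wo=112, fh=3, fw=3)  # ordinary small kernel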
@@ -156,6 +156,8 @@ public:
     class AlgoChanwise;
     class AlgoGroupConvGeneral;
     class AlgoBFloat16;
+    class AlgoFloat32NCHWFMAImplicitBatchedGemm;
+    class AlgoFloat16NCHWHMMAImplicitBatchedGemm;
 
     class AlgoPack;
@@ -135,6 +135,15 @@ namespace detail {
 template <typename EpilogueOp, epilogue::EpilogueType type>
 struct init_epilogue_param_;
 
+template <typename EpilogueOp>
+struct init_epilogue_param_<EpilogueOp, epilogue::EpilogueType::kLinearCombination> {
+    using ElementCompute = typename EpilogueOp::ElementCompute;
+    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
+        return {*static_cast<ElementCompute const*>(conv_args->alpha),
+                *static_cast<ElementCompute const*>(conv_args->beta)};
+    }
+};
+
 template <typename EpilogueOp>
 struct init_epilogue_param_<
         EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> {
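The new kLinearCombination specialization exists because the wgrad epilogue has no bias or z tensor: its Params are just the (alpha, beta) scalars taken from ConvolutionArguments, matching the alpha = 1.f / beta = 0.f and nullptr bias pointers set in the exec functions above. A plain-Python stand-in:

    def linear_combination_params(conv_args):
        # bias-free epilogue: dst = alpha * accumulator + beta * dst
        return conv_args["alpha"], conv_args["beta"]

    print(linear_combination_params({"alpha": 1.0, "beta": 0.0}))  # (1.0, 0.0)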
| @@ -290,6 +299,159 @@ public: | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| /// We add a new template class to handle convolution backward filter operation, because | |||||
| /// the device-level convolution operator of backward filter is different from the | |||||
| /// others (convolution forward and convolution backward data). | |||||
| /// But the description object is reused in this wrapper of convolution backward filter. | |||||
| /// The reason is that we do not want to introduce an another unnecessary structure. | |||||
| /// TODO: Maybe the device-level operator in cutlass for convoluton forward, backward | |||||
| /// data and backward filter should be combined. | |||||
| template <typename Operator_> | |||||
| class ConvolutionBackwardFilterOperationBase : public Operation { | |||||
| public: | |||||
| using Operator = Operator_; | |||||
| using ElementSrc = typename Operator::ElementSrc; | |||||
| using LayoutSrc = typename Operator::LayoutSrc; | |||||
| using ElementDiff = typename Operator::ElementDiff; | |||||
| using LayoutDiff = typename Operator::LayoutDiff; | |||||
| using ElementGrad = typename Operator::ElementGrad; | |||||
| using LayoutGrad = typename Operator::LayoutGrad; | |||||
| using ElementAccumulator = typename Operator::ElementAccumulator; | |||||
| ConvolutionBackwardFilterOperationBase(char const* name = "unknown_convolution") { | |||||
| m_description.name = name; | |||||
| m_description.provider = Provider::kCUTLASS; | |||||
| m_description.kind = OperationKind::kConvolution; | |||||
| m_description.conv_op = Operator::kConvolutionalOperator; | |||||
| m_description.tile_description.threadblock_shape = make_Coord( | |||||
| Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN, | |||||
| Operator::ThreadblockShape::kK); | |||||
| m_description.tile_description.threadblock_stages = Operator::kStages; | |||||
| m_description.tile_description.warp_count = make_Coord( | |||||
| Operator::ConvolutionKernel::WarpCount::kM, | |||||
| Operator::ConvolutionKernel::WarpCount::kN, | |||||
| Operator::ConvolutionKernel::WarpCount::kK); | |||||
| m_description.tile_description.math_instruction.instruction_shape = make_Coord( | |||||
| Operator::InstructionShape::kM, Operator::InstructionShape::kN, | |||||
| Operator::InstructionShape::kK); | |||||
| m_description.tile_description.math_instruction.element_accumulator = | |||||
| NumericTypeMap<ElementAccumulator>::kId; | |||||
| m_description.tile_description.math_instruction.opcode_class = | |||||
| OpcodeClassMap<typename Operator::OperatorClass>::kId; | |||||
| m_description.tile_description.math_instruction.math_operation = | |||||
| MathOperationMap<typename Operator::Operator>::kId; | |||||
| m_description.tile_description.minimum_compute_capability = | |||||
| ArchMap<typename Operator::ArchTag, | |||||
| typename Operator::OperatorClass>::kMin; | |||||
| m_description.tile_description.maximum_compute_capability = | |||||
| ArchMap<typename Operator::ArchTag, | |||||
| typename Operator::OperatorClass>::kMax; | |||||
| /// src in description -> src in C++ template | |||||
| m_description.src = | |||||
| make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc); | |||||
| /// filter in description -> diff in C++ template | |||||
| m_description.filter = make_TensorDescription<ElementDiff, LayoutDiff>( | |||||
| Operator::kAlignmentDiff); | |||||
| /// dst in description -> grad in C++ template | |||||
| m_description.dst = make_TensorDescription<ElementGrad, LayoutGrad>( | |||||
| Operator::kAlignmentGrad); | |||||
| /// the bias tensor is not used in the ConvolutionBackwardFilter operation, so | |||||
| /// the following tensor description is a dummy argument | |||||
| m_description.bias = make_TensorDescription<ElementGrad, LayoutGrad>( | |||||
| Operator::kAlignmentGrad); | |||||
| m_description.convolution_type = Operator::kConvolutionType; | |||||
| m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId; | |||||
| m_description.epilogue_type = Operator::EpilogueOutputOp::kType; | |||||
| m_description.epilogue_count = Operator::EpilogueOutputOp::kCount; | |||||
| m_description.threadblock_swizzle = | |||||
| ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId; | |||||
| m_description.special_optimization = Operator::kSpecialOpt; | |||||
| m_description.gemm_mode = Operator::kGemmMode; | |||||
| /// the ConvolutionBackwardFilter operation is only used for depthwise | |||||
| /// convolution, so the option without_shared_load is always true | |||||
| m_description.without_shared_load = true; | |||||
| } | |||||
| virtual OperationDescription const& description() const { return m_description; } | |||||
| protected: | |||||
| ConvolutionDescription m_description; | |||||
| }; | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template <typename Operator_> | |||||
| class ConvolutionBackwardFilterOperation | |||||
| : public ConvolutionBackwardFilterOperationBase<Operator_> { | |||||
| public: | |||||
| using Operator = Operator_; | |||||
| using ElementSrc = typename Operator::ElementSrc; | |||||
| using LayoutSrc = typename Operator::LayoutSrc; | |||||
| using ElementDiff = typename Operator::ElementDiff; | |||||
| using LayoutDiff = typename Operator::LayoutDiff; | |||||
| using ElementGrad = typename Operator::ElementGrad; | |||||
| using LayoutGrad = typename Operator::LayoutGrad; | |||||
| using ElementAccumulator = typename Operator::ElementAccumulator; | |||||
| using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; | |||||
| using OperatorArguments = typename Operator::Arguments; | |||||
| ConvolutionBackwardFilterOperation(char const* name = "unknown_convolution") | |||||
| : ConvolutionBackwardFilterOperationBase<Operator_>(name) {} | |||||
| virtual Status run( | |||||
| void const* arguments_ptr, void* device_workspace = nullptr, | |||||
| cudaStream_t stream = nullptr) const { | |||||
| cutlass::conv::Operator conv_op = this->m_description.conv_op; | |||||
| ConvolutionArguments const* conv_args = | |||||
| reinterpret_cast<ConvolutionArguments const*>(arguments_ptr); | |||||
| const auto& ps = conv_args->problem_size; | |||||
| OperatorArguments args; | |||||
| args.problem_size = ps; | |||||
| /// src in convolution arguments -> ref_src | |||||
| args.ref_src = { | |||||
| static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)), | |||||
| LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))}; | |||||
| /// filter in convolution arguments -> ref_diff | |||||
| args.ref_diff = { | |||||
| static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)), | |||||
| LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))}; | |||||
| /// dst in convolution arguments -> ref_grad | |||||
| args.ref_grad = { | |||||
| static_cast<ElementGrad*>(conv_args->dst), | |||||
| LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))}; | |||||
| args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get( | |||||
| conv_args); | |||||
| Operator op; | |||||
| Status status = op.initialize(args, device_workspace); | |||||
| if (status != Status::kSuccess) { | |||||
| return status; | |||||
| } | |||||
| return op.run(stream); | |||||
| } | |||||
| }; | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
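From the caller's point of view, the wrapper above preserves the type-erased `Operation` contract: fill a `ConvolutionArguments` and hand it to `run()`. A hedged caller-side sketch follows; only the `run()` signature and the argument fields used in `run()` above come from this file, while `ProblemSize`, the helper name, and the tensor pointers are assumptions:

```cpp
// Hypothetical caller-side helper (not part of the diff).
template <typename ProblemSize>
cutlass::Status run_wgrad(
        cutlass::library::Operation const* op, ProblemSize const& problem_size,
        void const* x, void const* dy, void* dw, void* workspace,
        cudaStream_t stream) {
    float alpha = 1.f, beta = 0.f;  // plain LinearCombination epilogue scalars
    cutlass::library::ConvolutionArguments conv_args;
    conv_args.problem_size = problem_size;
    conv_args.src = x;      // x is tensor B of the implicit gemm (ref_src)
    conv_args.filter = dy;  // dy is tensor A of the implicit gemm (ref_diff)
    conv_args.dst = dw;     // dw is tensor C of the implicit gemm (ref_grad)
    conv_args.alpha = &alpha;
    conv_args.beta = &beta;
    return op->run(&conv_args, workspace, stream);
}
```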
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| @@ -44,6 +44,11 @@ namespace cutlass { | |||||
| namespace library { | namespace library { | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #if ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1)) | |||||
| #define CUTLASS_ARCH_MMA_SM70_SUPPORTED 1 | |||||
| #endif | |||||
| #if ((__CUDACC_VER_MAJOR__ > 10) || \ | #if ((__CUDACC_VER_MAJOR__ > 10) || \ | ||||
| (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) | (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) | ||||
| #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 | #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 | ||||
| @@ -56,14 +61,18 @@ void initialize_all_conv2d_simt_operations(Manifest& manifest); | |||||
| void initialize_all_deconv_simt_operations(Manifest& manifest); | void initialize_all_deconv_simt_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); | void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest); | void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest); | ||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||||
| void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_dgrad_tensorop884_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_wgrad_tensorop884_operations(Manifest& manifest); | |||||
| #endif | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | ||||
| void initialize_all_deconv_tensorop8816_operations(Manifest& manifest); | void initialize_all_deconv_tensorop8816_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_dgrad_tensorop884_operations(Manifest& manifest); | |||||
| #endif | #endif | ||||
| void initialize_all(Manifest& manifest) { | void initialize_all(Manifest& manifest) { | ||||
| @@ -72,14 +81,18 @@ void initialize_all(Manifest& manifest) { | |||||
| initialize_all_deconv_simt_operations(manifest); | initialize_all_deconv_simt_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_simt_operations(manifest); | initialize_all_dwconv2d_fprop_simt_operations(manifest); | ||||
| initialize_all_dwconv2d_dgrad_simt_operations(manifest); | initialize_all_dwconv2d_dgrad_simt_operations(manifest); | ||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| initialize_all_dwconv2d_wgrad_simt_operations(manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||||
| initialize_all_gemm_tensorop884_operations(manifest); | initialize_all_gemm_tensorop884_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); | |||||
| initialize_all_dwconv2d_dgrad_tensorop884_operations(manifest); | |||||
| initialize_all_dwconv2d_wgrad_tensorop884_operations(manifest); | |||||
| #endif | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| initialize_all_gemm_tensorop1688_operations(manifest); | initialize_all_gemm_tensorop1688_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8816_operations(manifest); | initialize_all_conv2d_tensorop8816_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8832_operations(manifest); | initialize_all_conv2d_tensorop8832_operations(manifest); | ||||
| initialize_all_deconv_tensorop8816_operations(manifest); | initialize_all_deconv_tensorop8816_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); | |||||
| initialize_all_dwconv2d_dgrad_tensorop884_operations(manifest); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -279,7 +279,6 @@ struct ConvolutionKey { | |||||
| struct ConvolutionKeyHasher { | struct ConvolutionKeyHasher { | ||||
| inline size_t operator()(ConvolutionKey const& key) const { | inline size_t operator()(ConvolutionKey const& key) const { | ||||
| return Hash() | return Hash() | ||||
| .update(&key.conv_op, sizeof(key.conv_op)) | |||||
| .update(&key.conv_op, sizeof(key.conv_op)) | .update(&key.conv_op, sizeof(key.conv_op)) | ||||
| .update(&key.element_src, sizeof(key.element_src)) | .update(&key.element_src, sizeof(key.element_src)) | ||||
| .update(&key.layout_src, sizeof(key.layout_src)) | .update(&key.layout_src, sizeof(key.layout_src)) | ||||
| @@ -1322,6 +1322,8 @@ static struct { | |||||
| {"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution}, | {"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution}, | ||||
| {"local", "Local", conv::ConvType::kLocal}, | {"local", "Local", conv::ConvType::kLocal}, | ||||
| {"local_share", "LocalShare", conv::ConvType::kLocalShare}, | {"local_share", "LocalShare", conv::ConvType::kLocalShare}, | ||||
| {"depthwise_convolution", "DepthwiseConvolution", | |||||
| conv::ConvType::kDepthwiseConvolution}, | |||||
| }; | }; | ||||
| /// Converts a ConvType enumerant to a string | /// Converts a ConvType enumerant to a string | ||||
| @@ -44,7 +44,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& algo : simt_float32_gemv_batched_strided) { | for (auto&& algo : simt_float32_gemv_batched_strided) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| for (auto&& algo : tensorop_float16) { | for (auto&& algo : tensorop_float16) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| @@ -113,21 +113,26 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { | |||||
| simt_float32_gemv_batched_strided.emplace_back(128); | simt_float32_gemv_batched_strided.emplace_back(128); | ||||
| simt_float32_gemv_batched_strided.emplace_back(64); | simt_float32_gemv_batched_strided.emplace_back(64); | ||||
| simt_float32_gemv_batched_strided.emplace_back(32); | simt_float32_gemv_batched_strided.emplace_back(32); | ||||
| #define FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| #define FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 8, 8, 4); | |||||
| #define FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 16, 8, 8); | cb(128, 128, 32, 64, 64, 32, 16, 8, 8); | ||||
| #define cb(...) \ | #define cb(...) \ | ||||
| tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ | tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ | ||||
| tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); | tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); | ||||
| #if CUDA_VERSION >= 10010 | |||||
| FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) | |||||
| #endif | |||||
| #if CUDA_VERSION >= 10020 | #if CUDA_VERSION >= 10020 | ||||
| FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) | |||||
| FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) | |||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES | |||||
| #undef FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES | |||||
| #undef FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES | |||||
| } | } | ||||
| #endif | #endif | ||||
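The shape-list split above is the usual X-macro pattern: each `cb(...)` row carries threadblock/warp/instruction shapes, the SM70 rows (mma 8x8x4) compile from CUDA 10.1, and the SM75 rows (mma 16x8x8) still require CUDA 10.2. A self-contained sketch of the expansion, with a stand-in `AlgoParam`:

```cpp
#include <cstdio>
#include <vector>

// Stand-in for AlgoParam: threadblock m/n/k, warp m/n/k, instruction m/n/k.
struct AlgoParam {
    int tbm, tbn, tbk, wm, wn, wk, im, in_, ik;
};

int main() {
    std::vector<AlgoParam> tensorop_float16;
// Each cb(...) row is expanded once per consumer; rows behind a version
// guard simply drop out when the toolchain is too old.
#define FOREACH_MMA_SM70_SHAPES(cb)     \
    cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \
    cb(128, 256, 32, 64, 64, 32, 8, 8, 4);
#define cb(...) tensorop_float16.push_back(AlgoParam{__VA_ARGS__})
    FOREACH_MMA_SM70_SHAPES(cb)
#undef cb
#undef FOREACH_MMA_SM70_SHAPES
    std::printf("%zu shapes registered\n", tensorop_float16.size());  // 2
    return 0;
}
```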
| @@ -350,7 +350,7 @@ private: | |||||
| std::string m_name; | std::string m_name; | ||||
| }; | }; | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| class MatrixMulForwardImpl::AlgoFloat16TensorOp final | class MatrixMulForwardImpl::AlgoFloat16TensorOp final | ||||
| : public AlgoCutlassMatrixMulBase { | : public AlgoCutlassMatrixMulBase { | ||||
| public: | public: | ||||
| @@ -418,7 +418,7 @@ public: | |||||
| std::vector<AlgoFloat32SIMT> simt_float32; | std::vector<AlgoFloat32SIMT> simt_float32; | ||||
| std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; | std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; | ||||
| std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided; | std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided; | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| std::vector<AlgoFloat16TensorOp> tensorop_float16; | std::vector<AlgoFloat16TensorOp> tensorop_float16; | ||||
| std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k; | std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k; | ||||
| #endif | #endif | ||||
| @@ -15,7 +15,7 @@ | |||||
| #include "src/cuda/matrix_mul/algos.h" | #include "src/cuda/matrix_mul/algos.h" | ||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -15,7 +15,7 @@ | |||||
| #include "src/cuda/matrix_mul/algos.h" | #include "src/cuda/matrix_mul/algos.h" | ||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -46,8 +46,10 @@ public: | |||||
| class AlgoFloat32SIMT; | class AlgoFloat32SIMT; | ||||
| class AlgoFloat32SIMTSplitK; | class AlgoFloat32SIMTSplitK; | ||||
| class AlgoFloat32SIMTGemvBatchedStrided; | class AlgoFloat32SIMTGemvBatchedStrided; | ||||
| #if CUDA_VERSION >= 10010 | |||||
| class AlgoFloat16TensorOp; | class AlgoFloat16TensorOp; | ||||
| class AlgoFloat16TensorOpSplitK; | class AlgoFloat16TensorOpSplitK; | ||||
| #endif | |||||
| #endif | #endif | ||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -494,6 +494,21 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | ||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}}); | .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}}); | ||||
| } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) { | } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) { | ||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {8, 2, 16, 16}, {2, 1, 1, 15, 15}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 15, 15}, {8, 2, 15, 15}, {2, 1, 1, 15, 15}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {8, 2, 14, 14}, {2, 1, 1, 15, 15}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {8, 2, 8, 8}, {2, 1, 1, 15, 15}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {8, 2, 7, 7}, {2, 1, 1, 15, 15}}); | |||||
| } | } | ||||
| } | } | ||||
| } // namespace | } // namespace | ||||
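The `align 8` / `align 1` / `align 2` labels in the new backward-filter cases above refer to the widest vector access that the innermost (W) extent permits for fp16 tensors. A standalone sketch of that rule as assumed here (the real kernels pick alignment at generation time, and the 128-bit transaction bound is an assumption):

```cpp
#include <cstddef>
#include <cstdio>

// The widest vector access (in elements) must divide the innermost extent
// and fit in a 128-bit memory transaction.
static size_t max_alignment(size_t innermost_extent, size_t elem_bits) {
    for (size_t align : {8, 4, 2, 1})
        if (innermost_extent % align == 0 && align * elem_bits <= 128)
            return align;
    return 1;
}

int main() {
    std::printf("%zu\n", max_alignment(16, 16));  // 8 -> the "align 8" case
    std::printf("%zu\n", max_alignment(15, 16));  // 1 -> the "align 1" case
    std::printf("%zu\n", max_alignment(14, 16));  // 2 -> the "align 2" case
    return 0;
}
```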
| @@ -535,14 +550,32 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_FMA_##tag) { \ | |||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionBackwardFilter>( \ | |||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL | ||||
| #if CUDA_VERSION >= 10010 | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \ | #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \ | ||||
| cb(1, 128, 128, 32, 32, 32, 32); \ | cb(1, 128, 128, 32, 32, 32, 32); \ | ||||
| cb(2, 128, 256, 32, 64, 64, 32); \ | cb(2, 128, 256, 32, 64, 64, 32); \ | ||||
| cb(3, 128, 64, 32, 32, 32, 32); \ | cb(3, 128, 64, 32, 32, 32, 32); \ | ||||
| cb(4, 64, 128, 32, 32, 32, 32); \ | cb(4, 64, 128, 32, 32, 32, 32); \ | ||||
| cb(5, 64, 64, 32, 32, 32, 32); | cb(5, 64, 64, 32, 32, 32, 32); | ||||
| #else | |||||
| // hmma instructions require cuda version >= 10.1; disable hmma testcases in this path | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #endif | |||||
| // check both ioc16 and io16xc32 | // check both ioc16 and io16xc32 | ||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | ||||
| @@ -579,6 +612,19 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_HMMA_##tag) { \ | |||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionBackwardFilter>( \ | |||||
| dtype::Float16(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL | ||||
| #if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
| @@ -1434,6 +1480,77 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) { | |||||
| } | } | ||||
| // clang-format on | // clang-format on | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_FILTER_LARGE_KERNEL) { | |||||
| CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda()); | |||||
| size_t RUNS = 100; | |||||
| bencher.set_display(false).set_times(RUNS); | |||||
| std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | |||||
| new OprProxy<ConvolutionBackwardFilter>{true}}; | |||||
| bencher.set_proxy(proxy); | |||||
| Convolution::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| param.sparse = Convolution::Param::Sparse::GROUP; | |||||
| NormalRNG rng; | |||||
| auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) { | |||||
| param.pad_h = f / 2; | |||||
| param.pad_w = f / 2; | |||||
| param.stride_h = s; | |||||
| param.stride_w = s; | |||||
| param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}; | |||||
| TensorLayout dst_layout; | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout( | |||||
| {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout); | |||||
| float bandwidth = static_cast<float>( | |||||
| src.total_nr_elems() + filter.total_nr_elems() + | |||||
| dst_layout.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp32 = bencher.execs({src, src, filter}) / RUNS; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| bencher.set_param(param); | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({src, src, filter}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | |||||
| "pseudo float16: %.2fms %.2fGB/s " | |||||
| "speedup: " | |||||
| "%0.2f (fp16/fp32) \n", | |||||
| s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32, | |||||
| bandwidth * 4 / time_in_ms_fp32, time_in_ms_pseudo_fp16, | |||||
| bandwidth * 2 / time_in_ms_pseudo_fp16, | |||||
| time_in_ms_fp32 / time_in_ms_pseudo_fp16); | |||||
| }; | |||||
| // clang-format off | |||||
| for (size_t b : {32, 64}) | |||||
| for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) { | |||||
| run(b, 384, 32, 32, f, 1); | |||||
| run(b, 384, 64, 64, f, 1); | |||||
| } | |||||
| // clang-format on | |||||
| } | |||||
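For reference, the `bandwidth` factor computed in the benchmark above is `total_elems / 2^30 * 1e3`, i.e. Gi-elements scaled by milliseconds-per-second; multiplying by the element size in bytes and dividing by the measured milliseconds yields GiB/s. A worked instance with an assumed timing (the 5 ms figure below is not a real measurement):

```cpp
#include <cstdio>

int main() {
    // One of the benchmarked cases: batch 32, 384 channels, 32x32 map,
    // 31x31 depthwise filter, stride 1, same padding (so dst == src shape).
    double total_elems = 32.0 * 384 * 32 * 32     // src
                       + 384.0 * 31 * 31          // filter {384, 1, 1, 31, 31}
                       + 32.0 * 384 * 32 * 32;    // dst
    double factor = total_elems / (1024.0 * 1024 * 1024) * 1e3;
    double time_in_ms_fp32 = 5.0;  // assumed measurement for illustration
    // 4 bytes per float32 element:
    std::printf("%.2f GiB/s\n", factor * 4 / time_in_ms_fp32);
    return 0;
}
```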
| #endif | #endif | ||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -1093,8 +1093,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { | |||||
| run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode); | ||||
| // strided case | // strided case | ||||
| run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode); | ||||
| // dilated conv is supported in cuDNN since version 7.5.0 | |||||
| #if CUDNN_VERSION >= 7500 | |||||
| // dilated case | // dilated case | ||||
| run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode); | ||||
| #endif | |||||
| } | } | ||||
| } | } | ||||
| @@ -213,7 +213,7 @@ std::vector<BenchArgs> get_feat_model_args() { | |||||
| return args; | return args; | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| std::vector<BenchArgs> get_f16_feat_model_args() { | std::vector<BenchArgs> get_f16_feat_model_args() { | ||||
| std::vector<BenchArgs> args; | std::vector<BenchArgs> args; | ||||
| args.emplace_back(BenchArgs{128, 9216, 9216}); | args.emplace_back(BenchArgs{128, 9216, 9216}); | ||||
| @@ -367,7 +367,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #undef MEGDNN_FOREACH_CUTLASS_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_KERNEL | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | ||||
| cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ | cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ | ||||
| cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ | cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ | ||||
| @@ -403,7 +403,9 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #undef MEGDNN_FOREACH_CUTLASS_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_KERNEL | ||||
| #endif | |||||
| #if CUDA_VERSION >= 10020 | |||||
| #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | ||||
| cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \ | cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \ | ||||
| cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \ | cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \ | ||||
| @@ -454,7 +456,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) { | |||||
| dtype::Float32(), "CUTLASS_FLOAT32_SIMT"); | dtype::Float32(), "CUTLASS_FLOAT32_SIMT"); | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { | TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { | ||||
| benchmark_matrix_mul( | benchmark_matrix_mul( | ||||
| handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), | handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), | ||||