GitOrigin-RevId: 229e1eb4be
@@ -19,8 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
               epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-              need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
-              required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
+              special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \
+              without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind
@@ -34,7 +34,7 @@ class Conv2dOperation:
     self.element_epilogue = element_epilogue
     self.epilogue_functor = epilogue_functor
     self.swizzling_functor = swizzling_functor
-    self.need_load_from_const = need_load_from_const
+    self.special_optimization = special_optimization
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
     self.required_cuda_ver_major = required_cuda_ver_major
@@ -60,16 +60,18 @@ class Conv2dOperation:
     else:
       inst_shape = ''
-    unity_kernel = ''
-    if not self.need_load_from_const:
-      unity_kernel = '_1x1'
+    special_opt = ''
+    if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
+      special_opt = '_1x1'
+    elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
+      special_opt = '_s2'
     reorder_k = ''
     if self.without_shared_load:
       reorder_k = '_roc'
     return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
-        inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
+        inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \
         reorder_k, ShortEpilogueNames[self.epilogue_functor])
   #
@@ -183,7 +185,7 @@ using Convolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
     ${math_operator},
     ${implicit_gemm_mode},
     ${without_shared_load}>;
@@ -226,7 +228,7 @@ using Convolution =
       'stages': str(operation.tile_description.stages),
       'alignment_src': str(operation.src.alignment),
       'alignment_filter': str(operation.flt.alignment),
-      'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+      'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
       'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
       'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
       'without_shared_load': str(operation.without_shared_load).lower()
@@ -266,7 +268,7 @@ using Deconvolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
     ${math_operator},
     ${implicit_gemm_mode}>;
 """
@@ -308,7 +310,7 @@ using Deconvolution =
       'stages': str(operation.tile_description.stages),
       'alignment_src': str(operation.src.alignment),
       'alignment_filter': str(operation.flt.alignment),
-      'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+      'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
       'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
       'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
     }
@@ -323,9 +325,9 @@ using Deconvolution =
 ###################################################################################################
 #
-def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
-                   required_cuda_ver_minor = 2):
+def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \
+                   use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+                   required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
   operations = []
   element_epilogue = DataType.f32
@@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
       bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
       dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
       operations.append(new_operation)
-      if not skip_unity_kernel:
-        new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+      if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
+        new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
         operations.append(new_operation)
   return operations
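A minimal, self-contained sketch of the calling convention introduced above, for orientation. The generate() helper and the tuple representation are illustrative stand-ins rather than code from this patch; only the control flow mirrors GenerateConv2d: every tile always yields the generic kernel, and a specialized variant is emitted in addition only when something other than NoneSpecialOpt is requested.

import enum

class SpecialOptimizeDesc(enum.Enum):
    NoneSpecialOpt = 0
    ConvFilterUnity = 1
    DeconvDoubleUpsampling = 2

def generate(tile_descriptions, use_special_optimization=SpecialOptimizeDesc.NoneSpecialOpt):
    # Mirrors GenerateConv2d: the plain kernel is always emitted; the
    # specialized variant is appended only on request.
    operations = []
    for tile in tile_descriptions:
        operations.append((tile, SpecialOptimizeDesc.NoneSpecialOpt))
        if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
            operations.append((tile, use_special_optimization))
    return operations

print(generate(["128x128x32"], SpecialOptimizeDesc.ConvFilterUnity))
# -> both the generic kernel and its 1x1-filter variant for the same tile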
@@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args):
       for dst_type, dst_layout in zip(dst_types, dst_layouts):
         if dst_type == DataType.s4 or dst_type == DataType.u4:
           min_cc = 75
-          skip_unity_kernel = True
+          use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt
         else:
           min_cc = 61
-          skip_unity_kernel = False
+          use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
         tile_descriptions = [
           TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
           TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -182,10 +182,16 @@ def GenerateConv2d_Simt(args):
           TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
           TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
           TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
-        ]
-        operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                     dst_layout, dst_type, min_cc, 32, 32, 32,
-                                     skip_unity_kernel)
+        ]
+        for tile in tile_descriptions:
+          if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32:
+            continue
+          if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \
+              and tile.threadblock_shape[0] > 16:
+            continue
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                       dst_layout, dst_type, min_cc, 32, 32, 32,
+                                       use_special_optimization)
   return operations
@@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args):
     DataType.s8,
   ]
+
+  use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
   min_cc = 75
   max_cc = 1024
@@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args):
            TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
            TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
            TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
-           TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
-           TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
          ]
          operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                      dst_layout, dst_type, min_cc, 128, 128, 64,
-                                      False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                      dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
+                                      ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
        else:
          assert dst_layout == LayoutType.TensorNC4HW4
          tile_descriptions = [
-           TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
-           TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
-           TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
-           TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
            TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
-           TileDescription([128, 64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
-           TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
-           TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
            TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
          ]
          operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                      dst_layout, dst_type, min_cc, 128, 128, 64,
-                                      False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+                                      dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
+                                      ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+
+  layouts_nhwc = [
+    (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
+    (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
+    (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
+  ]
+
+  dst_layouts_nhwc = [
+    LayoutType.TensorNHWC,
+  ]
+
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        dst_type = math_inst.element_b
+        tile_descriptions = [
+          TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+        ]
+        for tile in tile_descriptions:
+          dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                       dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                       ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+          if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32:
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                         ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+  out_dtypes = [DataType.s4, DataType.u4, DataType.f32]
+
+  #INT8x8x4 and INT8x8x32
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        for out_dtype in out_dtypes:
+          tile_descriptions = [
+            TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+            TileDescription([64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+          ]
+          for tile in tile_descriptions:
+            dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 or out_dtype == DataType.f32 \
+                        else 8 * DataTypeSize[out_dtype]
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                         ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+            if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32 and out_dtype != DataType.f32):
+              operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                           out_dtype, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                           ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   return operations
@@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args):
     LayoutType.TensorNC64HW64,
   ]
+
+  use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
   min_cc = 75
   max_cc = 1024
@@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args):
          TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
        ]
        operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                    dst_layout, dst_type, min_cc, 128, 128, 64,
-                                    False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                    dst_layout, dst_type, min_cc, 128, 128, 64, use_special_optimization,
+                                    ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   layouts_nhwc = [
     (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
@@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args):
       for dst_layout in dst_layouts_nhwc:
         dst_type = math_inst.element_b
         tile_descriptions = [
+          TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
           TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
           TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
         ]
         for tile in tile_descriptions:
-          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
-                                       dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
-                                       False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+          dst_align = 16 if tile.threadblock_shape[1] == 16 else 32
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                       dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                       ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
           if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
             dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
-            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
-                                         dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align,
-                                         False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         dst_type, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                         ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+  # INT4x4x8
+  for math_inst in math_instructions:
+    for layout in layouts_nhwc:
+      for dst_layout in dst_layouts_nhwc:
+        tile_descriptions = [
+          TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+          TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+        ]
+        for tile in tile_descriptions:
+          dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+          operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                       DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                       ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+          if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
+            dst_align = 64 if tile.threadblock_shape[1] == 32 else 128
+            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1], dst_layout,
+                                         DataType.s8, min_cc, layout[2], layout[2], dst_align, use_special_optimization,
+                                         ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   return operations
@@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args):
     DataType.s8,
   ]
+
+  use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling
   min_cc = 61
   max_cc = 1024
@@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args):
     for layout in layouts:
       for dst_type, dst_layout in zip(dst_types, dst_layouts):
        tile_descriptions = [
-         TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
          TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
          TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
          TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
@@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args):
        ]
        operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
                                     dst_layout, dst_type, min_cc, 32, 32, 32,
-                                    True)
+                                    use_special_optimization)
   return operations
 ################################################################################
@@ -562,6 +562,24 @@ StrideSupportNames = {
   StrideSupport.Unity: 'unity_stride',
 }
+
+class SpecialOptimizeDesc(enum.Enum):
+  NoneSpecialOpt = enum_auto()
+  ConvFilterUnity = enum_auto()
+  DeconvDoubleUpsampling = enum_auto()
+
+SpecialOptimizeDescNames = {
+  SpecialOptimizeDesc.NoneSpecialOpt: 'none',
+  SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity',
+  SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling',
+}
+
+SpecialOptimizeDescTag = {
+  SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
+  SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
+  SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
+}
+
 class ImplicitGemmMode(enum.Enum):
   GemmNT = enum_auto()
   GemmTN = enum_auto()
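A hedged, standalone illustration of how the new enum ties the pieces of this patch together: ConvFilterUnity and DeconvDoubleUpsampling select the '_1x1' and '_s2' kernel-name suffixes used by Conv2dOperation above, while SpecialOptimizeDescTag supplies the C++ enumerator substituted into the ${special_optimization} template slot. SpecialOptimizeSuffix is an illustrative helper for this sketch only, not a dict defined in the patch.

import enum

class SpecialOptimizeDesc(enum.Enum):
    NoneSpecialOpt = 0
    ConvFilterUnity = 1
    DeconvDoubleUpsampling = 2

# Kernel-name suffix (mirrors the special_opt logic in Conv2dOperation).
SpecialOptimizeSuffix = {
    SpecialOptimizeDesc.NoneSpecialOpt: '',
    SpecialOptimizeDesc.ConvFilterUnity: '_1x1',
    SpecialOptimizeDesc.DeconvDoubleUpsampling: '_s2',
}

# C++ enumerator emitted into the kernel template (same values as above).
SpecialOptimizeDescTag = {
    SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
    SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
    SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
}

opt = SpecialOptimizeDesc.DeconvDoubleUpsampling
print(SpecialOptimizeSuffix[opt])   # '_s2', e.g. cutlass_simt_s8_idgrad_s2_id_...
print(SpecialOptimizeDescTag[opt])  # value bound to ${special_optimization}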
@@ -455,11 +455,14 @@ cutlass_gen_list = [
     "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
     "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
     "cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
-    "cutlass_simt_s8_idgrad_id_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu",
     "cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
+    "cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
     "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
+    "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
     "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
+    "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
     "cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
+    "cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
     "all_deconv_simt_operations.cu",
     "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
     "cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
@@ -515,30 +518,6 @@ cutlass_gen_list = [
     "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
     "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
     "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
-    "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
     "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
     "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
     "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
@@ -551,108 +530,18 @@ cutlass_gen_list = [
     "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
     "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
     "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
-    "cutlass_simt_u4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
     "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
-    "cutlass_simt_f32_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
-    "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu",
     "cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
     "cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
     "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
@@ -708,72 +597,288 @@ cutlass_gen_list = [
     "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
     "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
-    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
     "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
+    "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
+    "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
| "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu", | |||||
| "all_conv2d_tensorop8816_operations.cu", | "all_conv2d_tensorop8816_operations.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu", | "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu", | ||||
| @@ -815,6 +920,12 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| @@ -839,6 +950,12 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| @@ -863,6 +980,12 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| @@ -887,6 +1010,10 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | ||||
| @@ -903,6 +1030,10 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | ||||
| @@ -919,6 +1050,10 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | ||||
| @@ -935,5 +1070,155 @@ cutlass_gen_list = [ | |||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | ||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu", | |||||
| "all_conv2d_tensorop8832_operations.cu", | "all_conv2d_tensorop8832_operations.cu", | ||||
| ] | ] | ||||
| @@ -553,7 +553,10 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src, | |||||
| dst.valid() && (dst.enumv() == src.enumv() || | dst.valid() && (dst.enumv() == src.enumv() || | ||||
| ((dst.enumv() == DTypeEnum::QuantizedS4 || | ((dst.enumv() == DTypeEnum::QuantizedS4 || | ||||
| dst.enumv() == DTypeEnum::Quantized4Asymm) && | dst.enumv() == DTypeEnum::Quantized4Asymm) && | ||||
| src.enumv() == DTypeEnum::QuantizedS8)); | |||||
| src.enumv() == DTypeEnum::QuantizedS8) || | |||||
| ((src.enumv() == DTypeEnum::QuantizedS4 || | |||||
| src.enumv() == DTypeEnum::Quantized4Asymm) && | |||||
| dst.enumv() == DTypeEnum::QuantizedS8)); | |||||
| if (cond_dst) { | if (cond_dst) { | ||||
| supported_dst_dtype.push_back(dst); | supported_dst_dtype.push_back(dst); | ||||
| } | } | ||||
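Editor's note: the widened dst-dtype condition above is easier to read as a symmetric predicate. Below is a minimal standalone sketch of the same dtype rule; the helper name is hypothetical and not part of this patch.

    // Hypothetical helper equivalent to the dtype part of cond_dst after this
    // change: dst may equal src, or be a quantized 4-bit type when src is
    // QuantizedS8, or be QuantizedS8 when src is a quantized 4-bit type.
    static bool fwd_dtype_pair_allowed(DTypeEnum src, DTypeEnum dst) {
        auto is_q4 = [](DTypeEnum v) {
            return v == DTypeEnum::QuantizedS4 ||
                   v == DTypeEnum::Quantized4Asymm;
        };
        return dst == src || (is_q4(dst) && src == DTypeEnum::QuantizedS8) ||
               (is_q4(src) && dst == DTypeEnum::QuantizedS8);
    }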
| @@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& algo : int8_nchw32_imma) { | for (auto&& algo : int8_nchw32_imma) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| for (auto&& algo : int8_nhwc_imma) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| for (auto&& algo : int4_int4_nchw64_imma) { | for (auto&& algo : int4_int4_nchw64_imma) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| @@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { | |||||
| int8_nchw32_imma.emplace_back( | int8_nchw32_imma.emplace_back( | ||||
| AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1}); | AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1}); | ||||
| } | } | ||||
| { | |||||
| using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam; | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16}); | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8}); | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4}); | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16}); | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8}); | |||||
| int8_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4}); | |||||
| } | |||||
| { | { | ||||
| using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam; | using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam; | ||||
| int4_int4_nchw64_imma.emplace_back( | int4_int4_nchw64_imma.emplace_back( | ||||
| @@ -261,6 +278,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { | |||||
| } | } | ||||
| { | { | ||||
| using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam; | using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam; | ||||
| int4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); | |||||
| int4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); | |||||
| int4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); | |||||
| int4_int4_nhwc_imma.emplace_back( | int4_int4_nhwc_imma.emplace_back( | ||||
| AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); | AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); | ||||
| int4_int4_nhwc_imma.emplace_back( | int4_int4_nhwc_imma.emplace_back( | ||||
| @@ -276,6 +299,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { | |||||
| } | } | ||||
| { | { | ||||
| using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam; | using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam; | ||||
| uint4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32}); | |||||
| uint4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16}); | |||||
| uint4_int4_nhwc_imma.emplace_back( | |||||
| AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8}); | |||||
| uint4_int4_nhwc_imma.emplace_back( | uint4_int4_nhwc_imma.emplace_back( | ||||
| AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); | AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32}); | ||||
| uint4_int4_nhwc_imma.emplace_back( | uint4_int4_nhwc_imma.emplace_back( | ||||
| @@ -72,6 +72,7 @@ public: | |||||
| CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8, | CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8, | ||||
| CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8, | CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8, | ||||
| CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8, | CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8, | ||||
| CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8, | |||||
| CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4, | CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4, | ||||
| CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4, | CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4, | ||||
| CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4, | CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4, | ||||
| @@ -524,6 +525,7 @@ public: | |||||
| * + | * + | ||||
| * +--- AlgoInt8NCHW4DotProdImplicitGemm | * +--- AlgoInt8NCHW4DotProdImplicitGemm | ||||
| * +--- AlgoInt8NCHW32IMMAImplicitGemm | * +--- AlgoInt8NCHW32IMMAImplicitGemm | ||||
| * +--- AlgoInt8NHWCIMMAImplicitGemm | |||||
| * + | * + | ||||
| * +--- AlgoInt4NCHW64IMMAImplicitGemmBase | * +--- AlgoInt4NCHW64IMMAImplicitGemmBase | ||||
| * +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm | * +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm | ||||
| @@ -582,7 +584,7 @@ public: | |||||
| // operation (cutlass kernel) from the global OperationTable | // operation (cutlass kernel) from the global OperationTable | ||||
| const cutlass::library::Operation* get_cutlass_conv_op( | const cutlass::library::Operation* get_cutlass_conv_op( | ||||
| const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, | const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, | ||||
| bool load_from_const, bool without_shared_load) const; | |||||
| bool use_conv_filter_unity_opt, bool without_shared_load) const; | |||||
| // execute the cutlass kernel found by get_cutlass_conv_op. we give | // execute the cutlass kernel found by get_cutlass_conv_op. we give | ||||
| // subclasses full freedom to decide where and how these arguments are | // subclasses full freedom to decide where and how these arguments are | ||||
| @@ -829,6 +831,47 @@ private: | |||||
| std::string m_name; | std::string m_name; | ||||
| }; | }; | ||||
| class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final | |||||
| : public AlgoCutlassConvolutionBase { | |||||
| public: | |||||
| AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param) | |||||
| : AlgoCutlassConvolutionBase(algo_param) { | |||||
| m_name = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s", | |||||
| to_string(m_algo_param).c_str()), | |||||
| ConvBias::DirectParam{}); | |||||
| } | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); } | |||||
| AlgoAttribute attribute() const override { | |||||
| return AlgoAttribute::REPRODUCIBLE; | |||||
| } | |||||
| static std::string to_string(AlgoParam algo_param); | |||||
| size_t get_preprocess_workspace_in_bytes( | |||||
| const SizeArgs& args) const override; | |||||
| SmallVector<TensorLayout> deduce_preprocessed_filter_layout( | |||||
| const SizeArgs& args) const override; | |||||
| void exec_preprocess(const ExecArgs& args) const override; | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8) | |||||
| std::string param() const override { | |||||
| std::string ret; | |||||
| serialize_write_pod(m_algo_param, ret); | |||||
| return ret; | |||||
| } | |||||
| private: | |||||
| std::tuple<float, float, float, float, float> get_constants( | |||||
| const ExecArgs& args) const; | |||||
| void reorder_filter(const ExecArgs& args, int interleaved, | |||||
| void* reordered_filter) const; | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase | class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase | ||||
| : public AlgoCutlassConvolutionBase { | : public AlgoCutlassConvolutionBase { | ||||
| public: | public: | ||||
| @@ -1087,6 +1130,7 @@ public: | |||||
| #endif | #endif | ||||
| #if CUDA_VERSION >= 10020 | #if CUDA_VERSION >= 10020 | ||||
| std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma; | std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma; | ||||
| std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma; | |||||
| std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma; | std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma; | ||||
| std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma; | std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma; | ||||
| std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | ||||
| @@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, | |||||
| LayoutTypeID::kTensorNC64HW64}; | LayoutTypeID::kTensorNC64HW64}; | ||||
| case Format::NHWC: | case Format::NHWC: | ||||
| switch (access_type) { | switch (access_type) { | ||||
| case 4: | |||||
| return {LayoutTypeID::kTensorNHWC, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| LayoutTypeID::kTensorNHWC, | |||||
| LayoutTypeID::kTensorNHWC}; | |||||
| case 8: | case 8: | ||||
| return {LayoutTypeID::kTensorNHWC, | return {LayoutTypeID::kTensorNHWC, | ||||
| LayoutTypeID::kTensorNC8HW8, | LayoutTypeID::kTensorNC8HW8, | ||||
| @@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, | |||||
| const Operation* | const Operation* | ||||
| ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | ||||
| const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, | const SizeArgs& args, ConvOperator conv_op, ConvType conv_type, | ||||
| bool load_from_const, bool without_shared_load) const { | |||||
| using Format = param::ConvBias::Format; | |||||
| bool use_conv_filter_unity_opt, bool without_shared_load) const { | |||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | ||||
| auto epilogue_type = get_epilogue_type(param.nonlineMode, | |||||
| param.format != Format::NCHW4_NCHW); | |||||
| auto epilogue_type = get_epilogue_type( | |||||
| param.nonlineMode, | |||||
| args.dst_layout->dtype.enumv() != DTypeEnum::Float32); | |||||
| cutlass::conv::SpecialOptimizeDesc special_optimization = | |||||
| (use_conv_filter_unity_opt) | |||||
| ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY | |||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | |||||
| ConvolutionKey key{convert_conv_op(conv_op), | ConvolutionKey key{convert_conv_op(conv_op), | ||||
| convert_dtype(args.src_layout->dtype.enumv()), | convert_dtype(args.src_layout->dtype.enumv()), | ||||
| layouts.src, | layouts.src, | ||||
| @@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | |||||
| m_algo_param.instruction_k, | m_algo_param.instruction_k, | ||||
| epilogue_type, | epilogue_type, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| load_from_const, | |||||
| special_optimization, | |||||
| without_shared_load}; | without_shared_load}; | ||||
| return Singleton::get().operation_table.find_op(key); | return Singleton::get().operation_table.find_op(key); | ||||
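Editor's note: the boolean use_conv_filter_unity_opt is translated into the SpecialOptimizeDesc enum before it enters the ConvolutionKey, so the operation table can distinguish the unity-filter kernel variants from the general ones. A hypothetical call from a subclass is sketched below; the 1x1 condition is an assumption for illustration, not code from this patch.

    // Hypothetical caller: request the unity-filter variant only for 1x1 filters.
    bool use_conv_filter_unity_opt = (FH == 1 && FW == 1);
    const auto* op = get_cutlass_conv_op(args, conv_op, conv_type,
                                         use_conv_filter_unity_opt,
                                         /* without_shared_load */ false);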
| @@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter( | |||||
| IC, FH, FW, trans_oc); | IC, FH, FW, trans_oc); | ||||
| after_kernel_launch(); | after_kernel_launch(); | ||||
| } | } | ||||
| template <uint32_t size_bits, uint32_t alignbits> | |||||
| template <uint32_t size_bits> | |||||
| void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter( | void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter( | ||||
| int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC, | int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC, | ||||
| uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved, | |||||
| cudaStream_t stream) { | |||||
| static constexpr uint32_t elements_per_access = alignbits / size_bits; | |||||
| uint32_t nr_threads = | |||||
| query_blocksize_for_kernel(reinterpret_cast<const void*>( | |||||
| reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>)); | |||||
| uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits, | |||||
| uint32_t interleaved, cudaStream_t stream) { | |||||
| const uint32_t elements_per_access = alignbits / size_bits; | |||||
| void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t, | |||||
| uint32_t, uint32_t, uint32_t, bool); | |||||
| kern = nullptr; | |||||
| auto get_kern = [&kern](const uint32_t alignbits, | |||||
| const uint32_t interleaved) { | |||||
| #define DISPATCH_KERNEL(alignbits_, interleaved_) \ | |||||
| if (alignbits == alignbits_ && interleaved == interleaved_) { \ | |||||
| kern = reorder_nhwc_imma_filter_kernel<size_bits, alignbits_, \ | |||||
| interleaved_>; \ | |||||
| return; \ | |||||
| } | |||||
| DISPATCH_KERNEL(128, 16); | |||||
| DISPATCH_KERNEL(64, 16); | |||||
| DISPATCH_KERNEL(32, 16); | |||||
| DISPATCH_KERNEL(128, 32); | |||||
| DISPATCH_KERNEL(64, 32); | |||||
| DISPATCH_KERNEL(32, 32); | |||||
| DISPATCH_KERNEL(128, 64); | |||||
| DISPATCH_KERNEL(64, 64); | |||||
| DISPATCH_KERNEL(32, 64); | |||||
| #undef DISPATCH_KERNEL | |||||
| }; | |||||
| get_kern(alignbits, interleaved); | |||||
| uint32_t nr_threads = query_blocksize_for_kernel(kern); | |||||
| uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access); | uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access); | ||||
| nr_threads = std::min(nr_threads, vthreads); | nr_threads = std::min(nr_threads, vthreads); | ||||
| uint32_t nr_blocks = DIVUP(vthreads, nr_threads); | uint32_t nr_blocks = DIVUP(vthreads, nr_threads); | ||||
| if (oc_interleaved == 32) { | |||||
| reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32> | |||||
| <<<nr_blocks, nr_threads, 0, stream>>>( | |||||
| dst_filter, src_filter, OC, IC, FH, FW, trans_oc); | |||||
| } else { | |||||
| reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 64> | |||||
| <<<nr_blocks, nr_threads, 0, stream>>>( | |||||
| dst_filter, src_filter, OC, IC, FH, FW, trans_oc); | |||||
| } | |||||
| kern<<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter, OC, IC, | |||||
| FH, FW, trans_oc); | |||||
| after_kernel_launch(); | after_kernel_launch(); | ||||
| } | } | ||||
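Editor's note: with the alignment now passed at run time, a single template instantiation per element width covers every layout handled by DISPATCH_KERNEL. A hypothetical call site for the new signature (values are illustrative only):

    // Reorder a 4-bit NHWC filter using 64-bit aligned accesses and a
    // 32-channel interleaving, one of the DISPATCH_KERNEL combinations above.
    megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<4>(
            dst_filter, src_filter, OC, IC, FH, FW, /* trans_oc */ true,
            /* alignbits */ 64, /* interleaved */ 32, stream);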
| @@ -180,15 +200,14 @@ INST(8, 32) | |||||
| INST(4, 64) | INST(4, 64) | ||||
| #undef INST | #undef INST | ||||
| #define INST(_size_bits, _alignbits) \ | |||||
| template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \ | |||||
| _size_bits, _alignbits>( \ | |||||
| int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \ | |||||
| uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \ | |||||
| uint32_t oc_interleaved, cudaStream_t stream); | |||||
| INST(4, 32) | |||||
| INST(4, 64) | |||||
| INST(4, 128) | |||||
| #define INST(_size_bits) \ | |||||
| template void \ | |||||
| megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>( \ | |||||
| int8_t * dst_filter, const int8_t* src_filter, uint32_t OC, \ | |||||
| uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc, \ | |||||
| uint32_t alignbits, uint32_t interleaved, cudaStream_t stream); | |||||
| INST(4) | |||||
| INST(8) | |||||
| #undef INST | #undef INST | ||||
| // vim: syntax=cuda.doxygen | // vim: syntax=cuda.doxygen | ||||
| @@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter, | |||||
| uint32_t FW, bool trans_oc, | uint32_t FW, bool trans_oc, | ||||
| cudaStream_t stream); | cudaStream_t stream); | ||||
| template <uint32_t size_bits, uint32_t alignbits> | |||||
| template <uint32_t size_bits> | |||||
| void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter, | void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter, | ||||
| uint32_t OC, uint32_t IC, uint32_t FH, | uint32_t OC, uint32_t IC, uint32_t FH, | ||||
| uint32_t FW, bool trans_oc, | |||||
| uint32_t oc_interleaved, cudaStream_t stream); | |||||
| uint32_t FW, bool trans_oc, uint32_t alignbits, | |||||
| uint32_t interleaved, cudaStream_t stream); | |||||
| } // namespace cutlass_wrapper | } // namespace cutlass_wrapper | ||||
| } // namespace cuda | } // namespace cuda | ||||
| } // namespace megdnn | } // namespace megdnn | ||||
| @@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants( | |||||
| args.filter_layout->dtype.param<dtype::QuantizedS4>().scale, | args.filter_layout->dtype.param<dtype::QuantizedS4>().scale, | ||||
| bias_scale = | bias_scale = | ||||
| args.bias_layout->dtype.param<dtype::QuantizedS32>().scale, | args.bias_layout->dtype.param<dtype::QuantizedS32>().scale, | ||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| dst_scale; | |||||
| if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| } else { // DTypeEnum::QuantizedS8 | |||||
| megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } | |||||
| float alpha = src_scale * filter_scale / dst_scale, | float alpha = src_scale * filter_scale / dst_scale, | ||||
| beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f; | beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f; | ||||
| if (args.z_layout->ndim > 0) { | if (args.z_layout->ndim > 0) { | ||||
| float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| float z_scale; | |||||
| if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { | |||||
| z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| } else { // DTypeEnum::QuantizedS8 | |||||
| megdnn_assert(args.z_layout->dtype.enumv() == | |||||
| DTypeEnum::QuantizedS8); | |||||
| z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } | |||||
| gamma = z_scale / dst_scale; | gamma = z_scale / dst_scale; | ||||
| } | } | ||||
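As a side note on the scale factors computed in this hunk: they implement a plain requantization of the int32 accumulator. A self-contained sketch of the same arithmetic (the struct and helper name are hypothetical, not part of the patch):

    #include <cassert>

    // dst_q ~= alpha * acc + beta * bias_q + gamma * z_q, where acc is the
    // int32 accumulator of src_q * filter_q.
    struct EpilogueScales {
        float alpha, beta, gamma;
    };

    inline EpilogueScales make_scales(float src_scale, float filter_scale,
                                      float bias_scale, float z_scale,
                                      float dst_scale) {
        assert(dst_scale > 0.f);
        return {src_scale * filter_scale / dst_scale,  // alpha
                bias_scale / dst_scale,                // beta
                z_scale / dst_scale};                  // gamma
    }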
| @@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available( | |||||
| if (fh * fw > kMaxFilterPixels) | if (fh * fw > kMaxFilterPixels) | ||||
| return false; | return false; | ||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = true; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| if (op == nullptr) | |||||
| return false; | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( | |||||
| float dst_scale = 0.f; | float dst_scale = 0.f; | ||||
| float threshold = 0.f; | float threshold = 0.f; | ||||
| uint8_t src_zero = 0; | uint8_t src_zero = 0; | ||||
| bool load_from_const = !(fh == 1 && fw == 1); | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = true; | bool without_shared_load = true; | ||||
| if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | ||||
| @@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec( | |||||
| const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | ||||
| ConvType::kConvolution, | ConvType::kConvolution, | ||||
| load_from_const, without_shared_load); | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, | execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, | ||||
| z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, | z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, | ||||
| @@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( | |||||
| if (args.src_layout->dtype.enumv() != src_dtype() || | if (args.src_layout->dtype.enumv() != src_dtype() || | ||||
| args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || | args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 || | ||||
| args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 || | |||||
| args.dst_layout->dtype.enumv() != src_dtype()) | |||||
| args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32) | |||||
| return false; | |||||
| if (!(args.dst_layout->dtype.enumv() == src_dtype() || | |||||
| args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8)) | |||||
| return false; | return false; | ||||
| // uint4 does not support H_SWISH activation | // uint4 does not support H_SWISH activation | ||||
| @@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available( | |||||
| if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0)) | if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0)) | ||||
| return false; | return false; | ||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && | |||||
| (m_algo_param.threadblock_n == 32 || | |||||
| m_algo_param.threadblock_n == 64)); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| if (op == nullptr) | |||||
| return false; | |||||
| return true; | return true; | ||||
| } | } | ||||
| @@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec( | |||||
| float dst_scale = 0.f; | float dst_scale = 0.f; | ||||
| float threshold = 0.f; | float threshold = 0.f; | ||||
| uint8_t src_zero = 0; | uint8_t src_zero = 0; | ||||
| bool load_from_const = !(fh == 1 && fw == 1); | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && | bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && | ||||
| (m_algo_param.threadblock_n == 32 || | (m_algo_param.threadblock_n == 32 || | ||||
| m_algo_param.threadblock_n == 64)); | m_algo_param.threadblock_n == 64)); | ||||
| if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } | |||||
| if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | ||||
| dst_scale = | dst_scale = | ||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | ||||
| src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } else { // DTypeEnum::QuantizedS4 | |||||
| } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | ||||
| } else { // DTypeEnum::QuantizedS8 | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } | } | ||||
| cudaStream_t stream = cuda_stream(args.opr->handle()); | cudaStream_t stream = cuda_stream(args.opr->handle()); | ||||
| const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | ||||
| ConvType::kConvolution, | ConvType::kConvolution, | ||||
| load_from_const, without_shared_load); | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, | execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, | ||||
| z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, | z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, | ||||
| @@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter( | |||||
| cudaStream_t stream = cuda_stream(args.opr->handle()); | cudaStream_t stream = cuda_stream(args.opr->handle()); | ||||
| // reformat filter from nhwc to ncxhwx and reorder oc | // reformat filter from nhwc to ncxhwx and reorder oc | ||||
| // use trans_oc threadblock_n must be 32 or 64 | |||||
| // to use trans_oc, threadblock_n must be 32 or 64 and src dtype == dest dtype | |||||
| bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && | bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && | ||||
| (m_algo_param.threadblock_n == 32 || | (m_algo_param.threadblock_n == 32 || | ||||
| m_algo_param.threadblock_n == 64)); | m_algo_param.threadblock_n == 64)); | ||||
| uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32; | |||||
| if (iterleaved == 8) { | |||||
| cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>( | |||||
| reinterpret_cast<int8_t*>(reordered_filter), | |||||
| reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, | |||||
| fh, fw, trans_oc, oc_iterleave, stream); | |||||
| } else if (iterleaved == 16) { | |||||
| cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>( | |||||
| reinterpret_cast<int8_t*>(reordered_filter), | |||||
| reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, | |||||
| fh, fw, trans_oc, oc_iterleave, stream); | |||||
| } else { | |||||
| megdnn_assert(iterleaved == 32); | |||||
| cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>( | |||||
| reinterpret_cast<int8_t*>(reordered_filter), | |||||
| reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, | |||||
| fh, fw, trans_oc, oc_iterleave, stream); | |||||
| } | |||||
| uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32; | |||||
| uint32_t alignbits = iterleaved * 4; | |||||
| cutlass_wrapper::reorder_nhwc_imma_filter<4>( | |||||
| reinterpret_cast<int8_t*>(reordered_filter), | |||||
| reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, | |||||
| fw, trans_oc, alignbits, oc_iterleaved, stream); | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( | |||||
| // FIXME: too large filter size is not supported now | // FIXME: too large filter size is not supported now | ||||
| size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2; | size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2; | ||||
| available &= fh * fw <= kMaxFilterPixels; | available &= fh * fw <= kMaxFilterPixels; | ||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = (param.format == Format::NCHW32); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| available &= (op != nullptr); | |||||
| return available; | return available; | ||||
| } | } | ||||
| @@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( | |||||
| gamma = z_scale / dst_scale; | gamma = z_scale / dst_scale; | ||||
| } | } | ||||
| float delta = 0.f, theta = 0.f, threshold = 0.f; | float delta = 0.f, theta = 0.f, threshold = 0.f; | ||||
| bool load_from_const = !(fh == 1 && fw == 1); | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = (param.format == Format::NCHW32); | bool without_shared_load = (param.format == Format::NCHW32); | ||||
| const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | ||||
| ConvType::kConvolution, | ConvType::kConvolution, | ||||
| load_from_const, without_shared_load); | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| execute_cutlass_conv_op( | execute_cutlass_conv_op( | ||||
| op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, | op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, | ||||
| @@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( | |||||
| // FIXME: too large filter size is not supported now | // FIXME: too large filter size is not supported now | ||||
| size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2; | size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2; | ||||
| available &= fh * fw <= kMaxFilterPixels; | available &= fh * fw <= kMaxFilterPixels; | ||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = false; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| available &= (op != nullptr); | |||||
| return available; | return available; | ||||
| } | } | ||||
| @@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
| } | } | ||||
| } | } | ||||
| float threshold = 0.f; | float threshold = 0.f; | ||||
| bool load_from_const = !(fh == 1 && fw == 1); | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = false; | bool without_shared_load = false; | ||||
| const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop, | |||||
| ConvType::kConvolution, | |||||
| load_from_const, without_shared_load); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| execute_cutlass_conv_op( | execute_cutlass_conv_op( | ||||
| op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, | op, args.src_tensor->raw_ptr, filter_ptr, args.bias_tensor->raw_ptr, | ||||
| @@ -0,0 +1,294 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/conv_bias/cutlass_reorder_filter.cuh" | |||||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace convolution; | |||||
| #if CUDA_VERSION >= 10020 | |||||
| bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| if (args.bias_layout->ndim <= 0) | |||||
| return false; | |||||
| using Param = param::ConvBias; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| using NonlineMode = megdnn::param::ConvBias::NonlineMode; | |||||
| auto&& param = args.opr->param(); | |||||
| if (!check_bias_share_in_channel(*(args.bias_layout), param.format)) | |||||
| return false; | |||||
| if (param.format != Format::NHWC || param.sparse != Sparse::DENSE || | |||||
| param.mode != Mode::CROSS_CORRELATION) | |||||
| return false; | |||||
| if (param.nonlineMode != NonlineMode::IDENTITY && | |||||
| param.nonlineMode != NonlineMode::RELU && | |||||
| param.nonlineMode != NonlineMode::H_SWISH) | |||||
| return false; | |||||
| if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 || | |||||
| args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8) | |||||
| return false; | |||||
| auto dst_dtype = args.dst_layout->dtype.enumv(); | |||||
| if (!(dst_dtype == DTypeEnum::QuantizedS8 || | |||||
| dst_dtype == DTypeEnum::QuantizedS4 || | |||||
| dst_dtype == DTypeEnum::Quantized4Asymm || | |||||
| dst_dtype == DTypeEnum::Float32)) | |||||
| return false; | |||||
| if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 || | |||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| dst_dtype == DTypeEnum::Float32))) | |||||
| return false; | |||||
| if (!is_compute_capability_required(7, 5)) | |||||
| return false; | |||||
| size_t co = args.filter_layout->operator[](0), | |||||
| ci = args.filter_layout->operator[](3), | |||||
| fh = args.filter_layout->operator[](1), | |||||
| fw = args.filter_layout->operator[](2); | |||||
| // param buffer size is 4KB; use ~3.4KB of it to store precomputed offsets | |||||
| size_t kMaxFilterPixels = | |||||
| 848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1; | |||||
| if (fh * fw > kMaxFilterPixels) | |||||
| return false; | |||||
| // co should be aligned with 4, and ci should be aligned with | |||||
| // algo_param.access_size | |||||
| if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0)) | |||||
| return false; | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && | |||||
| (m_algo_param.threadblock_n == 16 || | |||||
| (m_algo_param.threadblock_n == 32 && | |||||
| dst_dtype != DTypeEnum::Float32))); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| if (op == nullptr) | |||||
| return false; | |||||
| return true; | |||||
| } | |||||
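A worked instance of the `kMaxFilterPixels` bound checked above, assuming (per the in-file comment) that 848 is the number of 4-byte precomputed offsets that fit in roughly 3.4KB of the 4KB kernel-parameter buffer; the warp_k/access_size values are illustrative only:

    // 848 * 4 bytes ~= 3.4 KB of the 4 KB parameter buffer
    // e.g. warp_k = 32, access_size = 16:
    //     kMaxFilterPixels = 848 / (32 / 16) - 1 = 423
    // so filters with fh * fw <= 423 pass this check.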
| size_t | |||||
| ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| if (args.preprocessed_filter) { | |||||
| return 0; | |||||
| } else { | |||||
| return args.filter_layout->span().dist_byte(); | |||||
| } | |||||
| } | |||||
| size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: | |||||
| get_preprocess_workspace_in_bytes(const SizeArgs& args) const { | |||||
| return 0; | |||||
| } | |||||
| SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm:: | |||||
| deduce_preprocessed_filter_layout(const SizeArgs& args) const { | |||||
| return {args.filter_layout->collapse_contiguous()}; | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess( | |||||
| const ExecArgs& args) const { | |||||
| void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; | |||||
| reorder_filter(args, m_algo_param.access_size, filter_ptr); | |||||
| } | |||||
| std::tuple<float, float, float, float, float> | |||||
| ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants( | |||||
| const ExecArgs& args) const { | |||||
| float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale, | |||||
| filter_scale = | |||||
| args.filter_layout->dtype.param<dtype::QuantizedS8>().scale, | |||||
| bias_scale = 1.f, dst_scale; | |||||
| if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) { | |||||
| bias_scale = args.bias_layout->dtype.param<dtype::QuantizedS32>().scale; | |||||
| } | |||||
| uint8_t dst_zero = 0; | |||||
| if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| } else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| dst_scale = | |||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } else { // DTypeEnum::Float32 | |||||
| megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| dst_scale = 1.f; | |||||
| } | |||||
| float alpha = src_scale * filter_scale / dst_scale, | |||||
| beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, | |||||
| theta = dst_zero; | |||||
| if (args.z_layout->ndim > 0) { | |||||
| float z_scale; | |||||
| if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) { | |||||
| z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| gamma = z_scale / dst_scale; | |||||
| } else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { | |||||
| z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| gamma = z_scale / dst_scale; | |||||
| } else if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| z_scale = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| uint8_t z_zero = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| gamma = z_scale / dst_scale; | |||||
| delta = -z_zero * gamma; | |||||
| } else { // DTypeEnum::Float32 | |||||
| megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| gamma = 1.f; | |||||
| } | |||||
| } | |||||
| if (args.opr->param().nonlineMode == | |||||
| param::ConvBias::NonlineMode::IDENTITY) { | |||||
| delta += theta; | |||||
| theta = 0.f; | |||||
| } | |||||
| return {alpha, beta, gamma, delta, theta}; | |||||
| } | |||||
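The delta/theta terms computed above fold the asymmetric zero points into the epilogue's additive constants; a compressed, self-contained sketch of that folding (the function and struct names are hypothetical):

    #include <cstdint>

    struct EpilogueBias {
        float delta, theta;
    };

    // gamma is z_scale / dst_scale; z_zero / dst_zero are the asymmetric
    // zero points (0 for signed dtypes). With an IDENTITY nonlinearity there
    // is no activation between the two terms, so theta can be pre-added.
    inline EpilogueBias fold_zero_points(float gamma, uint8_t z_zero,
                                         uint8_t dst_zero,
                                         bool identity_nonlinearity) {
        float delta = -float(z_zero) * gamma;
        float theta = float(dst_zero);
        if (identity_nonlinearity) {
            delta += theta;
            theta = 0.f;
        }
        return {delta, theta};
    }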
| void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t n = args.src_layout->operator[](0), | |||||
| ci = args.src_layout->operator[](3), | |||||
| hi = args.src_layout->operator[](1), | |||||
| wi = args.src_layout->operator[](2); | |||||
| size_t co = args.dst_layout->operator[](3), | |||||
| ho = args.dst_layout->operator[](1), | |||||
| wo = args.dst_layout->operator[](2); | |||||
| UNPACK_CONV_PARAMETER(fm, param); | |||||
| MARK_USED_VAR | |||||
| void* filter_ptr = nullptr; | |||||
| void* bias_ptr = nullptr; | |||||
| void* z_ptr = nullptr; | |||||
| if (args.preprocessed_filter) { | |||||
| filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr; | |||||
| } else { | |||||
| filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr); | |||||
| reorder_filter(args, m_algo_param.access_size, filter_ptr); | |||||
| } | |||||
| bias_ptr = args.bias_tensor->raw_ptr; | |||||
| if (args.z_layout->ndim > 0) | |||||
| z_ptr = args.z_tensor->raw_ptr; | |||||
| // \note these constants of cutlass epilogue will be passed to method | |||||
| // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*, | |||||
| // a different dtype here results in undefined epilogue behaviors | |||||
| float alpha, beta, gamma, delta, theta; | |||||
| std::tie(alpha, beta, gamma, delta, theta) = get_constants(args); | |||||
| float dst_scale = 1.f; | |||||
| float threshold = 0.f; | |||||
| bool use_conv_filter_unity_opt = (fh == 1 && fw == 1); | |||||
| auto dst_dtype = args.dst_layout->dtype.enumv(); | |||||
| bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) && | |||||
| (m_algo_param.threadblock_n == 16 || | |||||
| (m_algo_param.threadblock_n == 32 && | |||||
| dst_dtype != DTypeEnum::Float32))); | |||||
| if (dst_dtype == DTypeEnum::QuantizedS8) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } else if (dst_dtype == DTypeEnum::QuantizedS4) { | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale; | |||||
| } else if (dst_dtype == DTypeEnum::Quantized4Asymm) { | |||||
| dst_scale = | |||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| } else { // DTypeEnum::Float32 | |||||
| dst_scale = 1.f; | |||||
| } | |||||
| cudaStream_t stream = cuda_stream(args.opr->handle()); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kConvolution, | |||||
| use_conv_filter_unity_opt, without_shared_load); | |||||
| execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr, | |||||
| z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi, | |||||
| ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, | |||||
| &alpha, &beta, &gamma, &delta, &theta, &threshold, | |||||
| &dst_scale, stream); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string( | |||||
| AlgoParam algo_param) { | |||||
| return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m, | |||||
| algo_param.threadblock_n, algo_param.threadblock_k, | |||||
| algo_param.warp_m, algo_param.warp_n, algo_param.warp_k, | |||||
| algo_param.stage, algo_param.access_size); | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter( | |||||
| const ExecArgs& args, const int iterleaved, | |||||
| void* reordered_filter) const { | |||||
| size_t co = args.filter_layout->operator[](0), | |||||
| ci = args.filter_layout->operator[](3), | |||||
| fh = args.filter_layout->operator[](1), | |||||
| fw = args.filter_layout->operator[](2); | |||||
| cudaStream_t stream = cuda_stream(args.opr->handle()); | |||||
| // reformat filter from nhwc to ncxhwx and reorder oc | |||||
| // to use trans_oc, threadblock_n must be 16 or 32 and src dtype == dest dtype | |||||
| bool trans_oc = ((co % m_algo_param.threadblock_n == 0) && | |||||
| (m_algo_param.threadblock_n == 16 || | |||||
| (m_algo_param.threadblock_n == 32 && | |||||
| args.dst_layout->dtype.enumv() != DTypeEnum::Float32))); | |||||
| uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16; | |||||
| uint32_t alignbits = iterleaved * 8; | |||||
| cutlass_wrapper::reorder_nhwc_imma_filter<8>( | |||||
| reinterpret_cast<int8_t*>(reordered_filter), | |||||
| reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh, | |||||
| fw, trans_oc, alignbits, oc_iterleaved, stream); | |||||
| } | |||||
| #endif | |||||
| // vim: syntax=cpp.doxygen | |||||
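Worth noting for the reorder call sites in this new int8 file and in the int4 path above: the `alignbits` argument handed to `reorder_nhwc_imma_filter` is simply the access size in bits, so the following relation is assumed to hold (`size_bits` is the element width, 8 for int8 and 4 for int4):

    //   alignbits           = interleaved * size_bits
    //   elements_per_access = alignbits / size_bits   (== interleaved)
    // e.g. int8 with access_size 16  -> alignbits = 128
    //      int4 with interleaved 16  -> alignbits = 64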
| @@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants( | |||||
| args.filter_layout->dtype.param<dtype::QuantizedS4>().scale, | args.filter_layout->dtype.param<dtype::QuantizedS4>().scale, | ||||
| bias_scale = | bias_scale = | ||||
| args.bias_layout->dtype.param<dtype::QuantizedS32>().scale, | args.bias_layout->dtype.param<dtype::QuantizedS32>().scale, | ||||
| dst_scale = | |||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| dst_scale; | |||||
| uint8_t dst_zero = 0; | |||||
| if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| dst_scale = | |||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| } else { // DTypeEnum::QuantizedS8 | |||||
| megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8); | |||||
| dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| } | |||||
| uint8_t dst_zero = | |||||
| args.dst_layout->dtype.param<dtype::Quantized4Asymm>().zero_point; | |||||
| float alpha = src_scale * filter_scale / dst_scale, | float alpha = src_scale * filter_scale / dst_scale, | ||||
| beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, | beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, | ||||
| theta = dst_zero; | theta = dst_zero; | ||||
| if (args.z_layout->ndim > 0) { | if (args.z_layout->ndim > 0) { | ||||
| float z_scale = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| gamma = z_scale / dst_scale; | |||||
| uint8_t z_zero = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point; | |||||
| delta = -z_zero * gamma; | |||||
| float z_scale; | |||||
| if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { | |||||
| z_scale = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale; | |||||
| uint8_t z_zero = | |||||
| args.z_layout->dtype.param<dtype::Quantized4Asymm>() | |||||
| .zero_point; | |||||
| gamma = z_scale / dst_scale; | |||||
| delta = -z_zero * gamma; | |||||
| } else { // DTypeEnum::QuantizedS8 | |||||
| megdnn_assert(args.z_layout->dtype.enumv() == | |||||
| DTypeEnum::QuantizedS8); | |||||
| z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale; | |||||
| gamma = z_scale / dst_scale; | |||||
| } | |||||
| } | } | ||||
| // identity epilogue has no theta: | // identity epilogue has no theta: | ||||
| @@ -65,6 +65,7 @@ public: | |||||
| class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter; | class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter; | ||||
| class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth; | class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth; | ||||
| class AlgoInt8NCHW32IMMAImplicitGemm; | class AlgoInt8NCHW32IMMAImplicitGemm; | ||||
| class AlgoInt8NHWCIMMAImplicitGemm; | |||||
| class AlgoInt4NCHW64IMMAImplicitGemmBase; | class AlgoInt4NCHW64IMMAImplicitGemmBase; | ||||
| class AlgoInt4Int4NCHW64IMMAImplicitGemm; | class AlgoInt4Int4NCHW64IMMAImplicitGemm; | ||||
| class AlgoUInt4Int4NCHW64IMMAImplicitGemm; | class AlgoUInt4Int4NCHW64IMMAImplicitGemm; | ||||
| @@ -275,6 +275,7 @@ public: | |||||
| private: | private: | ||||
| WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | ||||
| const SizeArgs& args) const; | const SizeArgs& args) const; | ||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| AlgoParam m_algo_param; | AlgoParam m_algo_param; | ||||
| std::string m_name; | std::string m_name; | ||||
| }; | }; | ||||
| @@ -295,6 +296,7 @@ public: | |||||
| private: | private: | ||||
| WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, | ||||
| const SizeArgs& args) const; | const SizeArgs& args) const; | ||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| }; | }; | ||||
| class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { | class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { | ||||
| @@ -20,6 +20,43 @@ | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| const void* | |||||
| ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_available_op( | |||||
| const SizeArgs& args) const { | |||||
| using namespace cutlass::library; | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t sh = fm.stride[0], sw = fm.stride[1]; | |||||
| cutlass::conv::SpecialOptimizeDesc special_optimization = | |||||
| (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: | |||||
| DECONV_DOUBLE_UPSAMPLING | |||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorK4RSC4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS32, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| cutlass::conv::ConvType::kConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 4, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | |||||
| m_algo_param.stage, | |||||
| special_optimization, | |||||
| false}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | ||||
| is_available(const SizeArgs& args) const { | is_available(const SizeArgs& args) const { | ||||
| auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
| @@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| // FIXME: too large filter size is not supported now | // FIXME: too large filter size is not supported now | ||||
| available &= fm.spatial[0] * fm.spatial[1] <= | available &= fm.spatial[0] * fm.spatial[1] <= | ||||
| (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2); | (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2); | ||||
| available &= (get_available_op(args) != nullptr); | |||||
| // only support sm_61 or later, platform should have fast native int8 | // only support sm_61 or later, platform should have fast native int8 | ||||
| // support | // support | ||||
| available &= is_compute_capability_required(6, 1); | available &= is_compute_capability_required(6, 1); | ||||
| @@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( | |||||
| args.grad_layout->dtype.param<dtype::QuantizedS8>().scale; | args.grad_layout->dtype.param<dtype::QuantizedS8>().scale; | ||||
| // \note these constants of cutlass epilogue will be passed to struct | // \note these constants of cutlass epilogue will be passed to struct | ||||
| // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, a | |||||
| // different dtype here results in undefined epilogue behaviors | |||||
| // `ConvolutionArguments` by pointer and interpreted as ElementCompute*, | |||||
| // a different dtype here results in undefined epilogue behaviors | |||||
| float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f, | float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f, | ||||
| gamma = 0.f, delta = 0.f; | gamma = 0.f, delta = 0.f; | ||||
| using namespace cutlass::library; | using namespace cutlass::library; | ||||
| // only use 16x64x8_16x64x8_2stages impl | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorK4RSC4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS32, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| cutlass::conv::ConvType::kConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 4, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | |||||
| m_algo_param.stage, | |||||
| true, | |||||
| false}; | |||||
| const Operation* op = Singleton::get().operation_table.find_op(key); | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| // gcc prints warnings when size_t values are implicitly narrowed to int | // gcc prints warnings when size_t values are implicitly narrowed to int | ||||
| cutlass::conv::Conv2dProblemSize problem_size{ | cutlass::conv::Conv2dProblemSize problem_size{ | ||||
| @@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() { | |||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); | int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); | ||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); | int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); | ||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); | int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); | ||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); | |||||
| } | } | ||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -19,6 +19,44 @@ | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| const void* | |||||
| ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::get_available_op( | |||||
| const SizeArgs& args) const { | |||||
| using namespace cutlass::library; | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t sh = fm.stride[0], sw = fm.stride[1]; | |||||
| cutlass::conv::SpecialOptimizeDesc special_optimization = | |||||
| (sh == 2 && sw == 2) ? cutlass::conv::SpecialOptimizeDesc:: | |||||
| DECONV_DOUBLE_UPSAMPLING | |||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | |||||
| // only use 16x64x8_16x64x8_2stages impl | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorK4RSC4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS32, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| cutlass::conv::ConvType::kConvolution, | |||||
| 16, | |||||
| 64, | |||||
| 8, | |||||
| 16, | |||||
| 64, | |||||
| 8, | |||||
| 1, | |||||
| 1, | |||||
| 4, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | |||||
| 2, | |||||
| special_optimization, | |||||
| false}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( | bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| auto&& fm = args.filter_meta; | auto&& fm = args.filter_meta; | ||||
| @@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available( | |||||
| available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); | available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); | ||||
| // FIXME: too large filter size is not supported now | // FIXME: too large filter size is not supported now | ||||
| available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2); | available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2); | ||||
| available &= (get_available_op(args) != nullptr); | |||||
| // only support sm_61 or later, platform should have fast native int8 | // only support sm_61 or later, platform should have fast native int8 | ||||
| // support | // support | ||||
| available &= is_compute_capability_required(6, 1); | available &= is_compute_capability_required(6, 1); | ||||
| @@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec( | |||||
| using namespace cutlass::library; | using namespace cutlass::library; | ||||
| // only use 16x64x8_16x64x8_2stages impl | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorK4RSC4, | |||||
| NumericTypeID::kS8, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| NumericTypeID::kS32, | |||||
| LayoutTypeID::kTensorNC4HW4, | |||||
| cutlass::conv::ConvType::kConvolution, | |||||
| 16, | |||||
| 64, | |||||
| 8, | |||||
| 16, | |||||
| 64, | |||||
| 8, | |||||
| 1, | |||||
| 1, | |||||
| 4, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | |||||
| 2, | |||||
| true, | |||||
| false}; | |||||
| const Operation* op = Singleton::get().operation_table.find_op(key); | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| // gcc prints warnings when size_t values are implicitly narrowed to int | // gcc prints warnings when size_t values are implicitly narrowed to int | ||||
| cutlass::conv::Conv2dProblemSize problem_size{ | cutlass::conv::Conv2dProblemSize problem_size{ | ||||
| @@ -119,8 +119,8 @@ public: | |||||
| m_description.threadblock_swizzle = ThreadblockSwizzleMap< | m_description.threadblock_swizzle = ThreadblockSwizzleMap< | ||||
| typename Operator::ThreadblockSwizzle>::kId; | typename Operator::ThreadblockSwizzle>::kId; | ||||
| m_description.need_load_from_const_mem = | |||||
| Operator::kNeedLoadFromConstMem; | |||||
| m_description.special_optimization = | |||||
| Operator::kSpecialOpt; | |||||
| m_description.gemm_mode = Operator::kGemmMode; | m_description.gemm_mode = Operator::kGemmMode; | ||||
| m_description.without_shared_load = Operator::kWithoutSharedLoad; | m_description.without_shared_load = Operator::kWithoutSharedLoad; | ||||
| } | } | ||||
| @@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription { | |||||
| ThreadblockSwizzleID threadblock_swizzle; | ThreadblockSwizzleID threadblock_swizzle; | ||||
| bool need_load_from_const_mem; | |||||
| conv::SpecialOptimizeDesc special_optimization; | |||||
| conv::ImplicitGemmMode gemm_mode; | conv::ImplicitGemmMode gemm_mode; | ||||
| bool without_shared_load; | bool without_shared_load; | ||||
| }; | }; | ||||
| @@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc( | |||||
| key.epilogue_type = desc.epilogue_type; | key.epilogue_type = desc.epilogue_type; | ||||
| key.stages = desc.tile_description.threadblock_stages; | key.stages = desc.tile_description.threadblock_stages; | ||||
| key.need_load_from_const_mem = desc.need_load_from_const_mem; | |||||
| key.special_optimization = desc.special_optimization; | |||||
| key.without_shared_load = desc.without_shared_load; | key.without_shared_load = desc.without_shared_load; | ||||
| return key; | return key; | ||||
| @@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) { | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| Operation const* OperationTable::find_op(GemmKey const& key) const { | Operation const* OperationTable::find_op(GemmKey const& key) const { | ||||
| megdnn_assert(gemm_operations.count(key) > 0, | |||||
| "key not found in cutlass operation table"); | |||||
| auto const& ops = gemm_operations.at(key); | |||||
| megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", | |||||
| ops.size()); | |||||
| return ops[0]; | |||||
| if (gemm_operations.count(key)) { | |||||
| auto const& ops = gemm_operations.at(key); | |||||
| megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", | |||||
| ops.size()); | |||||
| return ops[0]; | |||||
| } | |||||
| return nullptr; | |||||
| } | } | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| Operation const* OperationTable::find_op(ConvolutionKey const& key) const { | Operation const* OperationTable::find_op(ConvolutionKey const& key) const { | ||||
| megdnn_assert(convolution_operations.count(key) > 0, | |||||
| "key not found in cutlass operation table"); | |||||
| auto const& ops = convolution_operations.at(key); | |||||
| megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", | |||||
| ops.size()); | |||||
| return ops[0]; | |||||
| if (convolution_operations.count(key) > 0) { | |||||
| auto const& ops = convolution_operations.at(key); | |||||
| megdnn_assert(ops.size() == 1, "exactly one kernel expected, got %zu", | |||||
| ops.size()); | |||||
| return ops[0]; | |||||
| } | |||||
| return nullptr; | |||||
| } | } | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
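With this change, `find_op` follows a probe-style contract: a missing key yields nullptr instead of an assertion failure, which is what lets the `is_available` checks added elsewhere in this patch query the table safely. A minimal caller sketch (a hypothetical helper, using the types declared in this file):

    // Returns true iff exactly one kernel matching `key` was registered.
    inline bool has_convolution_kernel(const OperationTable& table,
                                       const ConvolutionKey& key) {
        return table.find_op(key) != nullptr;
    }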
| @@ -211,7 +211,7 @@ struct ConvolutionKey { | |||||
| epilogue::EpilogueType epilogue_type; | epilogue::EpilogueType epilogue_type; | ||||
| int stages; | int stages; | ||||
| bool need_load_from_const_mem; | |||||
| conv::SpecialOptimizeDesc special_optimization; | |||||
| bool without_shared_load; | bool without_shared_load; | ||||
| inline bool operator==(ConvolutionKey const& rhs) const { | inline bool operator==(ConvolutionKey const& rhs) const { | ||||
| @@ -234,7 +234,7 @@ struct ConvolutionKey { | |||||
| (instruction_shape_n == rhs.instruction_shape_n) && | (instruction_shape_n == rhs.instruction_shape_n) && | ||||
| (instruction_shape_k == rhs.instruction_shape_k) && | (instruction_shape_k == rhs.instruction_shape_k) && | ||||
| (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | ||||
| (need_load_from_const_mem == rhs.need_load_from_const_mem) && | |||||
| (special_optimization == rhs.special_optimization) && | |||||
| (without_shared_load == rhs.without_shared_load); | (without_shared_load == rhs.without_shared_load); | ||||
| } | } | ||||
| @@ -270,8 +270,8 @@ struct ConvolutionKey { | |||||
| "\n instruction_shape: " + instruction_shape_str + | "\n instruction_shape: " + instruction_shape_str + | ||||
| "\n epilogue_type: " + to_string(epilogue_type) + | "\n epilogue_type: " + to_string(epilogue_type) + | ||||
| "\n stages: " + std::to_string(stages) + | "\n stages: " + std::to_string(stages) + | ||||
| "\n need_load_from_const_mem: " + | |||||
| to_string(need_load_from_const_mem) + | |||||
| "\n special_optimization: " + | |||||
| to_string(special_optimization) + | |||||
| "\n without_shared_load: " + to_string(without_shared_load) + | "\n without_shared_load: " + to_string(without_shared_load) + | ||||
| "\n}"; | "\n}"; | ||||
| } | } | ||||
| @@ -308,8 +308,8 @@ struct ConvolutionKeyHasher { | |||||
| sizeof(key.instruction_shape_k)) | sizeof(key.instruction_shape_k)) | ||||
| .update(&key.epilogue_type, sizeof(key.epilogue_type)) | .update(&key.epilogue_type, sizeof(key.epilogue_type)) | ||||
| .update(&key.stages, sizeof(key.stages)) | .update(&key.stages, sizeof(key.stages)) | ||||
| .update(&key.need_load_from_const_mem, | |||||
| sizeof(key.need_load_from_const_mem)) | |||||
| .update(&key.special_optimization, | |||||
| sizeof(key.special_optimization)) | |||||
| .update(&key.without_shared_load, | .update(&key.without_shared_load, | ||||
| sizeof(key.without_shared_load)) | sizeof(key.without_shared_load)) | ||||
| .digest(); | .digest(); | ||||
| @@ -1566,6 +1566,35 @@ char const* to_string(MathOperationID math_op, bool pretty) { | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| static struct { | |||||
| char const* text; | |||||
| char const* pretty; | |||||
| conv::SpecialOptimizeDesc enumerant; | |||||
| } SpecialOptimizeDesc_enumerants[] = { | |||||
| {"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE}, | |||||
| {"conv_filter_unity", "ConvFilterUnity", | |||||
| conv::SpecialOptimizeDesc::CONV_FILTER_UNITY}, | |||||
| {"deconv_double_upsampling", "DeconvDoubleUpsampling", | |||||
| conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING}, | |||||
| }; | |||||
| /// Converts a SpecialOptimizeDesc enumerant to a string | |||||
| char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) { | |||||
| for (auto const& possible : SpecialOptimizeDesc_enumerants) { | |||||
| if (special_opt == possible.enumerant) { | |||||
| if (pretty) { | |||||
| return possible.pretty; | |||||
| } else { | |||||
| return possible.text; | |||||
| } | |||||
| } | |||||
| } | |||||
| return pretty ? "Invalid" : "invalid"; | |||||
| } | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
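For reference, the expected mapping of the new `to_string` overload, derived from the enumerant table above:

    // to_string(conv::SpecialOptimizeDesc::CONV_FILTER_UNITY)        -> "conv_filter_unity"
    // to_string(conv::SpecialOptimizeDesc::CONV_FILTER_UNITY, true)  -> "ConvFilterUnity"
    // to_string(static_cast<conv::SpecialOptimizeDesc>(42))          -> "invalid"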
| static struct { | static struct { | ||||
| char const* text; | char const* text; | ||||
| char const* pretty; | char const* pretty; | ||||
| @@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false); | |||||
| /// Converts a MathOperationID enumerant to a string | /// Converts a MathOperationID enumerant to a string | ||||
| char const* to_string(MathOperationID math_op, bool pretty = false); | char const* to_string(MathOperationID math_op, bool pretty = false); | ||||
| /// Converts a SpecialOptimizeDesc enumerant to a string | |||||
| char const* to_string(conv::SpecialOptimizeDesc special_opt, | |||||
| bool pretty = false); | |||||
| /// Converts an ImplicitGemmMode enumerant to a string | /// Converts an ImplicitGemmMode enumerant to a string | ||||
| char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false); | char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false); | ||||
| @@ -235,6 +235,7 @@ public: | |||||
| m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s", | m_name{ssprintf("CUTLASS_FLOAT32_SIMT_%s", | ||||
| m_algo_param.to_string().c_str())} {} | m_algo_param.to_string().c_str())} {} | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
| const char* name() const override { return m_name.c_str(); } | const char* name() const override { return m_name.c_str(); } | ||||
| AlgoAttribute attribute() const override { | AlgoAttribute attribute() const override { | ||||
| @@ -260,6 +261,7 @@ private: | |||||
| void do_exec(const ExecArgs& args) const override; | void do_exec(const ExecArgs& args) const override; | ||||
| int min_alignment_requirement() const override { return 1; } | int min_alignment_requirement() const override { return 1; } | ||||
| std::string m_name; | std::string m_name; | ||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| }; | }; | ||||
| class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final | class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final | ||||
| @@ -270,6 +272,7 @@ public: | |||||
| m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", | m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s", | ||||
| m_algo_param.to_string().c_str())} {} | m_algo_param.to_string().c_str())} {} | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | size_t get_workspace_in_bytes(const SizeArgs& args) const override; | ||||
| const char* name() const override { return m_name.c_str(); } | const char* name() const override { return m_name.c_str(); } | ||||
| AlgoAttribute attribute() const override { | AlgoAttribute attribute() const override { | ||||
| @@ -297,6 +300,7 @@ private: | |||||
| void do_exec(const ExecArgs& args) const override; | void do_exec(const ExecArgs& args) const override; | ||||
| int min_alignment_requirement() const override { return 1; } | int min_alignment_requirement() const override { return 1; } | ||||
| std::string m_name; | std::string m_name; | ||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| }; | }; | ||||
| class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final | class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final | ||||
| @@ -19,6 +19,39 @@ | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op( | |||||
| const SizeArgs& args) const { | |||||
| using namespace cutlass::library; | |||||
| auto&& param = args.opr->param(); | |||||
| auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| int alignment = min_alignment_requirement(); | |||||
| GemmKey key{NumericTypeID::kF32, | |||||
| layoutA, | |||||
| NumericTypeID::kF32, | |||||
| layoutB, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kRowMajor, | |||||
| NumericTypeID::kF32, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| 2, | |||||
| alignment, | |||||
| alignment, | |||||
| SplitKMode::kNone}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( | bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| bool available = | bool available = | ||||
| @@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available( | |||||
| m_algo_param.threadblock_n <= | m_algo_param.threadblock_n <= | ||||
| y_grid_limit); | y_grid_limit); | ||||
| available &= (get_available_op(args) != nullptr); | |||||
| return available; | return available; | ||||
| } | } | ||||
| @@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec( | |||||
| using namespace cutlass::library; | using namespace cutlass::library; | ||||
| auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| int alignment = min_alignment_requirement(); | |||||
| GemmKey key{NumericTypeID::kF32, | |||||
| layoutA, | |||||
| NumericTypeID::kF32, | |||||
| layoutB, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kRowMajor, | |||||
| NumericTypeID::kF32, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| 2, | |||||
| alignment, | |||||
| alignment, | |||||
| SplitKMode::kNone}; | |||||
| const Operation* op = Singleton::get().operation_table.find_op(key); | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| GemmArguments gemm_args{problem_size, | GemmArguments gemm_args{problem_size, | ||||
| args.tensor_a.raw_ptr, | args.tensor_a.raw_ptr, | ||||
| @@ -19,6 +19,39 @@ | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op( | |||||
| const SizeArgs& args) const { | |||||
| using namespace cutlass::library; | |||||
| auto&& param = args.opr->param(); | |||||
| auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| int alignment = min_alignment_requirement(); | |||||
| GemmKey key{NumericTypeID::kF32, | |||||
| layoutA, | |||||
| NumericTypeID::kF32, | |||||
| layoutB, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kRowMajor, | |||||
| NumericTypeID::kF32, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| 2, | |||||
| alignment, | |||||
| alignment, | |||||
| SplitKMode::kParallel}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
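This split-K helper is the non-split-K version with SplitKMode::kParallel in the last field and nothing else changed. A hedged sketch of how the duplicated key construction could be folded into one routine parameterized on the mode (illustrative only, not part of the patch):

// Illustrative refactor: identical key construction, split-K mode supplied by
// the caller (kNone for AlgoFloat32SIMT, kParallel for AlgoFloat32SIMTSplitK).
static const void* my_find_f32_simt_op(
        bool trans_a, bool trans_b, int tb_m, int tb_n, int tb_k,
        int wp_m, int wp_n, int wp_k, int alignment,
        cutlass::library::SplitKMode mode) {
    using namespace cutlass::library;
    GemmKey key{NumericTypeID::kF32,
                trans_a ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor,
                NumericTypeID::kF32,
                trans_b ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor,
                NumericTypeID::kF32, LayoutTypeID::kRowMajor,
                NumericTypeID::kF32,
                tb_m, tb_n, tb_k, wp_m, wp_n, wp_k,
                1, 1, 1, 2, alignment, alignment, mode};
    return (void*)Singleton::get().operation_table.find_op(key);
}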
| bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( | bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( | ||||
| const SizeArgs& args) const { | const SizeArgs& args) const { | ||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| @@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available( | |||||
| available &= ((m + m_algo_param.threadblock_m - 1) / | available &= ((m + m_algo_param.threadblock_m - 1) / | ||||
| m_algo_param.threadblock_m <= | m_algo_param.threadblock_m <= | ||||
| y_grid_limit); | y_grid_limit); | ||||
| available &= (get_available_op(args) != nullptr); | |||||
| return available; | return available; | ||||
| } | } | ||||
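Both algorithms consume the helper in the same two places: is_available() treats a null lookup as "no kernel generated for this tile configuration", and do_exec() (shown for the split-K path in the next hunk) reuses the lookup instead of rebuilding the key inline. A schematic of that pattern, with illustrative names rather than MegDNN's:

// Schematic of the pattern added to both algorithms: availability is gated on
// the operation-table lookup, and execution reuses the same lookup.
struct MyCutlassBackedAlgo {
    // stands in for AlgoFloat32SIMT*::get_available_op(const SizeArgs&)
    const void* get_available_op() const;

    bool is_available() const {
        bool available = true;  // existing dtype / layout / grid-limit checks
        available &= (get_available_op() != nullptr);  // kernel must exist
        return available;
    }

    void do_exec() const {
        const auto* op = static_cast<const cutlass::library::Operation*>(
                get_available_op());
        // op is non-null here: is_available() already required the lookup to
        // succeed before this algorithm could be chosen.
        // ... fill GemmArguments and launch *op, as in the following hunk ...
        (void)op;
    }
};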
| @@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec( | |||||
| float alpha = 1.f, beta = 0.f; | float alpha = 1.f, beta = 0.f; | ||||
| using namespace cutlass::library; | using namespace cutlass::library; | ||||
| auto layoutA = param.transposeA ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| auto layoutB = param.transposeB ? LayoutTypeID::kColumnMajor | |||||
| : LayoutTypeID::kRowMajor; | |||||
| int alignment = min_alignment_requirement(); | |||||
| GemmKey key{NumericTypeID::kF32, | |||||
| layoutA, | |||||
| NumericTypeID::kF32, | |||||
| layoutB, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kRowMajor, | |||||
| NumericTypeID::kF32, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| 2, | |||||
| alignment, | |||||
| alignment, | |||||
| SplitKMode::kParallel}; | |||||
| Operation const* op = Singleton::get().operation_table.find_op(key); | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| GemmArguments gemm_args{problem_size, | GemmArguments gemm_args{problem_size, | ||||
| args.tensor_a.raw_ptr, | args.tensor_a.raw_ptr, | ||||
| @@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) { | |||||
| ConvBias::DirectParam{}); | ConvBias::DirectParam{}); | ||||
| check(algo); | check(algo); | ||||
| } | } | ||||
| TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) { | |||||
| require_compute_capability(7, 5); | |||||
| Checker<ConvBiasForward> checker(handle_cuda()); | |||||
| auto check = [&checker](const std::string& algo) { | |||||
| checker.set_before_exec_callback( | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str())); | |||||
| UniformIntRNG rng{-8, 8}; | |||||
| UniformIntRNG bias_rng{-50, 50}; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_rng(2, &bias_rng) | |||||
| .set_rng(3, &rng) | |||||
| .set_dtype(0, dtype::QuantizedS8{1.2f}) | |||||
| .set_dtype(1, dtype::QuantizedS8{1.3f}) | |||||
| .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f}) | |||||
| .set_dtype(3, dtype::QuantizedS8{19.990229f}) | |||||
| .set_dtype(4, dtype::QuantizedS8{19.990228f}) | |||||
| .set_epsilon(1e-3); | |||||
| param::ConvBias param; | |||||
| param.pad_h = param.pad_w = 1; | |||||
| param.stride_h = param.stride_w = 1; | |||||
| param.format = param::ConvBias::Format::NHWC; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); | |||||
| param.pad_h = param.pad_w = 0; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); | |||||
| }; | |||||
| std::string algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| } | |||||
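The algorithm names selected by this new NHWC test encode the kernel configuration: threadblock tile, warp tile, stage count and a trailing field that only the NHWC variants carry (16 here), presumably the per-access element count. A small sketch of how such a name string is assembled (hypothetical helper; the real names come from the kernel generator):

// Hypothetical helper mirroring the name strings used in the tests above; the
// meaning of the trailing field is an assumption, not confirmed by the patch.
#include <cstdio>
#include <string>
static std::string my_nhwc_imma_algo_name(int tb_m, int tb_n, int tb_k,
                                          int wp_m, int wp_n, int wp_k,
                                          int stages, int access_size) {
    char buf[128];
    std::snprintf(buf, sizeof(buf),
                  "INT8_NHWC_IMMA_IMPLICIT_GEMM_%dX%dX%d_%dX%dX%d_%d_%d",
                  tb_m, tb_n, tb_k, wp_m, wp_n, wp_k, stages, access_size);
    return buf;
}
// my_nhwc_imma_algo_name(64, 16, 32, 64, 16, 32, 2, 16) reproduces the first
// algorithm name checked above.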
| TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) { | |||||
| require_compute_capability(7, 5); | |||||
| Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker( | |||||
| handle_cuda()); | |||||
| auto check = [&checker](const std::string& algo) { | |||||
| checker.set_before_exec_callback( | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str())); | |||||
| UniformIntRNG rng{-8, 8}; | |||||
| UniformIntRNG bias_rng{-50, 50}; | |||||
| UniformIntRNG rng_u4{0, 15}; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_rng(2, &bias_rng) | |||||
| .set_rng(3, &rng_u4) | |||||
| .set_dtype(0, dtype::QuantizedS8{0.2f}) | |||||
| .set_dtype(1, dtype::QuantizedS8{0.3f}) | |||||
| .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f}) | |||||
| .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8}) | |||||
| .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4}) | |||||
| .set_epsilon(1 + 1e-3); | |||||
| param::ConvBias param; | |||||
| param.pad_h = param.pad_w = 1; | |||||
| param.stride_h = param.stride_w = 1; | |||||
| param.format = param::ConvBias::Format::NHWC; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); | |||||
| param.pad_h = param.pad_w = 0; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); | |||||
| }; | |||||
| std::string algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| } | |||||
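This variant feeds QuantizedS8 inputs and writes a Quantized4Asymm output, and the tolerance set_epsilon(1 + 1e-3) allows the result to differ by one 4-bit step, since requantizing into so few levels can round either way at a level boundary. A minimal sketch of asymmetric 4-bit (de)quantization under that assumption (helper names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>
// Illustrative asymmetric 4-bit quantization: a real value maps to a code in
// [0, 15] around a zero point, matching the {scale, zero_point} parameters
// passed to dtype::Quantized4Asymm in the test above.
static uint8_t my_quantize_u4(float x, float scale, int zero_point) {
    int q = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 15));
}
static float my_dequantize_u4(uint8_t q, float scale, int zero_point) {
    return (static_cast<int>(q) - zero_point) * scale;
}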
| TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) { | |||||
| require_compute_capability(7, 5); | |||||
| Checker<ConvBiasForward> checker(handle_cuda()); | |||||
| auto check = [&checker](const std::string& algo) { | |||||
| checker.set_before_exec_callback( | |||||
| conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str())); | |||||
| UniformIntRNG rng{-8, 8}; | |||||
| UniformFloatRNG float_rng{-50, 50}; | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_rng(2, &float_rng) | |||||
| .set_rng(3, &float_rng) | |||||
| .set_dtype(0, dtype::QuantizedS8(1.9980618f)) | |||||
| .set_dtype(1, dtype::QuantizedS8(1.9980927f)) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_dtype(3, dtype::Float32()) | |||||
| .set_dtype(4, dtype::Float32()); | |||||
| param::ConvBias param; | |||||
| param.pad_h = param.pad_w = 1; | |||||
| param.stride_h = param.stride_w = 1; | |||||
| param.format = param::ConvBias::Format::NHWC; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}}); | |||||
| param.pad_h = param.pad_w = 0; | |||||
| param.nonlineMode = param::ConvBias::NonlineMode::RELU; | |||||
| checker.set_param(param).execs( | |||||
| {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}}); | |||||
| }; | |||||
| std::string algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| algo = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16", | |||||
| ConvBias::DirectParam{}); | |||||
| check(algo); | |||||
| } | |||||
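Here the bias, residual and destination are plain Float32, so the epilogue is expected to dequantize the int32 accumulator with the product of the two input scales before adding the float bias. A one-line sketch of that arithmetic (an assumption about the epilogue, not code from the patch):

// Assumed dequantization for a float output: s8 x s8 products accumulate in
// int32 and are rescaled by scale_src * scale_filter before the bias add.
static float my_dequant_output(int32_t acc, float scale_src,
                               float scale_filter, float bias) {
    return static_cast<float>(acc) * scale_src * scale_filter + bias;
}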
| #endif | #endif | ||||
| TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { | TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { | ||||
| @@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) { | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< | ||||
| ConvBiasForward>( | ConvBiasForward>( | ||||
| ConvBias::algo_name<ConvBias::DirectParam>( | ConvBias::algo_name<ConvBias::DirectParam>( | ||||
| "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2", | |||||
| "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1", | |||||
| ConvBias::DirectParam{}) | ConvBias::DirectParam{}) | ||||
| .c_str())); | .c_str())); | ||||
| checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) | checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) | ||||
| @@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) { | |||||
| "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM", | "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM", | ||||
| param::ConvBias::Format::NCHW32); | param::ConvBias::Format::NCHW32); | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) { | |||||
| require_compute_capability(7, 5); | |||||
| benchmark_target_algo_with_cudnn_tsc( | |||||
| handle_cuda(), get_det_first_bench_args(16), | |||||
| dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f}, | |||||
| dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, | |||||
| "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM", | |||||
| param::ConvBias::Format::NHWC); | |||||
| } | |||||
| #endif | #endif | ||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { | TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) { | ||||
| @@ -102,9 +102,7 @@ std::vector<BenchArgs> get_det_first_bench_args(size_t batch) { | |||||
| args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); | args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1}); | ||||
| args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2}); | args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2}); | ||||
| args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1}); | args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1}); | ||||
| args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2}); | |||||
| args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1}); | args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1}); | ||||
| args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2}); | |||||
| return args; | return args; | ||||
| } | } | ||||
| @@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc( | |||||
| .reshape({shape[0], shape[1] / 4, 4, shape[2], | .reshape({shape[0], shape[1] / 4, 4, shape[2], | ||||
| shape[3]}) | shape[3]}) | ||||
| .dimshuffle({1, 3, 4, 0, 2})); | .dimshuffle({1, 3, 4, 0, 2})); | ||||
| } else if (format == Format::NHWC) { | |||||
| ret = static_cast<TensorShape>( | |||||
| TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1})); | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| }; | }; | ||||
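The new branch converts benchmark tensors from NCHW to NHWC via dimshuffle({0, 2, 3, 1}); the permutation simply reorders the axes, as the small sketch below illustrates (plain std types, not the MegDNN TensorLayout API):

#include <array>
#include <cstddef>
// dimshuffle({0, 2, 3, 1}) maps an NCHW shape (N, C, H, W) to NHWC
// (N, H, W, C); this helper mirrors that index permutation on a raw shape.
static std::array<size_t, 4> my_nchw_to_nhwc(const std::array<size_t, 4>& s) {
    return {s[0], s[2], s[3], s[1]};
}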
| @@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc( | |||||
| if ((format == Format::CHWN4 || format == Format::NCHW4) && | if ((format == Format::CHWN4 || format == Format::NCHW4) && | ||||
| (arg.ci % 16 != 0)) | (arg.ci % 16 != 0)) | ||||
| continue; | continue; | ||||
| // skip test cases that cannot use the nhwc tensor core kernels | |||||
| if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0)) | |||||
| continue; | |||||
| Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 | Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0 | ||||
| ? Format::NCHW32 | ? Format::NCHW32 | ||||
| : Format::NCHW4; | : Format::NCHW4; | ||||
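The added skip requires both input and output channel counts to be multiples of 4 before an NHWC case is benchmarked, presumably because the int8 NHWC tensor-core kernels load channels in vectors. As a standalone predicate (sketch only):

#include <cstddef>
// Mirrors the new skip condition above: only channel counts aligned to 4 are
// benchmarked for the int8 NHWC tensor-core path.
static bool my_nhwc_tensorcore_supported(size_t ci, size_t co) {
    return ci % 4 == 0 && co % 4 == 0;
}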
| @@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) { | |||||
| all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); | all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); | ||||
| all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); | all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); | ||||
| all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); | all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); | ||||
| all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); | |||||
| for (auto algo_param : all_params) { | for (auto algo_param : all_params) { | ||||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | Checker<ConvolutionBackwardData> checker(handle_cuda()); | ||||