GitOrigin-RevId: da3bcfb85a
tags/v1.5.0
| @@ -1,5 +1,6 @@ | |||
| # Mark generated files as binary, ignore them in git diff. | |||
| # dnn | |||
| dnn/scripts/cutlass_generator/list.bzl binary | |||
| dnn/src/cuda/conv_bias/int4/kimpl/* binary | |||
| dnn/src/cuda/conv_bias/int8/kimpl/* binary | |||
| dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | |||
| @@ -0,0 +1,18 @@ | |||
| load("list.bzl", "cutlass_gen_list") | |||
| genrule( | |||
| name = "cutlass_kimpls", | |||
| outs = cutlass_gen_list, | |||
| cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py) | |||
| python3 $$GEN --operations gemm --type simt $(@D) | |||
| python3 $$GEN --operations gemv --type simt $(@D) | |||
| python3 $$GEN --operations deconv --type simt $(@D) | |||
| python3 $$GEN --operations conv2d --type simt $(@D) | |||
| python3 $$GEN --operations conv2d --type tensorop8816 $(@D) | |||
| python3 $$GEN --operations conv2d --type tensorop8832 $(@D) | |||
| """, | |||
| tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], | |||
| visibility = ["//visibility:public"], | |||
| ) | |||
| @@ -0,0 +1,19 @@ | |||
| # Generate device kernel registration code for CUTLASS kernels | |||
| ## Usage | |||
| ```bash | |||
| python3 generator.py [--operations {gemm,gemv,conv2d,deconv}] [--type {simt,tensorop8816,tensorop8832}] output | |||
| ``` | |||
| - `--operations`: the operation kind to generate: gemm, gemv, conv2d, or deconv | |||
| - `--type`: the opcode class: simt, tensorop8816, or tensorop8832 | |||
| - `output`: the output directory for the generated CUTLASS kernels | |||
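| For example, to generate the conv2d TensorOp 8816 kernels (the same invocation the Bazel genrule uses; the output path is a placeholder): | |||
| ```bash | |||
| python3 generator.py --operations conv2d --type tensorop8816 /path/to/output | |||
| ``` | |||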
| ## Generate the file list for Bazel | |||
| We generate `list.bzl` because Bazel's `genrule` requires the full list of output files to be known during the analysis phase. Run `gen_list.py` to regenerate the list whenever new operations are added: | |||
| ```bash | |||
| python3 gen_list.py | |||
| ``` | |||
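| The generated `list.bzl` contains one entry per kernel, each named after the operation's `procedural_name()`; its shape is roughly: | |||
| ```python | |||
| # Generated by dnn/scripts/cutlass_generator/gen_list.py | |||
| cutlass_gen_list = [ | |||
|     "<procedural_name>.cu", | |||
|     # ... one entry per generated kernel ... | |||
| ] | |||
| ``` | |||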
| @@ -0,0 +1,614 @@ | |||
| # | |||
| # \file conv2d_operation.py | |||
| # | |||
| # \brief Generates conv2d and deconv kernel instances for the CUTLASS library | |||
| # | |||
| # | |||
| import enum | |||
| import os.path | |||
| import shutil | |||
| from typing import Tuple, List | |||
| from lazy_file import LazyFile | |||
| from library import * | |||
| ################################################################################################### | |||
| # | |||
| class Conv2dOperation: | |||
| # | |||
| def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \ | |||
| epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \ | |||
| need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNt): | |||
| self.operation_kind = OperationKind.Conv2d | |||
| self.conv_kind = conv_kind | |||
| self.arch = arch | |||
| self.tile_description = tile_description | |||
| self.conv_type = conv_type | |||
| self.src = src | |||
| self.flt = flt | |||
| self.bias = bias | |||
| self.dst = dst | |||
| self.element_epilogue = element_epilogue | |||
| self.epilogue_functor = epilogue_functor | |||
| self.swizzling_functor = swizzling_functor | |||
| self.need_load_from_const = need_load_from_const | |||
| self.implicit_gemm_mode = implicit_gemm_mode | |||
| # | |||
| def accumulator_type(self): | |||
| accum = self.tile_description.math_instruction.element_accumulator | |||
| return accum | |||
| # | |||
| def core_name(self): | |||
| ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' | |||
| intermediate_type = '' | |||
| if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: | |||
| inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) | |||
| if self.tile_description.math_instruction.element_a != self.flt.element and \ | |||
| self.tile_description.math_instruction.element_a != self.accumulator_type(): | |||
| intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] | |||
| else: | |||
| inst_shape = '' | |||
| unity_kernel = '' | |||
| if not self.need_load_from_const: | |||
| unity_kernel = '_1x1' | |||
| return "%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ | |||
| inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \ | |||
| ShortEpilogueNames[self.epilogue_functor]) | |||
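| # Illustrative example (exact tokens come from the name tables in library.py): an s32-accumulator | |||
| # TensorOp 8816 fprop kernel yields a core name shaped like "i8816fprop_<epilogue>". | |||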
| # | |||
| def extended_name(self): | |||
| if self.dst.element != self.tile_description.math_instruction.element_accumulator: | |||
| if self.src.element != self.flt.element: | |||
| extended_name = "${element_dst}_${core_name}_${element_src}_${element_flt}" | |||
| elif self.src.element == self.flt.element: | |||
| extended_name = "${element_dst}_${core_name}_${element_src}" | |||
| else: | |||
| if self.src.element != self.flt.element: | |||
| extended_name = "${core_name}_${element_src}_${element_flt}" | |||
| elif self.src.element == self.flt.element: | |||
| extended_name = "${core_name}_${element_src}" | |||
| extended_name = SubstituteTemplate(extended_name, { | |||
| 'element_src': DataTypeNames[self.src.element], | |||
| 'element_flt': DataTypeNames[self.flt.element], | |||
| 'element_dst': DataTypeNames[self.dst.element], | |||
| 'core_name': self.core_name() | |||
| }) | |||
| return extended_name | |||
| # | |||
| def layout_name(self): | |||
| if self.src.layout == self.dst.layout: | |||
| layout_name = "${src_layout}_${flt_layout}" | |||
| else: | |||
| layout_name = "${src_layout}_${flt_layout}_${dst_layout}" | |||
| layout_name = SubstituteTemplate(layout_name, { | |||
| 'src_layout': ShortLayoutTypeNames[self.src.layout], | |||
| 'flt_layout': ShortLayoutTypeNames[self.flt.layout], | |||
| 'dst_layout': ShortLayoutTypeNames[self.dst.layout], | |||
| }) | |||
| return layout_name | |||
| # | |||
| def configuration_name(self): | |||
| ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' | |||
| opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] | |||
| warp_shape = [int(self.tile_description.threadblock_shape[idx] / self.tile_description.warp_count[idx]) for idx in range(3)] | |||
| threadblock = "%dx%dx%d_%dx%dx%d_%d" % ( | |||
| self.tile_description.threadblock_shape[0], | |||
| self.tile_description.threadblock_shape[1], | |||
| self.tile_description.threadblock_shape[2], | |||
| warp_shape[0], | |||
| warp_shape[1], | |||
| warp_shape[2], | |||
| self.tile_description.stages, | |||
| ) | |||
| configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}" | |||
| return SubstituteTemplate( | |||
| configuration_name, | |||
| { | |||
| 'opcode_class': opcode_class_name, | |||
| 'extended_name': self.extended_name(), | |||
| 'threadblock': threadblock, | |||
| 'layout': self.layout_name(), | |||
| } | |||
| ) | |||
| # | |||
| def procedural_name(self): | |||
| ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' | |||
| return self.configuration_name() | |||
| ################################################################################################### | |||
| # | |||
| # Emits single instances of a CUTLASS device-wide operator | |||
| # | |||
| ################################################################################################### | |||
| class EmitConv2dInstance: | |||
| def __init__(self): | |||
| self.template = """ | |||
| // kernel instance "${operation_name}" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| ${element_src}, | |||
| ${layout_src}, | |||
| ${element_flt}, | |||
| ${layout_flt}, | |||
| ${element_dst}, | |||
| ${layout_dst}, | |||
| ${element_bias}, | |||
| ${layout_bias}, | |||
| ${element_accumulator}, | |||
| ${conv_type}, | |||
| ${opcode_class}, | |||
| ${arch}, | |||
| cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, | |||
| cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, | |||
| cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, | |||
| ${epilogue_functor}< | |||
| ${element_dst}, | |||
| ${epilogue_vector_length}, | |||
| ${element_accumulator}, | |||
| ${element_bias}, | |||
| ${element_epilogue} | |||
| >, | |||
| ${swizzling_functor}, | |||
| ${stages}, | |||
| ${alignment_src}, | |||
| ${alignment_filter}, | |||
| ${nonunity_kernel}, | |||
| ${math_operator}, | |||
| ${implicit_gemm_mode}>; | |||
| """ | |||
| def emit(self, operation): | |||
| warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] | |||
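| # dst alignment in elements, capped so that a single epilogue access is at most 128 bits wide | |||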
| epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element]) | |||
| values = { | |||
| 'operation_name': operation.procedural_name(), | |||
| 'conv_type': ConvTypeTag[operation.conv_type], | |||
| 'element_src': DataTypeTag[operation.src.element], | |||
| 'layout_src': LayoutTag[operation.src.layout], | |||
| 'element_flt': DataTypeTag[operation.flt.element], | |||
| 'layout_flt': LayoutTag[operation.flt.layout], | |||
| 'element_dst': DataTypeTag[operation.dst.element], | |||
| 'layout_dst': LayoutTag[operation.dst.layout], | |||
| 'element_bias': DataTypeTag[operation.bias.element], | |||
| 'layout_bias': LayoutTag[operation.bias.layout], | |||
| 'element_accumulator': DataTypeTag[operation.accumulator_type()], | |||
| 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], | |||
| 'arch': "cutlass::arch::Sm%d" % operation.arch, | |||
| 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), | |||
| 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), | |||
| 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), | |||
| 'warp_shape_m': str(warp_shape[0]), | |||
| 'warp_shape_n': str(warp_shape[1]), | |||
| 'warp_shape_k': str(warp_shape[2]), | |||
| 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), | |||
| 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), | |||
| 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), | |||
| 'epilogue_vector_length': str(epilogue_vector_length), | |||
| 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], | |||
| 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), | |||
| 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], | |||
| 'stages': str(operation.tile_description.stages), | |||
| 'alignment_src': str(operation.src.alignment), | |||
| 'alignment_filter': str(operation.flt.alignment), | |||
| 'nonunity_kernel': str(operation.need_load_from_const).lower(), | |||
| 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], | |||
| 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] | |||
| } | |||
| return SubstituteTemplate(self.template, values) | |||
| class EmitDeconvInstance: | |||
| def __init__(self): | |||
| self.template = """ | |||
| // kernel instance "${operation_name}" generated by cutlass generator | |||
| using Deconvolution = | |||
| typename cutlass::conv::device::Deconvolution< | |||
| ${element_src}, | |||
| ${layout_src}, | |||
| ${element_flt}, | |||
| ${layout_flt}, | |||
| ${element_dst}, | |||
| ${layout_dst}, | |||
| ${element_bias}, | |||
| ${layout_bias}, | |||
| ${element_accumulator}, | |||
| ${opcode_class}, | |||
| ${arch}, | |||
| cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, | |||
| cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, | |||
| cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, | |||
| ${epilogue_functor}< | |||
| ${element_dst}, | |||
| ${epilogue_vector_length}, | |||
| ${element_accumulator}, | |||
| ${element_bias}, | |||
| ${element_epilogue} | |||
| >, | |||
| ${swizzling_functor}, | |||
| ${stages}, | |||
| ${alignment_src}, | |||
| ${alignment_filter}, | |||
| ${nonunity_kernel}, | |||
| ${math_operator}, | |||
| ${implicit_gemm_mode}>; | |||
| """ | |||
| def emit(self, operation): | |||
| warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] | |||
| epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element]) | |||
| values = { | |||
| 'operation_name': operation.procedural_name(), | |||
| 'element_src': DataTypeTag[operation.src.element], | |||
| 'layout_src': LayoutTag[operation.src.layout], | |||
| 'element_flt': DataTypeTag[operation.flt.element], | |||
| 'layout_flt': LayoutTag[operation.flt.layout], | |||
| 'element_dst': DataTypeTag[operation.dst.element], | |||
| 'layout_dst': LayoutTag[operation.dst.layout], | |||
| 'element_bias': DataTypeTag[operation.bias.element], | |||
| 'layout_bias': LayoutTag[operation.bias.layout], | |||
| 'element_accumulator': DataTypeTag[operation.accumulator_type()], | |||
| 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], | |||
| 'arch': "cutlass::arch::Sm%d" % operation.arch, | |||
| 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), | |||
| 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), | |||
| 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), | |||
| 'warp_shape_m': str(warp_shape[0]), | |||
| 'warp_shape_n': str(warp_shape[1]), | |||
| 'warp_shape_k': str(warp_shape[2]), | |||
| 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), | |||
| 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), | |||
| 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), | |||
| 'epilogue_vector_length': str(epilogue_vector_length), | |||
| 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], | |||
| 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), | |||
| 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], | |||
| 'stages': str(operation.tile_description.stages), | |||
| 'alignment_src': str(operation.src.alignment), | |||
| 'alignment_filter': str(operation.flt.alignment), | |||
| 'nonunity_kernel': str(operation.need_load_from_const).lower(), | |||
| 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], | |||
| 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] | |||
| } | |||
| return SubstituteTemplate(self.template, values) | |||
| ################################################################################################### | |||
| # | |||
| # Generator functions for all layouts | |||
| # | |||
| ################################################################################################### | |||
| # | |||
| def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \ | |||
| skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNt): | |||
| operations = [] | |||
| element_epilogue = DataType.f32 | |||
| if conv_kind == ConvKind.Fprop: | |||
| if src_layout == LayoutType.TensorNHWC: | |||
| swizzling_functor = SwizzlingFunctor.ConvFpropNHWC | |||
| else: | |||
| swizzling_functor = SwizzlingFunctor.ConvFpropNCxHWx | |||
| else: | |||
| swizzling_functor = SwizzlingFunctor.ConvDgradNCxHWx | |||
| # skip rule | |||
| def filter_tile_with_layout(tile: TileDescription, layout: LayoutType) -> bool: | |||
| return layout == LayoutType.TensorNC32HW32 and \ | |||
| tile.threadblock_shape[0] % 32 != 0 | |||
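| # e.g. tiles with threadblock_shape[0] == 16 are skipped when the output layout is NC32HW32 | |||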
| # rule for bias_type and epilogues | |||
| def get_bias_type_and_epilogues(tile: TileDescription, \ | |||
| out_dtype: DataType) -> Tuple[DataType, List[EpilogueFunctor]]: | |||
| if tile.math_instruction.element_accumulator == DataType.s32 and \ | |||
| out_dtype != DataType.f32: | |||
| bias_type = DataType.s32 | |||
| if tile.math_instruction.element_b == DataType.u4: | |||
| epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp] | |||
| else: | |||
| epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp, \ | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwishClamp] | |||
| elif tile.math_instruction.element_accumulator == DataType.f32 or \ | |||
| out_dtype == DataType.f32: | |||
| bias_type = DataType.f32 | |||
| epilogues = [EpilogueFunctor.BiasAddLinearCombination, EpilogueFunctor.BiasAddLinearCombinationRelu, \ | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwish] | |||
| return bias_type, epilogues | |||
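| # e.g. an s32 accumulator with s8 output selects an s32 bias tensor and the clamped epilogues | |||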
| # rule for filter alignment | |||
| def get_flt_align(tile: TileDescription) -> int: | |||
| nonlocal flt_align | |||
| if tile.math_instruction.opcode_class == OpcodeClass.Simt \ | |||
| and tile.math_instruction.element_accumulator == DataType.s32: | |||
| thread_num = tile.warp_count[0] * tile.warp_count[1] * tile.warp_count[2] * 32 | |||
| flt_block = tile.threadblock_shape[0] * tile.threadblock_shape[2] \ | |||
| * DataTypeSize[tile.math_instruction.element_a] | |||
| load_per_thread = flt_block//thread_num | |||
| if load_per_thread >= 128: | |||
| flt_align = 128 | |||
| elif load_per_thread >= 64: | |||
| flt_align = 64 | |||
| else: | |||
| assert load_per_thread >= 32 | |||
| flt_align = 32 | |||
| return flt_align | |||
| def get_dst_align(tile: TileDescription, out_layout: LayoutType) -> int: | |||
| nonlocal dst_align | |||
| if tile.math_instruction.opcode_class == OpcodeClass.TensorOp \ | |||
| and out_layout == LayoutType.TensorNC4HW4: | |||
| dst_align = 32 | |||
| return dst_align | |||
| def filter_epilogue_with_conv_kind(epilogue: EpilogueFunctor, conv_kind: ConvKind) -> bool: | |||
| return conv_kind == ConvKind.Dgrad \ | |||
| and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp | |||
| # loop over all tile descriptions | |||
| for tile in tile_descriptions: | |||
| if filter_tile_with_layout(tile, dst_layout): | |||
| continue | |||
| bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type) | |||
| flt_align = get_flt_align(tile) | |||
| dst_align = get_dst_align(tile, dst_layout) | |||
| for epilogue in epilogues: | |||
| if filter_epilogue_with_conv_kind(epilogue, conv_kind): | |||
| continue | |||
| if dst_type == DataType.f32: | |||
| bias_type = DataType.f32 | |||
| # | |||
| src = TensorDescription(tile.math_instruction.element_b, src_layout, int(src_align / DataTypeSize[tile.math_instruction.element_b])) | |||
| flt = TensorDescription(tile.math_instruction.element_a, flt_layout, int(flt_align / DataTypeSize[tile.math_instruction.element_a])) | |||
| bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))) | |||
| dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type])) | |||
| new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode) | |||
| operations.append(new_operation) | |||
| if not skip_unity_kernel: | |||
| new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode) | |||
| operations.append(new_operation) | |||
| return operations | |||
| ################################################################################################### | |||
| # | |||
| # Emitters functions for all targets | |||
| # | |||
| ################################################################################################### | |||
| class EmitConv2dConfigurationLibrary: | |||
| def __init__(self, operation_path, configuration_name): | |||
| self.configuration_name = configuration_name | |||
| self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) | |||
| self.instance_emitter = EmitConv2dInstance() | |||
| self.instance_template = """ | |||
| ${operation_instance} | |||
| // Derived class | |||
| struct ${operation_name} : | |||
| public ${operation_name}_base { }; | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| """ | |||
| self.header_template = """ | |||
| /* | |||
| Generated by conv2d_operation.py - Do not edit. | |||
| */ | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| #include "cutlass/cutlass.h" | |||
| #include "cutlass/library/library.h" | |||
| #include "cutlass/library/manifest.h" | |||
| #include "library_internal.h" | |||
| #include "conv2d_operation.h" | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| """ | |||
| self.configuration_header = """ | |||
| namespace cutlass { | |||
| namespace library { | |||
| // Initialize all instances | |||
| void initialize_${configuration_name}(Manifest &manifest) { | |||
| """ | |||
| self.configuration_instance = """ | |||
| using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< | |||
| ${operation_name}>; | |||
| manifest.append(new cutlass::library::Conv2dOperation< | |||
| Operation_${operation_name}>( | |||
| "${operation_name}")); | |||
| """ | |||
| self.configuration_epilogue = """ | |||
| } | |||
| """ | |||
| self.epilogue_template = """ | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| } // namespace library | |||
| } // namespace cutlass | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| """ | |||
| # | |||
| def __enter__(self): | |||
| self.configuration_file = open(self.configuration_path, "w") | |||
| self.configuration_file.write(SubstituteTemplate(self.header_template, { | |||
| 'configuration_name': self.configuration_name | |||
| })) | |||
| self.operations = [] | |||
| return self | |||
| # | |||
| def emit(self, operation): | |||
| self.operations.append(operation) | |||
| self.configuration_file.write(SubstituteTemplate(self.instance_template, { | |||
| 'configuration_name': self.configuration_name, | |||
| 'operation_name': operation.procedural_name(), | |||
| 'operation_instance': self.instance_emitter.emit(operation) | |||
| })) | |||
| # | |||
| def __exit__(self, exception_type, exception_value, traceback): | |||
| self.configuration_file.write(SubstituteTemplate(self.configuration_header, { | |||
| 'configuration_name': self.configuration_name | |||
| })) | |||
| for operation in self.operations: | |||
| self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { | |||
| 'configuration_name': self.configuration_name, | |||
| 'operation_name': operation.procedural_name() | |||
| })) | |||
| self.configuration_file.write(self.configuration_epilogue) | |||
| self.configuration_file.write(self.epilogue_template) | |||
| self.configuration_file.close() | |||
| ################################################################################################### | |||
| ################################################################################################### | |||
| # Emitters for Conv Kernel Wrapper | |||
| # | |||
| ################################################################################################### | |||
| class EmitConvSingleKernelWrapper(): | |||
| def __init__(self, kernel_path, operation, wrapper_path): | |||
| self.kernel_path = kernel_path | |||
| self.wrapper_path = wrapper_path | |||
| self.operation = operation | |||
| self.conv_wrappers = { \ | |||
| ConvKind.Fprop: """ | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| """, \ | |||
| ConvKind.Dgrad: """ | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper<Deconvolution>( | |||
| const typename Deconvolution::ElementSrc* d_src, | |||
| const typename Deconvolution::ElementFilter* d_filter, | |||
| const typename Deconvolution::ElementBias* d_bias, | |||
| const typename Deconvolution::ElementDst* d_z, | |||
| typename Deconvolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Deconvolution::ConvolutionParameter const& conv_param, | |||
| typename Deconvolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream); | |||
| """, \ | |||
| } | |||
| if self.operation.conv_kind == ConvKind.Fprop: | |||
| self.instance_emitter = EmitConv2dInstance() | |||
| else: | |||
| assert self.operation.conv_kind == ConvKind.Dgrad | |||
| self.instance_emitter = EmitDeconvInstance() | |||
| self.header_template = """ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "${wrapper_path}" | |||
| """ | |||
| self.instance_template = """ | |||
| ${operation_instance} | |||
| """ | |||
| self.wrapper_template = """ | |||
| ${wrapper_instance} | |||
| """ | |||
| self.epilogue_template = """ | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| """ | |||
| # | |||
| def __enter__(self): | |||
| self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) | |||
| self.kernel_file = LazyFile(self.kernel_path) | |||
| self.kernel_file.write(SubstituteTemplate(self.header_template, { | |||
| 'wrapper_path': self.wrapper_path, | |||
| })) | |||
| return self | |||
| # | |||
| def emit(self): | |||
| self.kernel_file.write(SubstituteTemplate(self.instance_template, { | |||
| 'operation_instance': self.instance_emitter.emit(self.operation), | |||
| })) | |||
| # emit wrapper | |||
| wrapper = SubstituteTemplate(self.wrapper_template, { | |||
| 'wrapper_instance': self.conv_wrappers[self.operation.conv_kind], | |||
| }) | |||
| self.kernel_file.write(wrapper) | |||
| # | |||
| def __exit__(self, exception_type, exception_value, traceback): | |||
| self.kernel_file.write(self.epilogue_template) | |||
| self.kernel_file.close() | |||
| ################################################################################################### | |||
| ################################################################################################### | |||
| @@ -0,0 +1,38 @@ | |||
| from generator import ( | |||
| GenerateGemmOperations, | |||
| GenerateGemvOperations, | |||
| GenerateConv2dOperations, | |||
| GenerateDeconvOperations, | |||
| ) | |||
| class GenArg: | |||
| def __init__(self, gen_op, gen_type): | |||
| self.operations = gen_op | |||
| self.type = gen_type | |||
| def write_op_list(f, gen_op, gen_type): | |||
| if gen_op == "gemm": | |||
| operations = GenerateGemmOperations(GenArg(gen_op, gen_type)) | |||
| elif gen_op == "gemv": | |||
| operations = GenerateGemvOperations(GenArg(gen_op, gen_type)) | |||
| elif gen_op == "conv2d": | |||
| operations = GenerateConv2dOperations(GenArg(gen_op, gen_type)) | |||
| elif gen_op == "deconv": | |||
| operations = GenerateDeconvOperations(GenArg(gen_op, gen_type)) | |||
| for op in operations: | |||
| f.write(' "%s.cu",\n' % op.procedural_name()) | |||
| if __name__ == "__main__": | |||
| with open("list.bzl", "w") as f: | |||
| f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n") | |||
| f.write("cutlass_gen_list = [\n") | |||
| write_op_list(f, "gemm", "simt") | |||
| write_op_list(f, "gemv", "simt") | |||
| write_op_list(f, "deconv", "simt") | |||
| write_op_list(f, "conv2d", "simt") | |||
| write_op_list(f, "conv2d", "tensorop8816") | |||
| write_op_list(f, "conv2d", "tensorop8832") | |||
| f.write("]") | |||
| @@ -0,0 +1,651 @@ | |||
| # | |||
| # \file generator.py | |||
| # | |||
| # \brief Generates the CUTLASS Library's instances | |||
| # | |||
| import enum | |||
| import os.path | |||
| import shutil | |||
| import argparse | |||
| from library import * | |||
| from manifest import * | |||
| ################################################################################################### | |||
| # | |||
| def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): | |||
| # by default, use the latest CUDA Toolkit version | |||
| cuda_version = [11, 0, 132] | |||
| # Update cuda_version based on parsed string | |||
| if semantic_ver_string != '': | |||
| for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]): | |||
| if i < len(cuda_version): | |||
| cuda_version[i] = x | |||
| else: | |||
| cuda_version.append(x) | |||
| return cuda_version >= [major, minor, patch] | |||
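| # e.g. CudaToolkitVersionSatisfies("10.2", 10, 1) -> True ([10, 2, 132] >= [10, 1, 0]) | |||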
| ################################################################################################### | |||
| ################################################################################################### | |||
| # | |||
| def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ | |||
| alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \ | |||
| swizzling_functor = SwizzlingFunctor.Identity8): | |||
| if complex_transforms is None: | |||
| complex_transforms = [(ComplexTransform.none, ComplexTransform.none),] | |||
| element_a, element_b, element_c, element_epilogue = data_type | |||
| operations = [] | |||
| # by default, only generate the largest tile and largest alignment | |||
| if manifest.args.kernels == '': | |||
| tile_descriptions = [tile_descriptions[0],] | |||
| alignment_constraints = [alignment_constraints[0],] | |||
| for layout in layouts: | |||
| for tile_description in tile_descriptions: | |||
| for alignment in alignment_constraints: | |||
| for complex_transform in complex_transforms: | |||
| alignment_c = min(8, alignment) | |||
| A = TensorDescription(element_a, layout[0], alignment, complex_transform[0]) | |||
| B = TensorDescription(element_b, layout[1], alignment, complex_transform[1]) | |||
| C = TensorDescription(element_c, layout[2], alignment_c) | |||
| new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \ | |||
| tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor) | |||
| manifest.append(new_operation) | |||
| operations.append(new_operation) | |||
| return operations | |||
| ########################################################################################################### | |||
| # ConvolutionOperator support variations | |||
| # ____________________________________________________________________ | |||
| # ConvolutionOperator | Analytic | Optimized | |||
| # ____________________________________________________________________ | |||
| # | Fprop | (strided) | (strided) | |||
| # | Dgrad | (strided, unity*) | (unity) | |||
| # | Wgrad | (strided) | (strided) | |||
| # ____________________________________________________________________ | |||
| # | |||
| # Note: Operators marked (*) are supported but not generated, to keep the instantiated kernel count low | |||
| ########################################################################################################### | |||
| # Convolution for 2D operations | |||
| def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ | |||
| conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): | |||
| element_a, element_b, element_c, element_epilogue = data_type | |||
| # one exceptional case | |||
| alignment_c = min(8, alignment) | |||
| # iterator algorithm (analytic and optimized) | |||
| iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] | |||
| # by default, only generate the largest tile size | |||
| if manifest.args.kernels == '': | |||
| tile_descriptions = [tile_descriptions[0],] | |||
| operations = [] | |||
| for tile in tile_descriptions: | |||
| for conv_kind in conv_kinds: | |||
| for iterator_algorithm in iterator_algorithms: | |||
| A = TensorDescription(element_a, layout[0], alignment) | |||
| B = TensorDescription(element_b, layout[1], alignment) | |||
| C = TensorDescription(element_c, layout[2], alignment_c) | |||
| # unity stride only for Optimized Dgrad | |||
| if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): | |||
| new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ | |||
| A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor) | |||
| manifest.append(new_operation) | |||
| operations.append(new_operation) | |||
| # strided dgrad is not supported by Optimized Dgrad | |||
| if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): | |||
| continue | |||
| # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) | |||
| new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ | |||
| A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) | |||
| manifest.append(new_operation) | |||
| operations.append(new_operation) | |||
| return operations | |||
| ################################################################################################### | |||
| ################################################################################################### | |||
| def GenerateConv2d_Simt(args): | |||
| operations = [] | |||
| layouts = [ | |||
| (LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4), | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [1, 1, 4], \ | |||
| DataType.s8, DataType.s8, DataType.s32, \ | |||
| OpcodeClass.Simt, \ | |||
| MathOperation.multiply_add), | |||
| ] | |||
| dst_layouts = [ | |||
| LayoutType.TensorNC4HW4, | |||
| LayoutType.TensorNC32HW32, | |||
| LayoutType.TensorNHWC, | |||
| LayoutType.TensorNHWC, | |||
| LayoutType.TensorNCHW | |||
| ] | |||
| dst_types = [ | |||
| DataType.s8, | |||
| DataType.s8, | |||
| DataType.u4, | |||
| DataType.s4, | |||
| DataType.f32, | |||
| ] | |||
| max_cc = 1024 | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| for dst_type, dst_layout in zip(dst_types, dst_layouts): | |||
| if dst_type == DataType.s4 or dst_type == DataType.u4: | |||
| min_cc = 75 | |||
| skip_unity_kernel = True | |||
| else: | |||
| min_cc = 61 | |||
| skip_unity_kernel = False | |||
| tile_descriptions = [ | |||
| TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 64, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 64, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], | |||
| dst_layout, dst_type, min_cc, 32, 32, 32, | |||
| skip_unity_kernel) | |||
| return operations | |||
| def GenerateConv2d_TensorOp_8816(args): | |||
| operations = [] | |||
| layouts = [ | |||
| (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32), | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [8, 8, 16], \ | |||
| DataType.s8, DataType.s8, DataType.s32, \ | |||
| OpcodeClass.TensorOp, \ | |||
| MathOperation.multiply_add_saturate), | |||
| ] | |||
| dst_layouts = [ | |||
| LayoutType.TensorNC32HW32, | |||
| LayoutType.TensorNC4HW4, | |||
| ] | |||
| dst_types = [ | |||
| DataType.s8, | |||
| DataType.s8, | |||
| ] | |||
| min_cc = 75 | |||
| max_cc = 1024 | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| for dst_type, dst_layout in zip(dst_types, dst_layouts): | |||
| if dst_layout == LayoutType.TensorNC32HW32: | |||
| tile_descriptions = [ | |||
| TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 64, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| else: | |||
| assert dst_layout == LayoutType.TensorNC4HW4 | |||
| tile_descriptions = [ | |||
| TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], | |||
| dst_layout, dst_type, min_cc, 128, 128, 64, | |||
| False) | |||
| return operations | |||
| def GenerateConv2d_TensorOp_8832(args): | |||
| operations = [] | |||
| layouts = [ | |||
| (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64), | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [8, 8, 32], \ | |||
| DataType.s4, DataType.s4, DataType.s32, \ | |||
| OpcodeClass.TensorOp, \ | |||
| MathOperation.multiply_add_saturate), \ | |||
| MathInstruction( \ | |||
| [8, 8, 32], \ | |||
| DataType.s4, DataType.u4, DataType.s32, \ | |||
| OpcodeClass.TensorOp, \ | |||
| MathOperation.multiply_add_saturate) | |||
| ] | |||
| dst_layouts = [ | |||
| LayoutType.TensorNC64HW64, | |||
| ] | |||
| min_cc = 75 | |||
| max_cc = 1024 | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| for dst_layout in dst_layouts: | |||
| dst_type = math_inst.element_b | |||
| tile_descriptions = [ | |||
| TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], | |||
| dst_layout, dst_type, min_cc, 128, 128, 64, | |||
| True) | |||
| layouts_nhwc = [ | |||
| (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), | |||
| (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64), | |||
| (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128), | |||
| ] | |||
| dst_layouts_nhwc = [ | |||
| LayoutType.TensorNHWC, | |||
| ] | |||
| for math_inst in math_instructions: | |||
| for layout in layouts_nhwc: | |||
| for dst_layout in dst_layouts_nhwc: | |||
| dst_type = math_inst.element_b | |||
| tile_descriptions = [ | |||
| TileDescription([128, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], | |||
| dst_layout, dst_type, min_cc, layout[2], layout[2], 32, | |||
| False, ImplicitGemmMode.GemmTn) | |||
| return operations | |||
| def GenerateDeconv_Simt(args): | |||
| operations = [] | |||
| layouts = [ | |||
| (LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4), | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [1, 1, 4], \ | |||
| DataType.s8, DataType.s8, DataType.s32, \ | |||
| OpcodeClass.Simt, \ | |||
| MathOperation.multiply_add), | |||
| ] | |||
| dst_layouts = [ | |||
| LayoutType.TensorNC4HW4, | |||
| ] | |||
| dst_types = [ | |||
| DataType.s8, | |||
| ] | |||
| min_cc = 61 | |||
| max_cc = 1024 | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| for dst_type, dst_layout in zip(dst_types, dst_layouts): | |||
| tile_descriptions = [ | |||
| TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1], | |||
| dst_layout, dst_type, min_cc, 32, 32, 32, | |||
| True) | |||
| return operations | |||
| ################################################################################ | |||
| # parameters | |||
| # Edge - for tiles, the edges represent the length of one side | |||
| # Ratio - the maximum ratio between 2 edges, which limits the skinniness of tiles | |||
| # MaxEdge - maximum length of each edge | |||
| # Min/Max - minimum/maximum of the product of edge lengths | |||
| ################################################################################ | |||
| warpsPerThreadblockEdge = [1, 2, 4, 8, 16] | |||
| warpsPerThreadblockRatio = 2 | |||
| warpsPerThreadblockMax = 16 | |||
| # NOTE 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases | |||
| warpShapeEdges = [8, 16, 32, 64, 128, 256] | |||
| warpShapeRatio = 4 | |||
| warpShapeMax = 64*64 | |||
| warpShapeMin = 8*8 | |||
| threadblockEdgeMax = 256 | |||
| # char, type bits/elem, max tile, L0 threadblock tiles | |||
| precisions = { | |||
| "c" : [ "cutlass::complex<float>", 64, 64*128, [ [ 64, 128], [ 64, 32] ] ], | |||
| "d" : [ "double", 64, 64*64, [ [ 64, 64], [ 32, 32] ] ], | |||
| "h" : [ "cutlass::half_t", 16, 128*256, [ [256, 128], [ 64, 128], [ 64, 32] ] ], | |||
| "i" : [ "int", 32, 128*128, [ [128, 64], [ 16, 32] ] ], | |||
| "s" : [ "float", 32, 128*128, [ [128, 256], [128, 128], [ 64, 64] ] ], | |||
| "z" : [ "cutlass::complex<double>", 128, 64*64, [ [ 32, 64], [ 16, 32] ] ], | |||
| } | |||
| # L1 will have a single kernel for every unique shape | |||
| # L2 will have everything else | |||
| def GenerateGemm_Simt(args): | |||
| ################################################################################ | |||
| # warps per threadblock | |||
| ################################################################################ | |||
| warpsPerThreadblocks = [] | |||
| for warpsPerThreadblock0 in warpsPerThreadblockEdge: | |||
| for warpsPerThreadblock1 in warpsPerThreadblockEdge: | |||
| if warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio \ | |||
| and warpsPerThreadblock1 / warpsPerThreadblock0 <= warpsPerThreadblockRatio \ | |||
| and warpsPerThreadblock0 * warpsPerThreadblock1 <= warpsPerThreadblockMax: | |||
| warpsPerThreadblocks.append([warpsPerThreadblock0, | |||
| warpsPerThreadblock1]) | |||
| ################################################################################ | |||
| # warp shapes | |||
| ################################################################################ | |||
| warpNumThreads = 32 | |||
| warpShapes = [] | |||
| for warp0 in warpShapeEdges: | |||
| for warp1 in warpShapeEdges: | |||
| if warp0 / warp1 <= warpShapeRatio \ | |||
| and warp1 / warp0 <= warpShapeRatio \ | |||
| and warp0 * warp1 <= warpShapeMax \ | |||
| and warp0*warp1 > warpShapeMin: | |||
| warpShapes.append([warp0, warp1]) | |||
| # sgemm | |||
| precisionType, precisionBits, threadblockMaxElements, threadblockTilesL0 = precisions["s"] | |||
| layouts = [ | |||
| (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), # nn | |||
| (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor), # nt | |||
| (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), # tn | |||
| (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), # tt | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [1, 1, 1], \ | |||
| DataType.f32, DataType.f32, DataType.f32, \ | |||
| OpcodeClass.Simt, \ | |||
| MathOperation.multiply_add), | |||
| ] | |||
| min_cc = 50 | |||
| max_cc = 1024 | |||
| operations = [] | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| data_type = [ | |||
| math_inst.element_a, | |||
| math_inst.element_b, | |||
| math_inst.element_accumulator, | |||
| math_inst.element_accumulator, | |||
| ] | |||
| tile_descriptions = [ | |||
| TileDescription([64, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([256, 64, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([256, 32, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 64, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 32, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 8, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 16, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([ 16, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| for warpsPerThreadblock in warpsPerThreadblocks: | |||
| for warpShape in warpShapes: | |||
| warpThreadsM = 0 | |||
| if warpShape[0] > warpShape[1]: | |||
| warpThreadsM = 8 | |||
| else: | |||
| warpThreadsM = 4 | |||
| warpThreadsN = warpNumThreads // warpThreadsM  # integer division keeps thread tile sizes integral | |||
| # skip shapes with conflicting rectangularity | |||
| # they are unlikely to be fastest | |||
| blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1] | |||
| blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1] | |||
| warpG = warpShape[0] > warpShape[1] | |||
| warpL = warpShape[0] < warpShape[1] | |||
| blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1]*2 | |||
| blockL2 = warpsPerThreadblock[0]*2 < warpsPerThreadblock[1] | |||
| warpG2 = warpShape[0] > warpShape[1]*2 | |||
| warpL2 = warpShape[0]*2 < warpShape[1] | |||
| if blockG2 and warpL: continue | |||
| if blockL2 and warpG: continue | |||
| if warpG2 and blockL: continue | |||
| if warpL2 and blockG: continue | |||
| # check threadblock ratios and max | |||
| threadblockTile = [warpShape[0]*warpsPerThreadblock[0], | |||
| warpShape[1]*warpsPerThreadblock[1]] | |||
| if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: continue | |||
| if threadblockTile[0] > threadblockEdgeMax: continue | |||
| if threadblockTile[1] > threadblockEdgeMax: continue | |||
| totalThreads = warpNumThreads*warpsPerThreadblock[0]*warpsPerThreadblock[1] | |||
| # calculate unroll | |||
| # ensure that every iteration at least a full load of A,B are done | |||
| unrollMin = 8 | |||
| unrollMin0 = totalThreads // threadblockTile[0] | |||
| unrollMin1 = totalThreads // threadblockTile[1] | |||
| unroll = max(unrollMin, unrollMin0, unrollMin1) | |||
| threadTileM = warpShape[0] // warpThreadsM | |||
| threadTileN = warpShape[1] // warpThreadsN | |||
| if threadTileM < 2 or threadTileN < 2: continue | |||
| if threadTileM*threadTileN*precisionBits > 8*8*32: continue | |||
| # epilogue currently only supports N < WarpNumThreads | |||
| if threadblockTile[1] < warpNumThreads: continue | |||
| # limit smem | |||
| smemBitsA = threadblockTile[0]*unroll*2*precisionBits | |||
| smemBitsB = threadblockTile[1]*unroll*2*precisionBits | |||
| smemKBytes = (smemBitsA+smemBitsB)/8/1024 | |||
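| # the factor of 2 above accounts for double buffering; 48 KB is the per-block static shared memory limit | |||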
| if (smemKBytes > 48): continue | |||
| tile = TileDescription([threadblockTile[0], threadblockTile[1], unroll], \ | |||
| 2, \ | |||
| [threadblockTile[0]//warpShape[0], threadblockTile[1]//warpShape[1], 1], \ | |||
| math_inst, min_cc, max_cc) | |||
| def filter(t: TileDescription) -> bool: | |||
| nonlocal tile | |||
| return t.threadblock_shape[0] == tile.threadblock_shape[0] and \ | |||
| t.threadblock_shape[1] == tile.threadblock_shape[1] and \ | |||
| t.threadblock_shape[2] == tile.threadblock_shape[2] and \ | |||
| t.warp_count[0] == tile.warp_count[0] and \ | |||
| t.warp_count[1] == tile.warp_count[1] and \ | |||
| t.warp_count[2] == tile.warp_count[2] and \ | |||
| t.stages == tile.stages | |||
| if not any(t for t in tile_descriptions if filter(t)): continue | |||
| operations += GeneratesGemm(tile, data_type, layout[0], layout[1], layout[2], min_cc) | |||
| return operations | |||
| # | |||
| def GenerateGemv_Simt(args): | |||
| threadBlockShape_N = [128, 64, 32] | |||
| ldgBits_A = [128, 64, 32] | |||
| ldgBits_B = [128, 64, 32] | |||
| layouts = [ | |||
| (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), | |||
| ] | |||
| math_instructions = [ | |||
| MathInstruction( \ | |||
| [1, 1, 1], \ | |||
| DataType.f32, DataType.f32, DataType.f32, \ | |||
| OpcodeClass.Simt, \ | |||
| MathOperation.multiply_add), | |||
| ] | |||
| min_cc = 50 | |||
| operations = [] | |||
| for math_inst in math_instructions: | |||
| for layout in layouts: | |||
| data_type = [ | |||
| math_inst.element_a, | |||
| math_inst.element_b, | |||
| math_inst.element_accumulator, | |||
| math_inst.element_accumulator, | |||
| ] | |||
| for threadblock_shape_n in threadBlockShape_N: | |||
| for align_a in ldgBits_A: | |||
| for align_b in ldgBits_B: | |||
| ldg_elements_a = align_a // DataTypeSize[math_inst.element_a] | |||
| ldg_elements_b = align_b // DataTypeSize[math_inst.element_b] | |||
| threadblock_shape_k = (256 * ldg_elements_a) // (threadblock_shape_n // ldg_elements_b) | |||
| threadblock_shape = [1, threadblock_shape_n, threadblock_shape_k] | |||
| thread_shape = [1, ldg_elements_b, ldg_elements_a] | |||
| operations.append(GeneratesGemv(math_inst, \ | |||
| threadblock_shape, \ | |||
| thread_shape, \ | |||
| data_type, \ | |||
| layout[0], \ | |||
| layout[1], \ | |||
| layout[2], \ | |||
| min_cc, \ | |||
| align_a, \ | |||
| align_b)) | |||
| return operations | |||
| # | |||
| def GenerateConv2dOperations(args): | |||
| if args.type == "simt": | |||
| return GenerateConv2d_Simt(args) | |||
| elif args.type == "tensorop8816": | |||
| return GenerateConv2d_TensorOp_8816(args) | |||
| else: | |||
| assert args.type == "tensorop8832", "operation conv2d only support" \ | |||
| "simt, tensorop8816 and tensorop8832. (got:{})".format(args.type) | |||
| return GenerateConv2d_TensorOp_8832(args) | |||
| def GenerateDeconvOperations(args): | |||
| assert args.type == "simt", "operation deconv only support" \ | |||
| "simt. (got:{})".format(args.type) | |||
| return GenerateDeconv_Simt(args) | |||
| def GenerateGemmOperations(args): | |||
| assert args.type == "simt", "operation gemm only support" \ | |||
| "simt. (got:{})".format(args.type) | |||
| return GenerateGemm_Simt(args) | |||
| def GenerateGemvOperations(args): | |||
| assert args.type == "simt", "operation gemv only support" \ | |||
| "simt. (got:{})".format(args.type) | |||
| return GenerateGemv_Simt(args) | |||
| ################################################################################################### | |||
| ################################################################################################### | |||
| if __name__ == "__main__": | |||
| parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels") | |||
| parser.add_argument("--operations", type=str, choices=['gemm', 'gemv', 'conv2d', 'deconv'], | |||
| required=True, help="Specifies the operation to generate (gemm, gemv, conv2d, deconv)") | |||
| parser.add_argument("output", type=str, help="output directory for CUTLASS kernel files") | |||
| parser.add_argument("--type", type=str, choices=['simt', 'tensorop8816', 'tensorop8832'], | |||
| default='simt', help="kernel type of CUTLASS kernel generator") | |||
| operation2wrapper_path = { | |||
| "gemm": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl", \ | |||
| "gemv": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl", \ | |||
| "conv2d": "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl", \ | |||
| "deconv": "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl", \ | |||
| } | |||
| args = parser.parse_args() | |||
| wrapper_path = operation2wrapper_path[args.operations] | |||
| if args.operations == "gemm": | |||
| operations = GenerateGemmOperations(args) | |||
| elif args.operations == "gemv": | |||
| operations = GenerateGemvOperations(args) | |||
| elif args.operations == "conv2d": | |||
| operations = GenerateConv2dOperations(args) | |||
| elif args.operations == "deconv": | |||
| operations = GenerateDeconvOperations(args) | |||
| if args.operations == "conv2d" or args.operations == "deconv": | |||
| for operation in operations: | |||
| with EmitConvSingleKernelWrapper(args.output, operation, wrapper_path) as emitter: | |||
| emitter.emit() | |||
| elif args.operations == "gemm" or args.operations == "gemv": | |||
| for operation in operations: | |||
| with EmitGemmSingleKernelWrapper(args.output, operation, wrapper_path) as emitter: | |||
| emitter.emit() | |||
| # | |||
| ################################################################################################### | |||
| @@ -0,0 +1,27 @@ | |||
| # | |||
| # \file lazy_file.py | |||
| # | |||
| # \brief LazyFile updates the target file only when the content is changed | |||
| # in order to avoid generating new cutlass kimpls each time cmake is called | |||
| # | |||
| import io | |||
| import os | |||
| class LazyFile: | |||
| def __init__(self, filename): | |||
| self.filename = filename | |||
| self.buffer = io.StringIO() | |||
| def write(self, data): | |||
| self.buffer.write(str(data)) | |||
| def close(self): | |||
| if os.path.isfile(self.filename): | |||
| with open(self.filename) as f: | |||
| old_data = f.read() | |||
| else: | |||
| old_data = "" | |||
| new_data = self.buffer.getvalue() | |||
| if old_data != new_data: | |||
| with open(self.filename, "w") as f: | |||
| f.write(new_data) | |||
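| # Usage sketch (illustrative): writes accumulate in an in-memory buffer, and the target | |||
| # file is only rewritten on close() if the content actually changed: | |||
| #   f = LazyFile("kernel.cu") | |||
| #   f.write("// generated kernel\n") | |||
| #   f.close()  # no-op when kernel.cu already holds exactly this content | |||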
| @@ -0,0 +1,614 @@ | |||
| # | |||
| # \file library.py | |||
| # | |||
| # \brief Data type, layout, and tag definitions shared by the CUTLASS generators | |||
| # | |||
| import re | |||
| ################################################################################################### | |||
| import enum | |||
| # The following block implements enum.auto() for Python 3.5 variants that don't include it, | |||
| # such as the default 3.5.2 on Ubuntu 16.04. | |||
| # | |||
| # https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility | |||
| try: | |||
| from enum import auto as enum_auto | |||
| except ImportError: | |||
| __cutlass_library_auto_enum = 0 | |||
| def enum_auto() -> int: | |||
| global __cutlass_library_auto_enum | |||
| i = __cutlass_library_auto_enum | |||
| __cutlass_library_auto_enum += 1 | |||
| return i | |||
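| # Quick illustration (comment only, not in the original file): on the | |||
| # fallback path, successive enum_auto() calls return consecutive integers, | |||
| # which is enough to give each enum member below a distinct value: | |||
| # | |||
| #   first = enum_auto()   # -> 0 on the fallback path | |||
| #   second = enum_auto()  # -> 1 | |||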
| ################################################################################################### | |||
| # | |||
| class GeneratorTarget(enum.Enum): | |||
| Library = enum_auto() | |||
| # | |||
| GeneratorTargetNames = { | |||
| GeneratorTarget.Library: 'library' | |||
| } | |||
| # | |||
| ################################################################################################### | |||
| # | |||
| class DataType(enum.Enum): | |||
| b1 = enum_auto() | |||
| u4 = enum_auto() | |||
| u8 = enum_auto() | |||
| u16 = enum_auto() | |||
| u32 = enum_auto() | |||
| u64 = enum_auto() | |||
| s4 = enum_auto() | |||
| s8 = enum_auto() | |||
| s16 = enum_auto() | |||
| s32 = enum_auto() | |||
| s64 = enum_auto() | |||
| f16 = enum_auto() | |||
| bf16 = enum_auto() | |||
| f32 = enum_auto() | |||
| tf32 = enum_auto() | |||
| f64 = enum_auto() | |||
| cf16 = enum_auto() | |||
| cbf16 = enum_auto() | |||
| cf32 = enum_auto() | |||
| ctf32 = enum_auto() | |||
| cf64 = enum_auto() | |||
| cs4 = enum_auto() | |||
| cs8 = enum_auto() | |||
| cs16 = enum_auto() | |||
| cs32 = enum_auto() | |||
| cs64 = enum_auto() | |||
| cu4 = enum_auto() | |||
| cu8 = enum_auto() | |||
| cu16 = enum_auto() | |||
| cu32 = enum_auto() | |||
| cu64 = enum_auto() | |||
| invalid = enum_auto() | |||
| # | |||
| ShortDataTypeNames = { | |||
| DataType.s32: 'i', | |||
| DataType.f16: 'h', | |||
| DataType.f32: 's', | |||
| DataType.f64: 'd', | |||
| DataType.cf32: 'c', | |||
| DataType.cf64: 'z', | |||
| } | |||
| # | |||
| DataTypeNames = { | |||
| DataType.b1: "b1", | |||
| DataType.u4: "u4", | |||
| DataType.u8: "u8", | |||
| DataType.u16: "u16", | |||
| DataType.u32: "u32", | |||
| DataType.u64: "u64", | |||
| DataType.s4: "s4", | |||
| DataType.s8: "s8", | |||
| DataType.s16: "s16", | |||
| DataType.s32: "s32", | |||
| DataType.s64: "s64", | |||
| DataType.f16: "f16", | |||
| DataType.bf16: "bf16", | |||
| DataType.f32: "f32", | |||
| DataType.tf32: "tf32", | |||
| DataType.f64: "f64", | |||
| DataType.cf16: "cf16", | |||
| DataType.cbf16: "cbf16", | |||
| DataType.cf32: "cf32", | |||
| DataType.ctf32: "ctf32", | |||
| DataType.cf64: "cf64", | |||
| DataType.cu4: "cu4", | |||
| DataType.cu8: "cu8", | |||
| DataType.cu16: "cu16", | |||
| DataType.cu32: "cu32", | |||
| DataType.cu64: "cu64", | |||
| DataType.cs4: "cs4", | |||
| DataType.cs8: "cs8", | |||
| DataType.cs16: "cs16", | |||
| DataType.cs32: "cs32", | |||
| DataType.cs64: "cs64", | |||
| } | |||
| DataTypeTag = { | |||
| DataType.b1: "cutlass::uint1b_t", | |||
| DataType.u4: "cutlass::uint4b_t", | |||
| DataType.u8: "uint8_t", | |||
| DataType.u16: "uint16_t", | |||
| DataType.u32: "uint32_t", | |||
| DataType.u64: "uint64_t", | |||
| DataType.s4: "cutlass::int4b_t", | |||
| DataType.s8: "int8_t", | |||
| DataType.s16: "int16_t", | |||
| DataType.s32: "int32_t", | |||
| DataType.s64: "int64_t", | |||
| DataType.f16: "cutlass::half_t", | |||
| DataType.bf16: "cutlass::bfloat16_t", | |||
| DataType.f32: "float", | |||
| DataType.tf32: "cutlass::tfloat32_t", | |||
| DataType.f64: "double", | |||
| DataType.cf16: "cutlass::complex<cutlass::half_t>", | |||
| DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>", | |||
| DataType.cf32: "cutlass::complex<float>", | |||
| DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>", | |||
| DataType.cf64: "cutlass::complex<double>", | |||
| DataType.cu4: "cutlass::complex<cutlass::uint4b_t>", | |||
| DataType.cu8: "cutlass::complex<cutlass::uint8_t>", | |||
| DataType.cu16: "cutlass::complex<cutlass::uint16_t>", | |||
| DataType.cu32: "cutlass::complex<cutlass::uint32_t>", | |||
| DataType.cu64: "cutlass::complex<cutlass::uint64_t>", | |||
| DataType.cs4: "cutlass::complex<cutlass::int4b_t>", | |||
| DataType.cs8: "cutlass::complex<cutlass::int8_t>", | |||
| DataType.cs16: "cutlass::complex<cutlass::int16_t>", | |||
| DataType.cs32: "cutlass::complex<cutlass::int32_t>", | |||
| DataType.cs64: "cutlass::complex<cutlass::int64_t>", | |||
| } | |||
| DataTypeSize = { | |||
| DataType.b1: 1, | |||
| DataType.u4: 4, | |||
| DataType.u8: 8, | |||
| DataType.u16: 16, | |||
| DataType.u32: 32, | |||
| DataType.u64: 64, | |||
| DataType.s4: 4, | |||
| DataType.s8: 8, | |||
| DataType.s16: 16, | |||
| DataType.s32: 32, | |||
| DataType.s64: 64, | |||
| DataType.f16: 16, | |||
| DataType.bf16: 16, | |||
| DataType.f32: 32, | |||
| DataType.tf32: 32, | |||
| DataType.f64: 64, | |||
| DataType.cf16: 32, | |||
| DataType.cbf16: 32, | |||
| DataType.cf32: 64, | |||
| DataType.ctf32: 64, | |||
| DataType.cf64: 128, | |||
| DataType.cu4: 8, | |||
| DataType.cu8: 16, | |||
| DataType.cu16: 32, | |||
| DataType.cu32: 64, | |||
| DataType.cu64: 128, | |||
| DataType.cs4: 8, | |||
| DataType.cs8: 16, | |||
| DataType.cs16: 32, | |||
| DataType.cs32: 64, | |||
| DataType.cs64: 128, | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| class ComplexTransform(enum.Enum): | |||
| none = enum_auto() | |||
| conj = enum_auto() | |||
| # | |||
| ComplexTransformTag = { | |||
| ComplexTransform.none: 'cutlass::ComplexTransform::kNone', | |||
| ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate', | |||
| } | |||
| # | |||
| RealComplexBijection = [ | |||
| (DataType.f16, DataType.cf16), | |||
| (DataType.f32, DataType.cf32), | |||
| (DataType.f64, DataType.cf64), | |||
| ] | |||
| # | |||
| def is_complex(data_type): | |||
| for r, c in RealComplexBijection: | |||
| if data_type == c: | |||
| return True | |||
| return False | |||
| # | |||
| def get_complex_from_real(real_type): | |||
| for r, c in RealComplexBijection: | |||
| if real_type == r: | |||
| return c | |||
| return DataType.invalid | |||
| # | |||
| def get_real_from_complex(complex_type): | |||
| for r, c in RealComplexBijection: | |||
| if complex_type == c: | |||
| return r | |||
| return DataType.invalid | |||
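| # Illustrative behaviour of the helpers above (comments added for clarity, | |||
| # not in the original file): | |||
| # | |||
| #   get_complex_from_real(DataType.f32)   # -> DataType.cf32 | |||
| #   get_real_from_complex(DataType.cf16)  # -> DataType.f16 | |||
| #   is_complex(DataType.f64)              # -> False; f64 is on the real side | |||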
| # | |||
| class ComplexMultiplyOp(enum.Enum): | |||
| multiply_add = enum_auto() | |||
| gaussian = enum_auto() | |||
| ################################################################################################### | |||
| # | |||
| class MathOperation(enum.Enum): | |||
| multiply_add = enum_auto() | |||
| multiply_add_saturate = enum_auto() | |||
| xor_popc = enum_auto() | |||
| multiply_add_fast_bf16 = enum_auto() | |||
| multiply_add_fast_f16 = enum_auto() | |||
| multiply_add_complex = enum_auto() | |||
| multiply_add_complex_gaussian = enum_auto() | |||
| # | |||
| MathOperationTag = { | |||
| MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', | |||
| MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', | |||
| MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', | |||
| MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', | |||
| MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', | |||
| MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', | |||
| MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| class LayoutType(enum.Enum): | |||
| ColumnMajor = enum_auto() | |||
| RowMajor = enum_auto() | |||
| ColumnMajorInterleaved2 = enum_auto() | |||
| RowMajorInterleaved2 = enum_auto() | |||
| ColumnMajorInterleaved32 = enum_auto() | |||
| RowMajorInterleaved32 = enum_auto() | |||
| ColumnMajorInterleaved64 = enum_auto() | |||
| RowMajorInterleaved64 = enum_auto() | |||
| TensorNHWC = enum_auto() | |||
| TensorNDHWC = enum_auto() | |||
| TensorNCHW = enum_auto() | |||
| TensorNGHWC = enum_auto() | |||
| TensorNC4HW4 = enum_auto() | |||
| TensorC4RSK4 = enum_auto() | |||
| TensorNC8HW8 = enum_auto() | |||
| TensorNC16HW16 = enum_auto() | |||
| TensorNC32HW32 = enum_auto() | |||
| TensorNC64HW64 = enum_auto() | |||
| TensorC32RSK32 = enum_auto() | |||
| TensorC64RSK64 = enum_auto() | |||
| TensorK4RSC4 = enum_auto() | |||
| # | |||
| LayoutTag = { | |||
| LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', | |||
| LayoutType.RowMajor: 'cutlass::layout::RowMajor', | |||
| LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', | |||
| LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', | |||
| LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', | |||
| LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', | |||
| LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', | |||
| LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>', | |||
| LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC', | |||
| LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', | |||
| LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', | |||
| LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', | |||
| LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>', | |||
| LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>', | |||
| LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>', | |||
| LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>', | |||
| LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', | |||
| LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', | |||
| LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', | |||
| LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', | |||
| LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>', | |||
| } | |||
| # | |||
| TransposedLayout = { | |||
| LayoutType.ColumnMajor: LayoutType.RowMajor, | |||
| LayoutType.RowMajor: LayoutType.ColumnMajor, | |||
| LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, | |||
| LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, | |||
| LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, | |||
| LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, | |||
| LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, | |||
| LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64, | |||
| LayoutType.TensorNHWC: LayoutType.TensorNHWC | |||
| } | |||
| # | |||
| ShortLayoutTypeNames = { | |||
| LayoutType.ColumnMajor: 'n', | |||
| LayoutType.ColumnMajorInterleaved2: 'n2', | |||
| LayoutType.ColumnMajorInterleaved32: 'n32', | |||
| LayoutType.ColumnMajorInterleaved64: 'n64', | |||
| LayoutType.RowMajor: 't', | |||
| LayoutType.RowMajorInterleaved2: 't2', | |||
| LayoutType.RowMajorInterleaved32: 't32', | |||
| LayoutType.RowMajorInterleaved64: 't64', | |||
| LayoutType.TensorNHWC: 'nhwc', | |||
| LayoutType.TensorNDHWC: 'ndhwc', | |||
| LayoutType.TensorNCHW: 'nchw', | |||
| LayoutType.TensorNGHWC: 'nghwc', | |||
| LayoutType.TensorNC4HW4: 'nc4hw4', | |||
| LayoutType.TensorC4RSK4: 'c4rsk4', | |||
| LayoutType.TensorNC8HW8: 'nc8hw8', | |||
| LayoutType.TensorNC16HW16: 'nc16hw16', | |||
| LayoutType.TensorNC32HW32: 'nc32hw32', | |||
| LayoutType.TensorNC64HW64: 'nc64hw64', | |||
| LayoutType.TensorC32RSK32: 'c32rsk32', | |||
| LayoutType.TensorC64RSK64: 'c64rsk64', | |||
| LayoutType.TensorK4RSC4: 'k4rsc4', | |||
| } | |||
| # | |||
| ShortComplexLayoutNames = { | |||
| (LayoutType.ColumnMajor, ComplexTransform.none): 'n', | |||
| (LayoutType.ColumnMajor, ComplexTransform.conj): 'c', | |||
| (LayoutType.RowMajor, ComplexTransform.none): 't', | |||
| (LayoutType.RowMajor, ComplexTransform.conj): 'h' | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| class OpcodeClass(enum.Enum): | |||
| Simt = enum_auto() | |||
| TensorOp = enum_auto() | |||
| WmmaTensorOp = enum_auto() | |||
| OpcodeClassNames = { | |||
| OpcodeClass.Simt: 'simt', | |||
| OpcodeClass.TensorOp: 'tensorop', | |||
| OpcodeClass.WmmaTensorOp: 'wmma_tensorop', | |||
| } | |||
| OpcodeClassTag = { | |||
| OpcodeClass.Simt: 'cutlass::arch::OpClassSimt', | |||
| OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp', | |||
| OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp', | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| class OperationKind(enum.Enum): | |||
| Gemm = enum_auto() | |||
| Conv2d = enum_auto() | |||
| # | |||
| OperationKindNames = { | |||
| OperationKind.Gemm: 'gemm', | |||
| OperationKind.Conv2d: 'conv2d', | |||
| } | |||
| # | |||
| class Target(enum.Enum): | |||
| library = enum_auto() | |||
| ArchitectureNames = { | |||
| 50: 'maxwell', | |||
| 60: 'pascal', | |||
| 61: 'pascal', | |||
| 70: 'volta', | |||
| 75: 'turing', | |||
| 80: 'ampere', | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| def SubstituteTemplate(template, values): | |||
| text = template | |||
| changed = True | |||
| while changed: | |||
| changed = False | |||
| for key, value in values.items(): | |||
| regex = "\\$\\{%s\\}" % key | |||
| newtext = re.sub(regex, value, text) | |||
| if newtext != text: | |||
| changed = True | |||
| text = newtext | |||
| return text | |||
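| # Example of SubstituteTemplate (added for clarity, not in the original | |||
| # file). Substitution repeats until the text stops changing, so a value may | |||
| # itself contain further ${...} placeholders: | |||
| # | |||
| #   SubstituteTemplate("${name}_${layout}", {"name": "gemm", "layout": "nn"}) | |||
| #   # -> "gemm_nn" | |||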
| ################################################################################################### | |||
| # | |||
| class GemmKind(enum.Enum): | |||
| Gemm = enum_auto() | |||
| Sparse = enum_auto() | |||
| Universal = enum_auto() | |||
| PlanarComplex = enum_auto() | |||
| PlanarComplexArray = enum_auto() | |||
| SplitKParallel = enum_auto() | |||
| GemvBatchedStrided = enum_auto() | |||
| # | |||
| GemmKindNames = { | |||
| GemmKind.Gemm: "gemm", | |||
| GemmKind.Sparse: "spgemm", | |||
| GemmKind.Universal: "gemm", | |||
| GemmKind.PlanarComplex: "gemm_planar_complex", | |||
| GemmKind.PlanarComplexArray: "gemm_planar_complex_array", | |||
| GemmKind.SplitKParallel: "gemm_split_k_parallel", | |||
| GemmKind.GemvBatchedStrided: "gemv_batched_strided", | |||
| } | |||
| # | |||
| class EpilogueFunctor(enum.Enum): | |||
| LinearCombination = enum_auto() | |||
| LinearCombinationClamp = enum_auto() | |||
| BiasAddLinearCombination = enum_auto() | |||
| BiasAddLinearCombinationRelu = enum_auto() | |||
| BiasAddLinearCombinationHSwish = enum_auto() | |||
| BiasAddLinearCombinationClamp = enum_auto() | |||
| BiasAddLinearCombinationReluClamp = enum_auto() | |||
| BiasAddLinearCombinationHSwishClamp = enum_auto() | |||
| # | |||
| EpilogueFunctorTag = { | |||
| EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination', | |||
| EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp', | |||
| EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination', | |||
| EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu', | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish', | |||
| EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp', | |||
| EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp', | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp', | |||
| } | |||
| # | |||
| ShortEpilogueNames = { | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish', | |||
| EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu', | |||
| EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity', | |||
| EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish', | |||
| EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu', | |||
| EpilogueFunctor.BiasAddLinearCombination: 'identity', | |||
| } | |||
| # | |||
| class SwizzlingFunctor(enum.Enum): | |||
| Identity1 = enum_auto() | |||
| Identity2 = enum_auto() | |||
| Identity4 = enum_auto() | |||
| Identity8 = enum_auto() | |||
| ConvFpropNCxHWx = enum_auto() | |||
| ConvFpropNHWC = enum_auto() | |||
| ConvDgradNCxHWx = enum_auto() | |||
| # | |||
| SwizzlingFunctorTag = { | |||
| SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', | |||
| SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', | |||
| SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', | |||
| SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', | |||
| SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle', | |||
| SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle', | |||
| SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle', | |||
| } | |||
| ################################################################################################### | |||
| class ConvType(enum.Enum): | |||
| Convolution = enum_auto() | |||
| BatchConvolution = enum_auto() | |||
| Local = enum_auto() | |||
| LocalShare = enum_auto() | |||
| ConvTypeTag = { | |||
| ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution', | |||
| ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution', | |||
| ConvType.Local: 'cutlass::conv::ConvType::kLocal', | |||
| ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare', | |||
| } | |||
| # | |||
| class ConvKind(enum.Enum): | |||
| Fprop = enum_auto() | |||
| Dgrad = enum_auto() | |||
| Wgrad = enum_auto() | |||
| # | |||
| ConvKindTag = { | |||
| ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', | |||
| ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', | |||
| ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' | |||
| } | |||
| ConvKindNames = { | |||
| ConvKind.Fprop: 'fprop', | |||
| ConvKind.Dgrad: 'dgrad', | |||
| ConvKind.Wgrad: 'wgrad', | |||
| } | |||
| # | |||
| class IteratorAlgorithm(enum.Enum): | |||
| Analytic = enum_auto() | |||
| Optimized = enum_auto() | |||
| # | |||
| IteratorAlgorithmTag = { | |||
| IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', | |||
| IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', | |||
| } | |||
| IteratorAlgorithmNames = { | |||
| IteratorAlgorithm.Analytic: 'analytic', | |||
| IteratorAlgorithm.Optimized: 'optimized', | |||
| } | |||
| # | |||
| class StrideSupport(enum.Enum): | |||
| Strided = enum_auto() | |||
| Unity = enum_auto() | |||
| # | |||
| StrideSupportTag = { | |||
| StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', | |||
| StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', | |||
| } | |||
| StrideSupportNames = { | |||
| StrideSupport.Strided: '', | |||
| StrideSupport.Unity: 'unity_stride', | |||
| } | |||
| class ImplicitGemmMode(enum.Enum): | |||
| GemmNt = enum_auto() | |||
| GemmTn = enum_auto() | |||
| ImplicitGemmModeNames = { | |||
| ImplicitGemmMode.GemmNt: 'gemm_nt', | |||
| ImplicitGemmMode.GemmTn: 'gemm_tn', | |||
| } | |||
| ImplicitGemmModeTag = { | |||
| ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT', | |||
| ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN', | |||
| } | |||
| ################################################################################################### | |||
| # | |||
| class MathInstruction: | |||
| def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add): | |||
| self.instruction_shape = instruction_shape | |||
| self.element_a = element_a | |||
| self.element_b = element_b | |||
| self.element_accumulator = element_accumulator | |||
| self.opcode_class = opcode_class | |||
| self.math_operation = math_operation | |||
| # | |||
| class TileDescription: | |||
| def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute): | |||
| self.threadblock_shape = threadblock_shape | |||
| self.stages = stages | |||
| self.warp_count = warp_count | |||
| self.math_instruction = math_instruction | |||
| self.minimum_compute_capability = min_compute | |||
| self.maximum_compute_capability = max_compute | |||
| def procedural_name(self): | |||
| return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages) | |||
| # | |||
| class TensorDescription: | |||
| def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none): | |||
| self.element = element | |||
| self.layout = layout | |||
| self.alignment = alignment | |||
| self.complex_transform = complex_transform | |||
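| # Illustrative construction of the descriptors above (a sketch added for | |||
| # clarity, not in the original file; the shapes chosen are hypothetical): | |||
| # | |||
| #   math_inst = MathInstruction([1, 1, 4], DataType.s8, DataType.s8, | |||
| #                               DataType.s32, OpcodeClass.Simt) | |||
| #   tile = TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, 61, 1024) | |||
| #   tile.procedural_name()  # -> "128x128_32x2" | |||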
| ################################################################################################### | |||
| @@ -0,0 +1,578 @@ | |||
| # Generated by dnn/scripts/cutlass_generator/gen_list.py | |||
| cutlass_gen_list = [ | |||
| "cutlass_simt_sgemm_8x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_16x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_16x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_32x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_32x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_64x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_16x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_32x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_64x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_128x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_128x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_32x256_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_64x256_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_128x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_256x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_256x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu", | |||
| "cutlass_simt_sgemm_8x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_16x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_16x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_32x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_32x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_64x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_16x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_32x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_64x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_128x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_64x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_128x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_32x256_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_64x256_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_128x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_256x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_256x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu", | |||
| "cutlass_simt_sgemm_8x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_16x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_16x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_32x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_32x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_64x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_16x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_32x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_64x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_128x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_64x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_128x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_32x256_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_64x256_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_128x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_256x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_256x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu", | |||
| "cutlass_simt_sgemm_8x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_16x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_16x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_32x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_32x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_64x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_16x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_32x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_64x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_128x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_64x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_128x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_32x256_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_64x256_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_128x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_256x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_256x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu", | |||
| "cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu", | |||
| "cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu", | |||
| "cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu", | |||
| "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu", | |||
| "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu", | |||
| "cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
| ] | |||
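The list above is machine-generated and must stay in sync with what `generator.py` emits. A minimal sketch of how such a list could be rebuilt, assuming the generator writes one `.cu` file per kernel into its output directory — the loop body, variable names, and the exact `list.bzl` formatting are illustrative assumptions, not the project's actual script:

```python
# Hypothetical sketch: run the generator per (operation, type) pair into a
# scratch directory and record every .cu file it produces into list.bzl.
import pathlib
import subprocess
import tempfile

OPS = [
    ("gemm", "simt"), ("gemv", "simt"), ("deconv", "simt"),
    ("conv2d", "simt"), ("conv2d", "tensorop8816"), ("conv2d", "tensorop8832"),
]

names = []
for op, ty in OPS:
    with tempfile.TemporaryDirectory() as out:
        subprocess.check_call(
            ["python3", "generator.py", "--operations", op, "--type", ty, out])
        names += sorted(p.name for p in pathlib.Path(out).glob("*.cu"))

with open("list.bzl", "w") as f:
    f.write("cutlass_gen_list = [\n")
    f.writelines('    "%s",\n' % n for n in names)
    f.write("]\n")
```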
| @@ -0,0 +1,351 @@ | |||
| # | |||
| # \file manifest.py | |||
| # | |||
| # \brief Generates the CUTLASS Library's instances | |||
| # | |||
| import enum | |||
| import os.path | |||
| import re | |||
| import shutil | |||
| from library import * | |||
| from gemm_operation import * | |||
| from conv2d_operation import * | |||
| ################################################################################################### | |||
| class EmitOperationKindLibrary: | |||
| def __init__(self, generated_path, kind, args): | |||
| self.generated_path = generated_path | |||
| self.kind = kind | |||
| self.args = args | |||
| self.emitters = { | |||
| OperationKind.Gemm: EmitGemmConfigurationLibrary | |||
| , OperationKind.Conv2d: EmitConv2dConfigurationLibrary | |||
| } | |||
| self.configurations = [] | |||
| self.header_template = """ | |||
| /* | |||
| Generated by manifest.py - Do not edit. | |||
| */ | |||
| #include "cutlass/cutlass.h" | |||
| #include "cutlass/library/library.h" | |||
| #include "cutlass/library/manifest.h" | |||
| namespace cutlass { | |||
| namespace library { | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| """ | |||
| self.entry_template = """ | |||
| // | |||
| // Entry point to construct operations | |||
| // | |||
| void initialize_all_${operation_name}_operations(Manifest &manifest) { | |||
| """ | |||
| self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" | |||
| self.configuration_template = "  initialize_${configuration_name}(manifest);\n" | |||
| self.epilogue_template = """ | |||
| } | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| } // namespace library | |||
| } // namespace cutlass | |||
| """ | |||
| # | |||
| def __enter__(self): | |||
| self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind]) | |||
| os.mkdir(self.operation_path) | |||
| self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]) | |||
| self.top_level_file = open(self.top_level_path, "w") | |||
| self.top_level_file.write(self.header_template) | |||
| self.source_files = [self.top_level_path,] | |||
| return self | |||
| # | |||
| def emit(self, configuration_name, operations): | |||
| with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter: | |||
| for operation in operations: | |||
| configuration_emitter.emit(operation) | |||
| self.source_files.append(configuration_emitter.configuration_path) | |||
| self.configurations.append(configuration_name) | |||
| self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} )) | |||
| # | |||
| def __exit__(self, exception_type, exception_value, traceback): | |||
| self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]})) | |||
| for configuration_name in self.configurations: | |||
| self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name})) | |||
| self.top_level_file.write(self.epilogue_template) | |||
| self.top_level_file.close() | |||
| ################################################################################################### | |||
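For orientation: `EmitOperationKindLibrary` is a context manager — `__enter__` creates `generated/<kind>/` and opens the top-level `all_<kind>_operations.cu`, each `emit()` call writes one configuration file and records its prototype, and `__exit__` writes the `initialize_all_*` entry point. A minimal usage sketch; `generated_path`, `args`, and the populated `manifest` are assumed to already exist:

```python
# Sketch: drive the emitter for one operation kind, mirroring Manifest.emit() below.
kind = OperationKind.Conv2d
with EmitOperationKindLibrary(generated_path, kind, args) as emitter:
    for configuration_name, operations in manifest.operations[kind].items():
        emitter.emit(configuration_name, operations)
source_files = emitter.source_files  # the top-level .cu plus one file per configuration
```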
| ################################################################################################### | |||
| class Options: | |||
| def __init__(self): | |||
| pass | |||
| ################################################################################################### | |||
| # | |||
| class Manifest: | |||
| # | |||
| def __init__(self, args): | |||
| self.operations = {} | |||
| self.args = args | |||
| architectures = args.architectures.split(';') if len(args.architectures) else ['50',] | |||
| self.compute_capabilities = [int(x) for x in architectures] | |||
| self.selected_kernels = [] | |||
| if args.operations == 'all': | |||
| self.operations_enabled = [] | |||
| else: | |||
| operations_list = [ | |||
| OperationKind.Gemm | |||
| , OperationKind.Conv2d | |||
| ] | |||
| self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] | |||
| if args.kernels == 'all': | |||
| self.kernel_names = [] | |||
| else: | |||
| self.kernel_names = [x for x in args.kernels.split(',') if x != ''] | |||
| self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != ''] | |||
| if args.kernel_filter_file is None: | |||
| self.kernel_filter_list = [] | |||
| else: | |||
| self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||
| self.operation_count = 0 | |||
| self.operations_by_name = {} | |||
| self.top_level_prologue = ''' | |||
| #include "cutlass/library/library.h" | |||
| #include "cutlass/library/manifest.h" | |||
| namespace cutlass { | |||
| namespace library { | |||
| ${prototypes} | |||
| void initialize_all(Manifest &manifest) { | |||
| ''' | |||
| self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n' | |||
| self.top_level_epilogue = ''' | |||
| } | |||
| } // namespace library | |||
| } // namespace cutlass | |||
| ''' | |||
| def get_kernel_filters(self, kernel_list_file): | |||
| if os.path.isfile(kernel_list_file): | |||
| with open(kernel_list_file, 'r') as file_reader: | |||
| lines = [line.rstrip() for line in file_reader if not line.startswith("#")] | |||
| lines = [re.compile(line) for line in lines if line] | |||
| return lines | |||
| else: | |||
| return [] | |||
| def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||
| ''' Returns True if kernel_name matches any regex in kernel_filter_list; despite the name, matching kernels are kept by filter() ''' | |||
| for kernel_filter_re in kernel_filter_list: | |||
| if kernel_filter_re.search(kernel_name) is not None: | |||
| return True | |||
| return False | |||
| # | |||
| def _filter_string_matches(self, filter_string, haystack): | |||
| ''' Returns True if all '*'-separated substrings appear in the haystack in order ''' | |||
| substrings = filter_string.split('*') | |||
| for sub in substrings: | |||
| idx = haystack.find(sub) | |||
| if idx < 0: | |||
| return False | |||
| haystack = haystack[idx + len(sub):] | |||
| return True | |||
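Note that `*` acts here as an ordered-substring wildcard rather than a shell glob: every `*`-separated piece must occur in the haystack, in order, with nothing anchoring the match to the start or end of the name. A quick illustration (the helper never touches `self`, so it can be exercised unbound):

```python
name = "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32"
assert Manifest._filter_string_matches(None, "tensorop*relu*nc32hw32", name)
assert not Manifest._filter_string_matches(None, "relu*tensorop", name)  # pieces out of order
```

In `filter()` below, these patterns drive both the include list (`args.kernels`) and the exclude list (`args.ignore_kernels`); the exclude loop runs second, so it overrides the include list, and a non-empty kernel filter file then has the final say.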
| # | |||
| def filter(self, operation): | |||
| ''' Returns True if the operation passes compute-capability, kind, duplicate, and name-based filters ''' | |||
| # filter based on compute capability | |||
| enabled = False | |||
| for cc in self.compute_capabilities: | |||
| if cc >= operation.tile_description.minimum_compute_capability and \ | |||
| cc <= operation.tile_description.maximum_compute_capability: | |||
| enabled = True | |||
| break | |||
| if not enabled: | |||
| return False | |||
| if len(self.operations_enabled) and operation.operation_kind not in self.operations_enabled: | |||
| return False | |||
| # eliminate duplicates | |||
| if operation.procedural_name() in self.operations_by_name: | |||
| return False | |||
| # Filter based on list of valid substrings | |||
| if len(self.kernel_names): | |||
| name = operation.procedural_name() | |||
| enabled = False | |||
| # compare against the include list | |||
| for name_substr in self.kernel_names: | |||
| if self._filter_string_matches(name_substr, name): | |||
| enabled = True | |||
| break | |||
| # compare against the exclude list | |||
| for name_substr in self.ignore_kernel_names: | |||
| if self._filter_string_matches(name_substr, name): | |||
| enabled = False | |||
| break | |||
| if len(self.kernel_filter_list) > 0: | |||
| enabled = False | |||
| if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): | |||
| enabled = True | |||
| # TODO: filter based on compute data type | |||
| return enabled | |||
| # | |||
| # | |||
| def append(self, operation): | |||
| ''' | |||
| Inserts the operation. | |||
| operation_kind -> configuration_name -> [] | |||
| ''' | |||
| if self.filter(operation): | |||
| self.selected_kernels.append(operation.procedural_name()) | |||
| self.operations_by_name[operation.procedural_name()] = operation | |||
| # add the configuration | |||
| configuration_name = operation.configuration_name() | |||
| if operation.operation_kind not in self.operations: | |||
| self.operations[operation.operation_kind] = {} | |||
| if configuration_name not in self.operations[operation.operation_kind]: | |||
| self.operations[operation.operation_kind][configuration_name] = [] | |||
| self.operations[operation.operation_kind][configuration_name].append(operation) | |||
| self.operation_count += 1 | |||
| # | |||
| # | |||
| def emit(self, target = GeneratorTarget.Library): | |||
| operation_emitters = { | |||
| GeneratorTarget.Library: EmitOperationKindLibrary | |||
| } | |||
| generated_path = os.path.join(self.args.curr_build_dir, 'generated') | |||
| # create generated/ | |||
| if os.path.exists(generated_path): | |||
| shutil.rmtree(generated_path) | |||
| os.mkdir(generated_path) | |||
| source_files = [] | |||
| top_level_path = os.path.join(generated_path, 'initialize_all.cpp') | |||
| with open(top_level_path, 'w') as top_level_file: | |||
| if target == GeneratorTarget.Library: | |||
| source_files.append(top_level_path) | |||
| prototypes = [] | |||
| for operation_kind, configurations in self.operations.items(): | |||
| prototypes.append(SubstituteTemplate( | |||
| "void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||
| top_level_file.write(SubstituteTemplate(self.top_level_prologue, | |||
| {'prototypes': "\n".join(prototypes)})) | |||
| top_level_file.write(SubstituteTemplate( | |||
| self.top_level_reserve, {'operation_count': str(self.operation_count)})) | |||
| # for each operation kind, emit initializer for all configurations | |||
| for operation_kind, configurations in self.operations.items(): | |||
| with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: | |||
| for configuration_name, operations in configurations.items(): | |||
| operation_kind_emitter.emit(configuration_name, operations) | |||
| source_files += operation_kind_emitter.source_files | |||
| top_level_file.write(SubstituteTemplate( | |||
| " initialize_all_${operation_kind}_operations(manifest);\n", | |||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||
| top_level_file.write(self.top_level_epilogue) | |||
| # write the manifest.cmake file containing paths from all targets | |||
| manifest_path = os.path.join(generated_path, "manifest.cmake") | |||
| with open(manifest_path, "w") as manifest_file: | |||
| target_name = 'cutlass_library_objs' | |||
| target_text = SubstituteTemplate("""cutlass_target_sources( | |||
| ${target_name} | |||
| BATCH_SOURCES ON | |||
| PRIVATE | |||
| """, { 'target_name': target_name}) | |||
| manifest_file.write(target_text) | |||
| for source_file in source_files: | |||
| manifest_file.write(" %s\n" % str(source_file.replace('\\', '/'))) | |||
| manifest_file.write(")") | |||
| # | |||
| ################################################################################################### | |||
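Putting the pieces together, a driver for this manifest presumably builds an argument namespace, appends candidate operations, and emits. A minimal sketch under that assumption — the flag spellings and the `generate_all_operations` helper are placeholders; only the attribute names are taken from the code above:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--architectures", default="75")  # ';'-separated compute capabilities
parser.add_argument("--operations", default="all")    # e.g. "gemm,conv2d"
parser.add_argument("--kernels", default="all")
parser.add_argument("--ignore-kernels", dest="ignore_kernels", default="")
parser.add_argument("--kernel-filter-file", dest="kernel_filter_file", default=None)
parser.add_argument("--curr-build-dir", dest="curr_build_dir", default=".")
args = parser.parse_args()

manifest = Manifest(args)
for operation in generate_all_operations(args):  # placeholder for the real Generate* routines
    manifest.append(operation)                   # filter() decides what is kept
manifest.emit(GeneratorTarget.Library)           # writes generated/ and manifest.cmake
```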
| @@ -113,6 +113,31 @@ if(MGE_WITH_CUDA) | |||
| list(APPEND SOURCES ${SOURCES_}) | |||
| file(GLOB_RECURSE CUSOURCES cuda/*.cu) | |||
| set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) | |||
| set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) | |||
| function(gen_cutlass_kimpl op type) | |||
| set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) | |||
| file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) | |||
| execute_process( | |||
| COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR} | |||
| RESULT_VARIABLE gen_cutlass_result | |||
| OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||
| ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||
| ) | |||
| if (NOT gen_cutlass_result EQUAL 0) | |||
| message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") | |||
| endif() | |||
| endfunction() | |||
| gen_cutlass_kimpl(gemm simt) | |||
| gen_cutlass_kimpl(gemv simt) | |||
| gen_cutlass_kimpl(deconv simt) | |||
| gen_cutlass_kimpl(conv2d simt) | |||
| gen_cutlass_kimpl(conv2d tensorop8816) | |||
| gen_cutlass_kimpl(conv2d tensorop8832) | |||
| file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu) | |||
| list(APPEND SOURCES ${CUTLASS_SOURCES}) | |||
| list(APPEND SOURCES ${CUSOURCES}) | |||
| endif() | |||
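The same generation step can be reproduced outside of CMake. A sketch in Python that mirrors one `gen_cutlass_kimpl` invocation above, including its log capture; the paths are illustrative:

```python
# Re-run one generator invocation by hand, mirroring gen_cutlass_kimpl(conv2d tensorop8832).
import pathlib
import subprocess

gen = "dnn/scripts/cutlass_generator/generator.py"  # illustrative repo-relative path
out = pathlib.Path("build/cuda/cutlass/generated/conv2d_tensorop8832")
out.mkdir(parents=True, exist_ok=True)
with open(out / "gen_cutlass.log", "w") as log:     # capture stdout/stderr like the CMake rule
    subprocess.check_call(
        ["python3", gen, "--operations", "conv2d", "--type", "tensorop8832", str(out)],
        stdout=log, stderr=log,
    )
```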
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
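This and the following deletions remove previously checked-in kernel instances, which are now produced at build time by the rules above. Each file name encodes its template parameters; an informal decode of one instance, with field meanings inferred from the instantiations in these files rather than from a documented spec:

```python
name = "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16"
# cutlass_<opclass>_<elem>_<mma shape + conv kind>[_1x1]_<activation>_<elem(s)>_
#     <threadblock MxNxK>_<warp MxNxK>_<stages>_<src layout>_<filter layout>[_<dst layout>]
#
# i8832  -> the 8x8x32 integer tensor-op MMA shape (GemmShape<8, 8, 32> above)
# fprop  -> forward convolution; 1x1 -> variant specialized for 1x1 filters
# 2      -> pipeline stages; nhwc / nc16hw16 -> TensorNHWC src, TensorNCxHWx<16> filter
# A second element token after the activation (e.g. u4_s4) appears when the
# filter type differs from the src/dst type; a trailing layout token appears
# when the dst layout differs from the src layout (e.g. ..._c32rsk32_nc4hw4).
```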
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
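| In these instance names the first shape triple is the threadblock tile and the second is the warp tile, with the instruction shape fixed at GemmShape<8, 8, 32> for every tensorop8832 instance; the wider nc64hw64 variants also widen the epilogue vector from 8 to 16 int4 elements per access. A small sketch of the implied warp counts, assuming CUTLASS's usual partitioning of the threadblock tile across warps along M and N: | |||
| ```cpp | |||
| // Warps per threadblock implied by the tile pairs above, assuming the M and | |||
| // N extents of the threadblock tile are split across warps (an illustration | |||
| // of the convention, not CUTLASS's exact WarpCount computation). | |||
| constexpr int warps(int tb_m, int tb_n, int wp_m, int wp_n) { | |||
|     return (tb_m / wp_m) * (tb_n / wp_n); | |||
| } | |||
| // 128x128x128 threadblock over 64x64x128 warp tiles -> 4 warps per block | |||
| static_assert(warps(128, 128, 64, 64) == 4, "nc64hw64 instances"); | |||
| // 128x32x64 threadblock over 64x32x64 warp tiles -> 2 warps per block | |||
| static_assert(warps(128, 32, 64, 32) == 2, "small nhwc instances"); | |||
| ``` | |||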
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
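| The quoted instance names encode the full configuration. Read against the template arguments, a name such as cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64 breaks down into the element type, "i8832" for the 8x8x32 instruction shape, the conv kind, the epilogue, the threadblock and warp tiles, the stage count, and the src/filter layouts. The decoding sketched below is inferred from the instances in this diff, not read out of generator.py: | |||
| ```cpp | |||
| #include <cstdio> | |||
| #include <string> | |||
| // Hypothetical reconstruction of the naming scheme inferred from this diff. | |||
| // The "1x1" tag appears exactly on the variants built with the constant-load | |||
| // flag (need_load_from_const) off, and the uint4 instances spell out the | |||
| // element pair ("u4_s4") after the conv kind; both readings are inferences. | |||
| std::string instance_name(bool one_by_one, const std::string& epilogue, | |||
|                           int tbm, int tbn, int tbk, int wm, int wn, int wk, | |||
|                           int stages, const std::string& layouts) { | |||
|     char tiles[64]; | |||
|     std::snprintf(tiles, sizeof(tiles), "%dx%dx%d_%dx%dx%d_%d", | |||
|                   tbm, tbn, tbk, wm, wn, wk, stages); | |||
|     return "cutlass_tensorop_s4_i8832fprop_" + | |||
|            std::string(one_by_one ? "1x1_" : "") + epilogue + "_s4_" + | |||
|            tiles + "_" + layouts; | |||
| } | |||
| // instance_name(false, "identity", 128, 128, 128, 64, 64, 128, 2, | |||
| //               "nc64hw64_c64rsk64") reproduces the name quoted above. | |||
| ``` | |||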
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::int4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
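| // The BiasAddLinearCombination(Relu)Clamp epilogues used throughout appear | |||
| // to be parameterized as <ElementOutput, ElementsPerAccess, | |||
| // ElementAccumulator, ElementBias, ElementCompute>; e.g. <cutlass::int4b_t, | |||
| // 16, int32_t, int32_t, float> writes saturated int4 results 16 elements per | |||
| // access with float scaling. The parameter names are inferred, not confirmed | |||
| // by this diff. | |||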
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 32, 64>, | |||
| cutlass::gemm::GemmShape<64, 32, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<16>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 16, | |||
| 16, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<32>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // ignore warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorNCxHWx<8>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::layout::TensorNHWC, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<128, 64, 64>, | |||
| cutlass::gemm::GemmShape<64, 64, 64>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 8, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
| 2, | |||
| 8, | |||
| 8, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
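| // Note: relative to the nhwc_nc32hw32 instance above, this nc8hw8 variant | |||
| // only narrows the filter interleave (TensorNCxHWx<8>) and the two access- | |||
| // size arguments (8, 8); the tile shapes and epilogue are unchanged. | |||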
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| cutlass::int4b_t, | |||
| cutlass::layout::TensorCxRSKx<64>, | |||
| cutlass::uint4b_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::layout::TensorNCxHWx<64>, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassTensorOp, | |||
| cutlass::arch::Sm75, | |||
| cutlass::gemm::GemmShape<256, 128, 128>, | |||
| cutlass::gemm::GemmShape<64, 64, 128>, | |||
| cutlass::gemm::GemmShape<8, 8, 32>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
| cutlass::uint4b_t, | |||
| 16, | |||
| int32_t, | |||
| int32_t, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 32, | |||
| 32, | |||
| true, | |||
| cutlass::arch::OpMultiplyAddSaturate, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
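| // The instances above all implement fprop as an implicit GEMM: one GEMM row | |||
| // per output pixel, one column per output channel, and a reduction over | |||
| // c*r*s. A standalone sketch of that mapping (standard implicit-GEMM | |||
| // convention; this helper is illustrative and not part of the repository): | |||
| #include <cstdio> | |||
| struct ConvShape { int n, c, k, h_out, w_out, r, s; }; | |||
| // M, N, K of the equivalent GEMM for forward propagation. | |||
| static void implicit_gemm_extent(const ConvShape& cs, int& m, int& n, int& k) { | |||
|     m = cs.n * cs.h_out * cs.w_out;  // rows: output pixels | |||
|     n = cs.k;                        // columns: output channels | |||
|     k = cs.c * cs.r * cs.s;          // depth: input channels x filter taps | |||
| } | |||
| int main() { | |||
|     ConvShape cs{16, 64, 128, 56, 56, 3, 3}; | |||
|     int m, n, k; | |||
|     implicit_gemm_extent(cs, m, n, k); | |||
|     // A 256x128 threadblock tile (as in the instance above) would launch | |||
|     // ceil(m/256) * ceil(n/128) threadblocks for this problem. | |||
|     std::printf("M=%d N=%d K=%d\n", m, n, k); | |||
| } | |||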
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 128, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
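| // Annotation (added; argument roles inferred as in the tensorop instances | |||
| // above): int8 NC4HW4 source against a CxRSKx<4> filter, accumulated in | |||
| // int32 and written out as float NCHW. OpClassSimt with a <1,1,4> | |||
| // instruction shape on Sm61 corresponds to the dp4a SIMT path (an | |||
| // inference from the shape and arch tag), and need_load_from_const is false. | |||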
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 32, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 64, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<16, 128, 16>, | |||
| cutlass::gemm::GemmShape<16, 128, 16>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, | |||
| 4, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
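| // The 16x128x16 instance above uses a single pipeline stage and a warp tile | |||
| // equal to the threadblock tile; together with the 16x64x8 variant that | |||
| // follows, it extends the tile sweep down to small problems, where 128-wide | |||
| // row tiles would mostly compute padding (interpretation, not a note from | |||
| // the source). | |||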
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<16, 64, 8>, | |||
| cutlass::gemm::GemmShape<16, 64, 8>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 4, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<32, 128, 32>, | |||
| cutlass::gemm::GemmShape<32, 64, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<32, 32, 32>, | |||
| cutlass::gemm::GemmShape<32, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<32, 64, 32>, | |||
| cutlass::gemm::GemmShape<32, 64, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<64, 128, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<64, 64, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 128, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
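| // From here the NC4HW4 tile sweep is repeated with BiasAddLinearCombination, | |||
| // i.e. bias add plus scaling with no activation, where the earlier instances | |||
| // fused a ReLU clamp or HSwish; only the epilogue functor differs. | |||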
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 32, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<128, 64, 32>, | |||
| cutlass::gemm::GemmShape<64, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<16, 128, 16>, | |||
| cutlass::gemm::GemmShape<16, 128, 16>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 1, | |||
| 4, | |||
| 8, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<16, 64, 8>, | |||
| cutlass::gemm::GemmShape<16, 64, 8>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 4, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<32, 128, 32>, | |||
| cutlass::gemm::GemmShape<32, 64, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| @@ -1,59 +0,0 @@ | |||
| #if !MEGDNN_TEGRA_X1 | |||
| // suppress warnings from cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
| // kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator | |||
| using Convolution = | |||
| typename cutlass::conv::device::Convolution< | |||
| int8_t, | |||
| cutlass::layout::TensorNCxHWx<4>, | |||
| int8_t, | |||
| cutlass::layout::TensorCxRSKx<4>, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| float, | |||
| cutlass::layout::TensorNCHW, | |||
| int32_t, | |||
| cutlass::conv::ConvType::kConvolution, | |||
| cutlass::arch::OpClassSimt, | |||
| cutlass::arch::Sm61, | |||
| cutlass::gemm::GemmShape<32, 32, 32>, | |||
| cutlass::gemm::GemmShape<32, 32, 32>, | |||
| cutlass::gemm::GemmShape<1, 1, 4>, | |||
| cutlass::epilogue::thread::BiasAddLinearCombination< | |||
| float, | |||
| 1, | |||
| int32_t, | |||
| float, | |||
| float | |||
| >, | |||
| cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
| 2, | |||
| 4, | |||
| 16, | |||
| false, | |||
| cutlass::arch::OpMultiplyAdd, | |||
| cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
| template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
| const typename Convolution::ElementSrc* d_src, | |||
| const typename Convolution::ElementFilter* d_filter, | |||
| const typename Convolution::ElementBias* d_bias, | |||
| const typename Convolution::ElementDst* d_z, | |||
| typename Convolution::ElementDst* d_dst, | |||
| int* workspace, | |||
| typename Convolution::ConvolutionParameter const& conv_param, | |||
| typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
| cudaStream_t stream, | |||
| typename Convolution::ExtraParam extra_param); | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
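| // The breadth of tile shapes in this family exists so a runtime heuristic | |||
| // can pick an instance per problem size. A simplified, hypothetical picker | |||
| // over the threadblock tiles above (illustrative only; MegDNN's actual | |||
| // selection logic is not reproduced here): | |||
| #include <cstdio> | |||
| struct Tile { int m, n, k; }; | |||
| static Tile pick_tile(int gemm_m, int gemm_n) { | |||
|     // Largest-first: prefer big tiles when the problem can fill them. | |||
|     static const Tile tiles[] = {{128, 128, 32}, {128, 64, 32}, {64, 64, 32}, | |||
|                                  {32, 64, 32}, {32, 32, 32}, {16, 64, 8}}; | |||
|     for (const Tile& t : tiles) { | |||
|         if (gemm_m >= t.m && gemm_n >= t.n) return t; | |||
|     } | |||
|     return {16, 64, 8};  // smallest fallback for tiny problems | |||
| } | |||
| int main() { | |||
|     Tile t = pick_tile(512, 96); | |||
|     std::printf("chosen tile: %dx%dx%d\n", t.m, t.n, t.k);  // 128x64x32 | |||
| } | |||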