do not compile cutlass tensorop kernels when the CUDA version is less than 10.2
GitOrigin-RevId: d4c37d5f41
tags/v1.6.0-rc1
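
In effect, every generated CUTLASS kernel source and manifest is now wrapped in an nvcc version guard whose bounds come from the generator: the tensor-op passes request CUDA 10.2, while SIMT kernels keep the old 9.2 floor. Building with an older toolkit then preprocesses the tensor-op files to empty translation units instead of tripping over mma instructions that nvcc cannot compile. A sketch of the emitted shape of a tensor-op kernel file, assuming the wrapper templates shown in the diff below (the placeholder comment stands in for the emitted operation instance):

```cpp
// Sketch of a generated tensor-op kernel .cu under the new guard.
// Compiling it with nvcc < 10.2 yields an intentionally empty unit.
#if __CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"

// <operation instance and initialize_<operation_name>(Manifest&) emitted here>

#pragma GCC diagnostic pop
#endif
```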
@@ -19,7 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
                epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-               need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+               need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+               required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind
@@ -36,6 +37,9 @@ class Conv2dOperation:
     self.need_load_from_const = need_load_from_const
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
+    self.required_cuda_ver_major = required_cuda_ver_major
+    self.required_cuda_ver_minor = required_cuda_ver_minor
   #
   def accumulator_type(self):
     accum = self.tile_description.math_instruction.element_accumulator
@@ -320,7 +324,8 @@ using Deconvolution =
 #
 def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False):
+                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
+                   required_cuda_ver_minor = 2):
   operations = []
   element_epilogue = DataType.f32
@@ -407,10 +412,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
       bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
       dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load)
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
       operations.append(new_operation)
       if not skip_unity_kernel:
-        new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load)
+        new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
         operations.append(new_operation)
   return operations
@@ -545,7 +550,7 @@ class EmitConvSingleKernelWrapper():
       self.convolution_name = "Deconvolution"
     self.header_template = """
-#if !MEGDNN_TEGRA_X1
+#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
 // ignore warning of cutlass
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -589,14 +594,17 @@ void initialize_${operation_name}(Manifest &manifest) {
     else:
       self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
     self.kernel_file = open(self.kernel_path, "w")
-    self.kernel_file.write(self.header_template)
+    self.kernel_file.write(SubstituteTemplate(self.header_template, {
+        'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
+        'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
+        }))
     return self
   #
   def emit(self):
     self.kernel_file.write(SubstituteTemplate(self.instance_template, {
       'operation_instance': self.instance_emitter.emit(self.operation),
-      }))
+    }))
     # emit manifest helper
     manifest = SubstituteTemplate(self.manifest_template, {
@@ -23,7 +23,8 @@ from library import *
 class GemmOperation:
   #
   def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
-               epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
+               epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+               required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Gemm
     self.arch = arch
@@ -35,6 +36,9 @@ class GemmOperation:
     self.element_epilogue = element_epilogue
     self.epilogue_functor = epilogue_functor
     self.swizzling_functor = swizzling_functor
+    self.required_cuda_ver_major = required_cuda_ver_major
+    self.required_cuda_ver_minor = required_cuda_ver_minor
   #
   def is_complex(self):
@@ -161,7 +165,8 @@ class GemmOperation:
 #
 class GemvBatchedStridedOperation:
   #
-  def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C):
+  def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C, \
+               required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Gemm
     self.arch = arch
@@ -172,6 +177,8 @@ class GemvBatchedStridedOperation:
     self.A = A
     self.B = B
     self.C = C
+    self.required_cuda_ver_major = required_cuda_ver_major
+    self.required_cuda_ver_minor = required_cuda_ver_minor
   #
   def accumulator_type(self):
@@ -243,7 +250,7 @@ class GemvBatchedStridedOperation:
     return self.procedural_name()
 #
-def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32):
+def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
   operations = []
   swizzling_functor = SwizzlingFunctor.Identity1
@@ -261,20 +268,23 @@ def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a
   B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b]))
   C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c]))
   operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \
-                                  element_epilogue, epilogue, swizzling_functor))
+                                  element_epilogue, epilogue, swizzling_functor, \
+                                  required_cuda_ver_major, required_cuda_ver_minor))
   operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \
-                                  element_epilogue, epilogue, swizzling_functor))
+                                  element_epilogue, epilogue, swizzling_functor, \
+                                  required_cuda_ver_major, required_cuda_ver_minor))
   return operations
 def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \
-                  align_a = 32, align_b = 32, align_c = 32):
+                  align_a = 32, align_b = 32, align_c = 32, \
+                  required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
   element_a, element_b, element_c, element_epilogue = data_type
   A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a]))
   B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b]))
   C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c]))
   return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \
-                                     A, B, C)
+                                     A, B, C, required_cuda_ver_major, required_cuda_ver_minor)
 ###################################################################################################
 #
@@ -1025,7 +1035,7 @@ class EmitGemmSingleKernelWrapper:
     self.instance_emitter = instance_emitters[self.operation.gemm_kind]
     self.header_template = """
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
 // ignore warning of cutlass
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -1065,7 +1075,10 @@ void initialize_${operation_name}(Manifest &manifest) {
   def __enter__(self):
     self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
     self.kernel_file = open(self.kernel_path, "w")
-    self.kernel_file.write(self.header_template)
+    self.kernel_file.write(SubstituteTemplate(self.header_template, {
+        'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
+        'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
+        }))
     return self
   #
@@ -1109,7 +1122,7 @@ template void megdnn::cuda::cutlass_wrapper::
     self.instance_emitter = EmitGemvBatchedStridedInstance()
     self.header_template = """
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+#if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor})
 // ignore warning of cutlass
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
@@ -1136,7 +1149,9 @@ ${operation_instance}
     self.kernel_file = open(self.kernel_path, "w")
     self.kernel_file.write(SubstituteTemplate(self.header_template, {
       'wrapper_path': self.wrapper_path,
-      }))
+      'required_cuda_ver_major': str(self.operation.required_cuda_ver_major),
+      'required_cuda_ver_minor': str(self.operation.required_cuda_ver_minor),
+      }))
     return self
   #
@@ -217,6 +217,9 @@ def GenerateConv2d_TensorOp_8816(args):
   min_cc = 75
   max_cc = 1024
+  cuda_major = 10
+  cuda_minor = 2
   for math_inst in math_instructions:
     for layout in layouts:
       for dst_type, dst_layout in zip(dst_types, dst_layouts):
@@ -234,7 +237,7 @@ def GenerateConv2d_TensorOp_8816(args):
           ]
           operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                        dst_layout, dst_type, min_cc, 128, 128, 64,
-                                       False, ImplicitGemmMode.GemmTN, True)
+                                       False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
         else:
           assert dst_layout == LayoutType.TensorNC4HW4
           tile_descriptions = [
@@ -250,7 +253,7 @@ def GenerateConv2d_TensorOp_8816(args):
           ]
           operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                        dst_layout, dst_type, min_cc, 128, 128, 64,
-                                       False)
+                                       False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
   return operations
@@ -281,6 +284,9 @@ def GenerateConv2d_TensorOp_8832(args):
   min_cc = 75
   max_cc = 1024
+  cuda_major = 10
+  cuda_minor = 2
   for math_inst in math_instructions:
     for layout in layouts:
       for dst_layout in dst_layouts:
@@ -293,7 +299,7 @@ def GenerateConv2d_TensorOp_8832(args):
         ]
         operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                      dst_layout, dst_type, min_cc, 128, 128, 64,
-                                     False, ImplicitGemmMode.GemmTN, True)
+                                     False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   layouts_nhwc = [
     (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
@@ -316,12 +322,12 @@ def GenerateConv2d_TensorOp_8832(args):
       for tile in tile_descriptions:
         operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
                                      dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
-                                     False, ImplicitGemmMode.GemmTN, False)
+                                     False, ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
         if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
           dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
           operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
                                        dst_layout, dst_type, min_cc, layout[2], layout[2], dst_align,
-                                       False, ImplicitGemmMode.GemmTN, True)
+                                       False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
   return operations
@@ -624,6 +630,8 @@ def GeneratesGemm_TensorOp_1688(args):
   alignment_constraints = [8, 4, 2,
                            #1
                           ]
+  cuda_major = 10
+  cuda_minor = 2
   operations = []
   for math_inst in math_instructions:
@@ -655,7 +663,9 @@ def GeneratesGemm_TensorOp_1688(args):
                                       min_cc, \
                                       align * 16, \
                                       align * 16, \
-                                      align * 16)
+                                      align * 16, \
+                                      cuda_major, \
+                                      cuda_minor)
   return operations
 #
@@ -686,6 +696,8 @@ def GeneratesGemm_TensorOp_884(args):
   alignment_constraints = [8, 4, 2,
                            # 1
                           ]
+  cuda_major = 10
+  cuda_minor = 2
   operations = []
   for math_inst in math_instructions:
@@ -717,7 +729,9 @@ def GeneratesGemm_TensorOp_884(args):
                                       min_cc, \
                                       align * 16, \
                                       align * 16, \
-                                      align * 16)
+                                      align * 16, \
+                                      cuda_major, \
+                                      cuda_minor)
   return operations
@@ -351,6 +351,13 @@ void initialize_all(Manifest &manifest) {
 ###################################################################################################
 def GenerateManifest(args, operations, output_dir):
+  assert isinstance(operations, list)
+  if len(operations) == 0:
+    return
+  op = operations[0]
+  required_cuda_ver_major = op.required_cuda_ver_major
+  required_cuda_ver_minor = op.required_cuda_ver_minor
   manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type))
   f = open(manifest_path, "w")
   f.write("""
@@ -358,7 +365,7 @@ def GenerateManifest(args, operations, output_dir):
    Generated by generator.py - Do not edit.
 */
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
+#if __CUDACC_VER_MAJOR__ > %s || (__CUDACC_VER_MAJOR__ == %s && __CUDACC_VER_MINOR__ >= %s)
 #include "cutlass/cutlass.h"
 #include "src/cuda/cutlass/library.h"
@@ -367,7 +374,8 @@ def GenerateManifest(args, operations, output_dir):
 namespace cutlass {
 namespace library {
-""")
+""" % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor)))
   for op in operations:
     f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())
@@ -44,26 +44,34 @@ namespace cutlass {
 namespace library {
 /////////////////////////////////////////////////////////////////////////////////////////////////
+#if ((__CUDACC_VER_MAJOR__ > 10) || \
+     (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
+#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
+#endif
 #if __CUDACC_VER_MAJOR__ > 9 || \
         (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
 void initialize_all_gemm_simt_operations(Manifest& manifest);
+void initialize_all_conv2d_simt_operations(Manifest& manifest);
+void initialize_all_deconv_simt_operations(Manifest& manifest);
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
 void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
 void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
-void initialize_all_conv2d_simt_operations(Manifest& manifest);
 void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
 void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
-void initialize_all_deconv_simt_operations(Manifest& manifest);
+#endif
 void initialize_all(Manifest& manifest) {
     initialize_all_gemm_simt_operations(manifest);
+    initialize_all_conv2d_simt_operations(manifest);
+    initialize_all_deconv_simt_operations(manifest);
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
     initialize_all_gemm_tensorop884_operations(manifest);
     initialize_all_gemm_tensorop1688_operations(manifest);
-    initialize_all_conv2d_simt_operations(manifest);
     initialize_all_conv2d_tensorop8816_operations(manifest);
     initialize_all_conv2d_tensorop8832_operations(manifest);
-    initialize_all_deconv_simt_operations(manifest);
+#endif
 }
 #else
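
Registration now mirrors the same cutoff: the SIMT initializers run unconditionally (any CUDA >= 9.2), and the tensor-op ones disappear below 10.2 together with the kernels they would reference. The predicate restated as a standalone, testable helper (an illustration, not code from the patch):

```cpp
// Restatement of the CUTLASS_ARCH_MMA_SM75_SUPPORTED condition. nvcc
// predefines __CUDACC_VER_MAJOR__ / __CUDACC_VER_MINOR__, e.g. (10, 1)
// for CUDA 10.1.
constexpr bool sm75_mma_supported(int major, int minor) {
    return major > 10 || (major == 10 && minor >= 2);
}
static_assert(sm75_mma_supported(10, 2), "CUDA 10.2 builds tensor-op kernels");
static_assert(!sm75_mma_supported(10, 1), "CUDA 10.1 must compile them out");
static_assert(sm75_mma_supported(11, 0), "any later major passes");
```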
@@ -43,12 +43,14 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
     for (auto&& algo : simt_float32_gemv_batched_strided) {
         all_algos.push_back(&algo);
     }
+#if CUDA_VERSION >= 10020
     for (auto&& algo : tensorop_float16) {
         all_algos.push_back(&algo);
     }
     for (auto&& algo : tensorop_float16_split_k) {
         all_algos.push_back(&algo);
     }
+#endif
 #endif
     all_algos.push_back(&naive);
@@ -107,7 +109,9 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
 #define cb(...)                                            \
     tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \
     tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__});
+#if CUDA_VERSION >= 10020
     FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)
+#endif
 #undef cb
 #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES
 }
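
Because `cb` appends each shape to both vectors, guarding the single `FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)` invocation is enough to leave both f16 algo lists empty on pre-10.2 toolkits. For a hypothetical entry, the preprocessor expansion is:

```cpp
// Expansion sketch for a hypothetical shape entry cb(256, 128, 32, 64, 64, 32);
// the real entries live in FOREACH_CUTLASS_MATMUL_F16_SHAPES.
tensorop_float16.emplace_back(AlgoParam{256, 128, 32, 64, 64, 32});
tensorop_float16_split_k.emplace_back(AlgoParam{256, 128, 32, 64, 64, 32});
```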
@@ -241,6 +241,20 @@ public:
         return AlgoAttribute::REPRODUCIBLE;
     }
     MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT)
+    std::string param() const override {
+        std::string ret;
+        // FIXME: algo param compatible with old version, to avoid fastrun cache error
+        struct AlgoParam_ {
+            int threadblock_m, threadblock_n, threadblock_k;
+            int warp_m, warp_n, warp_k;
+        };
+        AlgoParam_ algo_param{
+                m_algo_param.threadblock_m, m_algo_param.threadblock_n,
+                m_algo_param.threadblock_k, m_algo_param.warp_m,
+                m_algo_param.warp_n, m_algo_param.warp_k};
+        serialize_write_pod(algo_param, ret);
+        return ret;
+    }
 private:
     void do_exec(const ExecArgs& args) const override;
@@ -263,6 +277,21 @@ public:
                AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
     }
     MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K)
+    std::string param() const override {
+        std::string ret;
+        // FIXME: algo param compatible with old version, to avoid fastrun cache
+        // error
+        struct AlgoParam_ {
+            int threadblock_m, threadblock_n, threadblock_k;
+            int warp_m, warp_n, warp_k;
+        };
+        AlgoParam_ algo_param{
+                m_algo_param.threadblock_m, m_algo_param.threadblock_n,
+                m_algo_param.threadblock_k, m_algo_param.warp_m,
+                m_algo_param.warp_n, m_algo_param.warp_k};
+        serialize_write_pod(algo_param, ret);
+        return ret;
+    }
 private:
     void do_exec(const ExecArgs& args) const override;
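
Both `param()` overrides serialize a local six-int POD instead of `m_algo_param` itself; per the FIXME, the point is to keep the serialized key byte-identical to what earlier releases wrote, so existing fastrun cache entries still match (plausibly because `AlgoParam` has since grown extra fields such as the instruction shape — an inference, not stated in the patch). A sketch of what `serialize_write_pod` presumably does, deduced from its call site rather than copied from MegEngine:

```cpp
#include <string>
#include <type_traits>

// Presumed behavior: append the raw bytes of a trivially copyable value
// to the output string (assumption based on usage, not the verbatim helper).
template <typename T>
void serialize_write_pod(const T& val, std::string& result) {
    static_assert(std::is_trivially_copyable<T>::value,
                  "only trivially copyable types can be byte-serialized");
    result.append(reinterpret_cast<const char*>(&val), sizeof(T));
}
```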
@@ -297,6 +326,7 @@ private:
     std::string m_name;
 };
+#if CUDA_VERSION >= 10020
 class MatrixMulForwardImpl::AlgoFloat16TensorOp final
         : public AlgoCutlassMatrixMulBase {
 public:
@@ -345,7 +375,7 @@ private:
     int min_alignment_requirement() const override { return 2; }
     std::string m_name;
 };
-
+#endif
 #endif
 class MatrixMulForwardImpl::AlgoPack : NonCopyableObj {
@@ -370,8 +400,10 @@ public:
     std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
     std::vector<AlgoFloat32SIMTGemvBatchedStrided>
             simt_float32_gemv_batched_strided;
+#if CUDA_VERSION >= 10020
     std::vector<AlgoFloat16TensorOp> tensorop_float16;
     std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k;
+#endif
 #endif
     std::vector<AlgoBase*> all_algos;
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -15,7 +15,7 @@
 #include "src/cuda/matrix_mul/algos.h"
 #include "src/cuda/utils.h"
-#if CUDA_VERSION >= 9020
+#if CUDA_VERSION >= 10020
 using namespace megdnn;
 using namespace cuda;
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -15,14 +15,14 @@
 #include "src/cuda/matrix_mul/algos.h"
 #include "src/cuda/utils.h"
-#if CUDA_VERSION >= 9020
+#if CUDA_VERSION >= 10020
 using namespace megdnn;
 using namespace cuda;
 bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available(
         const SizeArgs& args) const {
     auto&& param = args.opr->param();
-    int n = args.layout_c.shape[1],
+    int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
         k = args.layout_a.shape[param.transposeA ? 0 : 1];
     bool available =
             args.opr->param().format == param::MatrixMul::Format::DEFAULT &&
@@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat16TensorOpSplitK::is_available(
     auto&& device_prop = cuda::current_device_prop();
     int y_grid_limit = device_prop.maxGridSize[1];
     // limit y grid
-    available &= ((n + m_algo_param.threadblock_n - 1) /
-                          m_algo_param.threadblock_n <=
+    available &= ((m + m_algo_param.threadblock_m - 1) /
+                          m_algo_param.threadblock_m <=
                   y_grid_limit);
     if (m_algo_param.instruction_m == 8 && m_algo_param.instruction_n == 8 &&
         m_algo_param.instruction_k == 4) {
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_gemv_batched_strided.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -22,7 +22,7 @@ using namespace cuda;
 bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
         const SizeArgs& args) const {
     auto&& param = args.opr->param();
-    int n = args.layout_c.shape[1],
+    int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
         k = args.layout_a.shape[param.transposeA ? 0 : 1];
     bool available =
             args.opr->param().format == param::MatrixMul::Format::DEFAULT &&
@@ -32,8 +32,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
     auto&& device_prop = cuda::current_device_prop();
     int y_grid_limit = device_prop.maxGridSize[1];
     // limit y grid
-    available &= ((n + m_algo_param.threadblock_n - 1) /
-                          m_algo_param.threadblock_n <=
+    available &= ((m + m_algo_param.threadblock_m - 1) /
+                          m_algo_param.threadblock_m <=
                   y_grid_limit);
     return available;
 }
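
The two split-k fixes above correct the launch-bound check: it previously bounded ceil(n / threadblock_n) by the device's y-grid limit, but the quantity actually mapped to gridDim.y is the m-dimension tile count (an inference from the fix; the patch does not spell out the grid mapping). The corrected bound as a standalone helper:

```cpp
// Minimal sketch of the corrected availability bound. y_grid_limit comes
// from cudaDeviceProp::maxGridSize[1], which is 65535 on current GPUs.
inline bool y_grid_fits(int m, int threadblock_m, int y_grid_limit) {
    // output-row tiles launched along gridDim.y
    int tiles_m = (m + threadblock_m - 1) / threadblock_m;
    return tiles_m <= y_grid_limit;
}
```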
@@ -2,7 +2,7 @@
  * \file dnn/src/cuda/matrix_mul/cutlass_matrix_mul_base.cpp
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
@@ -21,7 +21,6 @@
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"
-#define MEGDNN_WITH_BENCHMARK 1
 #if CUDA_VERSION >= 9020
 namespace megdnn {
 namespace test {
@@ -373,6 +372,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
 #undef cb
 #undef MEGDNN_FOREACH_CUTLASS_KERNEL
+#if CUDA_VERSION >= 10020
 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \
     cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \
     cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \
@@ -448,6 +448,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
 #undef cb
 #undef MEGDNN_FOREACH_CUTLASS_KERNEL
+#endif
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL) {
@@ -462,12 +463,14 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
                          "CUTLASS_FLOAT32_SIMT");
 }
+#if CUDA_VERSION >= 10020
 TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) {
     benchmark_matrix_mul(handle_cuda(), get_f16_feat_model_args(),
                          dtype::Float16(), dtype::Float16(), dtype::Float16(),
                          "CUTLASS_FLOAT16_TENSOR_OP");
 }
 #endif
+#endif
 } // namespace test
 } // namespace megdnn
 #endif