GitOrigin-RevId: feb09ebb58
tags/v1.9.0
| @@ -10,24 +10,25 @@ import shutil | |||||
| from library import * | from library import * | ||||
| from gemm_operation import * | from gemm_operation import * | ||||
| from conv2d_operation import * | |||||
| from conv2d_operation import * | |||||
| ################################################################################################### | ################################################################################################### | ||||
| class EmitOperationKindLibrary: | class EmitOperationKindLibrary: | ||||
| def __init__(self, generated_path, kind, args): | |||||
| self.generated_path = generated_path | |||||
| self.kind = kind | |||||
| self.args = args | |||||
| def __init__(self, generated_path, kind, args): | |||||
| self.generated_path = generated_path | |||||
| self.kind = kind | |||||
| self.args = args | |||||
| self.emitters = { | |||||
| OperationKind.Gemm: EmitGemmConfigurationLibrary | |||||
| , OperationKind.Conv2d: EmitConv2dConfigurationLibrary | |||||
| } | |||||
| self.emitters = { | |||||
| OperationKind.Gemm: EmitGemmConfigurationLibrary, | |||||
| OperationKind.Conv2d: EmitConv2dConfigurationLibrary, | |||||
| } | |||||
| self.configurations = []; | |||||
| self.configurations = [] | |||||
| self.header_template =""" | |||||
| self.header_template = """ | |||||
| /* | /* | ||||
| Generated by manifest.py - Do not edit. | Generated by manifest.py - Do not edit. | ||||
| */ | */ | ||||
| @@ -42,17 +43,19 @@ namespace library { | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| """ | """ | ||||
| self.entry_template = """ | |||||
| self.entry_template = """ | |||||
| // | // | ||||
| // Entry point to construct operations | // Entry point to construct operations | ||||
| // | // | ||||
| void initialize_all_${operation_name}_operations(Manifest &manifest) { | void initialize_all_${operation_name}_operations(Manifest &manifest) { | ||||
| """ | """ | ||||
| self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" | |||||
| self.configuration_template =" initialize_${configuration_name}(manifest);\n" | |||||
| self.configuration_prototype_template = ( | |||||
| "void initialize_${configuration_name}(Manifest &manifest);\n" | |||||
| ) | |||||
| self.configuration_template = " initialize_${configuration_name}(manifest);\n" | |||||
| self.epilogue_template =""" | |||||
| self.epilogue_template = """ | |||||
| } | } | ||||
| @@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) { | |||||
| """ | """ | ||||
| # | |||||
| def __enter__(self): | |||||
| self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind]) | |||||
| os.mkdir(self.operation_path) | |||||
| # | |||||
| def __enter__(self): | |||||
| self.operation_path = os.path.join( | |||||
| self.generated_path, OperationKindNames[self.kind] | |||||
| ) | |||||
| os.mkdir(self.operation_path) | |||||
| self.top_level_path = os.path.join( | |||||
| self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind] | |||||
| ) | |||||
| self.top_level_file = open(self.top_level_path, "w") | |||||
| self.top_level_file.write(self.header_template) | |||||
| self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]) | |||||
| self.source_files = [self.top_level_path] | |||||
| self.top_level_file = open(self.top_level_path, "w") | |||||
| self.top_level_file.write(self.header_template) | |||||
| return self | |||||
| self.source_files = [self.top_level_path,] | |||||
| # | |||||
| def emit(self, configuration_name, operations): | |||||
| return self | |||||
| with self.emitters[self.kind]( | |||||
| self.operation_path, configuration_name | |||||
| ) as configuration_emitter: | |||||
| for operation in operations: | |||||
| configuration_emitter.emit(operation) | |||||
| # | |||||
| def emit(self, configuration_name, operations): | |||||
| self.source_files.append(configuration_emitter.configuration_path) | |||||
| with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter: | |||||
| for operation in operations: | |||||
| configuration_emitter.emit(operation) | |||||
| self.source_files.append(configuration_emitter.configuration_path) | |||||
| self.configurations.append(configuration_name) | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.configuration_prototype_template, | |||||
| {"configuration_name": configuration_name}, | |||||
| ) | |||||
| ) | |||||
| self.configurations.append(configuration_name) | |||||
| self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} )) | |||||
| # | |||||
| def __exit__(self, exception_type, exception_value, traceback): | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.entry_template, {"operation_name": OperationKindNames[self.kind]} | |||||
| ) | |||||
| ) | |||||
| # | |||||
| def __exit__(self, exception_type, exception_value, traceback): | |||||
| self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]})) | |||||
| for configuration_name in self.configurations: | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.configuration_template, | |||||
| {"configuration_name": configuration_name}, | |||||
| ) | |||||
| ) | |||||
| for configuration_name in self.configurations: | |||||
| self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name})) | |||||
| self.top_level_file.write(self.epilogue_template) | |||||
| self.top_level_file.close() | |||||
| self.top_level_file.write(self.epilogue_template) | |||||
| self.top_level_file.close() | |||||
| ################################################################################################### | ################################################################################################### | ||||
| ################################################################################################### | ################################################################################################### | ||||
| class Options: | class Options: | ||||
| def __init__(self): | |||||
| pass | |||||
| def __init__(self): | |||||
| pass | |||||
| ################################################################################################### | ################################################################################################### | ||||
| # | # | ||||
| class Manifest: | class Manifest: | ||||
| # | |||||
| def __init__(self, args): | |||||
| self.operations = {} | |||||
| self.args = args | |||||
| # | |||||
| def __init__(self, args): | |||||
| self.operations = {} | |||||
| self.args = args | |||||
| architectures = ( | |||||
| args.architectures.split(";") if len(args.architectures) else ["50"] | |||||
| ) | |||||
| self.compute_capabilities = [int(x) for x in architectures] | |||||
| architectures = args.architectures.split(';') if len(args.architectures) else ['50',] | |||||
| self.compute_capabilities = [int(x) for x in architectures] | |||||
| self.selected_kernels = [] | |||||
| if args.operations == 'all': | |||||
| self.operations_enabled = [] | |||||
| else: | |||||
| self.selected_kernels = [] | |||||
| operations_list = [ | |||||
| OperationKind.Gemm | |||||
| , OperationKind.Conv2d | |||||
| ] | |||||
| if args.operations == "all": | |||||
| self.operations_enabled = [] | |||||
| else: | |||||
| self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] | |||||
| operations_list = [OperationKind.Gemm, OperationKind.Conv2d] | |||||
| if args.kernels == 'all': | |||||
| self.kernel_names = [] | |||||
| else: | |||||
| self.kernel_names = [x for x in args.kernels.split(',') if x != ''] | |||||
| self.operations_enabled = [ | |||||
| x | |||||
| for x in operations_list | |||||
| if OperationKindNames[x] in args.operations.split(",") | |||||
| ] | |||||
| self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != ''] | |||||
| if args.kernels == "all": | |||||
| self.kernel_names = [] | |||||
| else: | |||||
| self.kernel_names = [x for x in args.kernels.split(",") if x != ""] | |||||
| if args.kernel_filter_file is None: | |||||
| self.kernel_filter_list = [] | |||||
| else: | |||||
| self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||||
| self.ignore_kernel_names = [ | |||||
| x for x in args.ignore_kernels.split(",") if x != "" | |||||
| ] | |||||
| if args.kernel_filter_file is None: | |||||
| self.kernel_filter_list = [] | |||||
| else: | |||||
| self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||||
| self.operation_count = 0 | |||||
| self.operations_by_name = {} | |||||
| self.top_level_prologue = ''' | |||||
| self.operation_count = 0 | |||||
| self.operations_by_name = {} | |||||
| self.top_level_prologue = """ | |||||
| #include "cutlass/library/library.h" | #include "cutlass/library/library.h" | ||||
| #include "cutlass/library/manifest.h" | #include "cutlass/library/manifest.h" | ||||
| @@ -159,208 +189,241 @@ ${prototypes} | |||||
| void initialize_all(Manifest &manifest) { | void initialize_all(Manifest &manifest) { | ||||
| ''' | |||||
| self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n' | |||||
| self.top_level_epilogue = ''' | |||||
| """ | |||||
| self.top_level_reserve = " manifest.reserve(${operation_count});\n\n" | |||||
| self.top_level_epilogue = """ | |||||
| } | } | ||||
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| ''' | |||||
| def get_kernel_filters (self, kernelListFile): | |||||
| if os.path.isfile(kernelListFile): | |||||
| with open(kernelListFile, 'r') as fileReader: | |||||
| lines = [line.rstrip() for line in fileReader if not line.startswith("#")] | |||||
| lines = [re.compile(line) for line in lines if line] | |||||
| return lines | |||||
| else: | |||||
| return [] | |||||
| """ | |||||
| def get_kernel_filters(self, kernelListFile): | |||||
| if os.path.isfile(kernelListFile): | |||||
| with open(kernelListFile, "r") as fileReader: | |||||
| lines = [ | |||||
| line.rstrip() for line in fileReader if not line.startswith("#") | |||||
| ] | |||||
| lines = [re.compile(line) for line in lines if line] | |||||
| return lines | |||||
| else: | |||||
| return [] | |||||
| def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||||
| def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||||
| for kernel_filter_re in kernel_filter_list: | |||||
| if kernel_filter_re.search(kernel_name) is not None: | |||||
| return True | |||||
| return False | |||||
| for kernel_filter_re in kernel_filter_list: | |||||
| if kernel_filter_re.search(kernel_name) is not None: | |||||
| return True | |||||
| # | |||||
| def _filter_string_matches(self, filter_string, haystack): | |||||
| ''' Returns true if all substrings appear in the haystack in order''' | |||||
| substrings = filter_string.split('*') | |||||
| for sub in substrings: | |||||
| idx = haystack.find(sub) | |||||
| if idx < 0: | |||||
| return False | return False | ||||
| haystack = haystack[idx + len(sub):] | |||||
| return True | |||||
| # | |||||
| def filter(self, operation): | |||||
| ''' Filtering operations based on various criteria''' | |||||
| # filter based on compute capability | |||||
| enabled = False | |||||
| for cc in self.compute_capabilities: | |||||
| if cc >= operation.tile_description.minimum_compute_capability and \ | |||||
| cc <= operation.tile_description.maximum_compute_capability: | |||||
| enabled = True | |||||
| break | |||||
| if not enabled: | |||||
| return False | |||||
| if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled: | |||||
| return False | |||||
| # eliminate duplicates | |||||
| if operation.procedural_name() in self.operations_by_name.keys(): | |||||
| return False | |||||
| # Filter based on list of valid substrings | |||||
| if len(self.kernel_names): | |||||
| name = operation.procedural_name() | |||||
| enabled = False | |||||
| # compare against the include list | |||||
| for name_substr in self.kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = True | |||||
| break | |||||
| # compare against the exclude list | |||||
| for name_substr in self.ignore_kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = False | |||||
| break | |||||
| if len(self.kernel_filter_list) > 0: | |||||
| # | |||||
| def _filter_string_matches(self, filter_string, haystack): | |||||
| """ Returns true if all substrings appear in the haystack in order""" | |||||
| substrings = filter_string.split("*") | |||||
| for sub in substrings: | |||||
| idx = haystack.find(sub) | |||||
| if idx < 0: | |||||
| return False | |||||
| haystack = haystack[idx + len(sub) :] | |||||
| return True | |||||
| # | |||||
| def filter(self, operation): | |||||
| """ Filtering operations based on various criteria""" | |||||
| # filter based on compute capability | |||||
| enabled = False | enabled = False | ||||
| if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): | |||||
| enabled = True | |||||
| for cc in self.compute_capabilities: | |||||
| if ( | |||||
| cc >= operation.tile_description.minimum_compute_capability | |||||
| and cc <= operation.tile_description.maximum_compute_capability | |||||
| ): | |||||
| enabled = True | |||||
| break | |||||
| if not enabled: | |||||
| return False | |||||
| if ( | |||||
| len(self.operations_enabled) | |||||
| and not operation.operation_kind in self.operations_enabled | |||||
| ): | |||||
| return False | |||||
| # eliminate duplicates | |||||
| if operation.procedural_name() in self.operations_by_name.keys(): | |||||
| return False | |||||
| # Filter based on list of valid substrings | |||||
| if len(self.kernel_names): | |||||
| name = operation.procedural_name() | |||||
| enabled = False | |||||
| # compare against the include list | |||||
| for name_substr in self.kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = True | |||||
| break | |||||
| # compare against the exclude list | |||||
| for name_substr in self.ignore_kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = False | |||||
| break | |||||
| if len(self.kernel_filter_list) > 0: | |||||
| enabled = False | |||||
| if self.filter_out_kernels( | |||||
| operation.procedural_name(), self.kernel_filter_list | |||||
| ): | |||||
| enabled = True | |||||
| # todo: filter based on compute data type | |||||
| return enabled | |||||
| # | |||||
| # | |||||
| def append(self, operation): | |||||
| """ | |||||
| Inserts the operation. | |||||
| operation_kind -> configuration_name -> [] | |||||
| """ | |||||
| # todo: filter based on compute data type | |||||
| return enabled | |||||
| # | |||||
| if self.filter(operation): | |||||
| # | |||||
| def append(self, operation): | |||||
| ''' | |||||
| Inserts the operation. | |||||
| self.selected_kernels.append(operation.procedural_name()) | |||||
| operation_kind -> configuration_name -> [] | |||||
| ''' | |||||
| self.operations_by_name[operation.procedural_name()] = operation | |||||
| if self.filter(operation): | |||||
| self.selected_kernels.append(operation.procedural_name()) | |||||
| # add the configuration | |||||
| configuration_name = operation.configuration_name() | |||||
| self.operations_by_name[operation.procedural_name()] = operation | |||||
| if operation.operation_kind not in self.operations.keys(): | |||||
| self.operations[operation.operation_kind] = {} | |||||
| # add the configuration | |||||
| configuration_name = operation.configuration_name() | |||||
| if ( | |||||
| configuration_name | |||||
| not in self.operations[operation.operation_kind].keys() | |||||
| ): | |||||
| self.operations[operation.operation_kind][configuration_name] = [] | |||||
| if operation.operation_kind not in self.operations.keys(): | |||||
| self.operations[operation.operation_kind] = {} | |||||
| self.operations[operation.operation_kind][configuration_name].append( | |||||
| operation | |||||
| ) | |||||
| self.operation_count += 1 | |||||
| if configuration_name not in self.operations[operation.operation_kind].keys(): | |||||
| self.operations[operation.operation_kind][configuration_name] = [] | |||||
| # | |||||
| self.operations[operation.operation_kind][configuration_name].append(operation) | |||||
| self.operation_count += 1 | |||||
| # | |||||
| # | |||||
| def emit(self, target=GeneratorTarget.Library): | |||||
| # | |||||
| def emit(self, target = GeneratorTarget.Library): | |||||
| operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary} | |||||
| operation_emitters = { | |||||
| GeneratorTarget.Library: EmitOperationKindLibrary | |||||
| } | |||||
| generated_path = os.path.join(self.args.curr_build_dir, "generated") | |||||
| generated_path = os.path.join(self.args.curr_build_dir, 'generated') | |||||
| # create generated/ | |||||
| if os.path.exists(generated_path): | |||||
| shutil.rmtree(generated_path) | |||||
| # create generated/ | |||||
| if os.path.exists(generated_path): | |||||
| shutil.rmtree(generated_path) | |||||
| os.mkdir(generated_path) | |||||
| os.mkdir(generated_path) | |||||
| source_files = [] | |||||
| source_files = [] | |||||
| top_level_path = os.path.join(generated_path, "initialize_all.cpp") | |||||
| with open(top_level_path, "w") as top_level_file: | |||||
| top_level_path = os.path.join(generated_path, 'initialize_all.cpp') | |||||
| with open(top_level_path, 'w') as top_level_file: | |||||
| if target == GeneratorTarget.Library: | |||||
| source_files.append(top_level_path) | |||||
| if target == GeneratorTarget.Library: | |||||
| source_files.append(top_level_path) | |||||
| prototypes = [] | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| prototypes.append( | |||||
| SubstituteTemplate( | |||||
| "void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||||
| {"operation_kind": OperationKindNames[operation_kind]}, | |||||
| ) | |||||
| ) | |||||
| prototypes = [] | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| prototypes.append(SubstituteTemplate( | |||||
| "void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.top_level_prologue, {"prototypes": "\n".join(prototypes)} | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(SubstituteTemplate(self.top_level_prologue, | |||||
| {'prototypes': "\n".join(prototypes)})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.top_level_reserve, | |||||
| {"operation_count": str(self.operation_count)}, | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(SubstituteTemplate( | |||||
| self.top_level_reserve, {'operation_count': str(self.operation_count)})) | |||||
| # for each operation kind, emit initializer for all configurations | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| # for each operation kind, emit initializer for all configurations | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: | |||||
| for configuration_name, operations in configurations.items(): | |||||
| operation_kind_emitter.emit(configuration_name, operations) | |||||
| with operation_emitters[target]( | |||||
| generated_path, operation_kind, self.args | |||||
| ) as operation_kind_emitter: | |||||
| for configuration_name, operations in configurations.items(): | |||||
| operation_kind_emitter.emit(configuration_name, operations) | |||||
| source_files += operation_kind_emitter.source_files | |||||
| source_files += operation_kind_emitter.source_files | |||||
| top_level_file.write(SubstituteTemplate( | |||||
| " initialize_all_${operation_kind}_operations(manifest);\n", | |||||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| " initialize_all_${operation_kind}_operations(manifest);\n", | |||||
| {"operation_kind": OperationKindNames[operation_kind]}, | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(self.top_level_epilogue) | |||||
| top_level_file.write(self.top_level_epilogue) | |||||
| # write the manifest.cmake file containing paths from all targets | |||||
| manifest_path = os.path.join(generated_path, "manifest.cmake") | |||||
| with open(manifest_path, "w") as manifest_file: | |||||
| # write the manifest.cmake file containing paths from all targets | |||||
| manifest_path = os.path.join(generated_path, "manifest.cmake") | |||||
| with open(manifest_path, "w") as manifest_file: | |||||
| target_name = 'cutlass_library_objs' | |||||
| target_name = "cutlass_library_objs" | |||||
| target_text = SubstituteTemplate("""cutlass_target_sources( | |||||
| target_text = SubstituteTemplate( | |||||
| """cutlass_target_sources( | |||||
| ${target_name} | ${target_name} | ||||
| BATCH_SOURCES ON | BATCH_SOURCES ON | ||||
| PRIVATE | PRIVATE | ||||
| """, { 'target_name': target_name}) | |||||
| """, | |||||
| {"target_name": target_name}, | |||||
| ) | |||||
| manifest_file.write(target_text) | |||||
| manifest_file.write(target_text) | |||||
| for source_file in source_files: | |||||
| manifest_file.write(" %s\n" % str(source_file.replace("\\", "/"))) | |||||
| manifest_file.write(")") | |||||
| # | |||||
| for source_file in source_files: | |||||
| manifest_file.write(" %s\n" % str(source_file.replace('\\', '/'))) | |||||
| manifest_file.write(")") | |||||
| # | |||||
| ################################################################################################### | ################################################################################################### | ||||
| def GenerateManifest(args, operations, output_dir): | def GenerateManifest(args, operations, output_dir): | ||||
| assert isinstance(operations, list) | |||||
| if len(operations) == 0: | |||||
| return | |||||
| op = operations[0] | |||||
| required_cuda_ver_major = op.required_cuda_ver_major | |||||
| required_cuda_ver_minor = op.required_cuda_ver_minor | |||||
| manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)) | |||||
| f = open(manifest_path, "w") | |||||
| f.write(""" | |||||
| assert isinstance(operations, list) | |||||
| if len(operations) == 0: | |||||
| return | |||||
| op = operations[0] | |||||
| required_cuda_ver_major = op.required_cuda_ver_major | |||||
| required_cuda_ver_minor = op.required_cuda_ver_minor | |||||
| manifest_path = os.path.join( | |||||
| output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type) | |||||
| ) | |||||
| f = open(manifest_path, "w") | |||||
| f.write( | |||||
| """ | |||||
| /* | /* | ||||
| Generated by generator.py - Do not edit. | Generated by generator.py - Do not edit. | ||||
| */ | */ | ||||
| @@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir): | |||||
| namespace cutlass { | namespace cutlass { | ||||
| namespace library { | namespace library { | ||||
| """ % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor))) | |||||
| for op in operations: | |||||
| f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) | |||||
| f.write(""" | |||||
| """ | |||||
| % ( | |||||
| str(required_cuda_ver_major), | |||||
| str(required_cuda_ver_major), | |||||
| str(required_cuda_ver_minor), | |||||
| ) | |||||
| ) | |||||
| for op in operations: | |||||
| f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) | |||||
| f.write( | |||||
| """ | |||||
| void initialize_all_%s_%s_operations(Manifest &manifest) { | void initialize_all_%s_%s_operations(Manifest &manifest) { | ||||
| """ % (args.operations, args.type)) | |||||
| """ | |||||
| % (args.operations, args.type) | |||||
| ) | |||||
| for op in operations: | |||||
| f.write(" initialize_%s(manifest);\n" % op.procedural_name()) | |||||
| for op in operations: | |||||
| f.write(" initialize_%s(manifest);\n" % op.procedural_name()) | |||||
| f.write(""" | |||||
| f.write( | |||||
| """ | |||||
| } | } | ||||
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| #endif | #endif | ||||
| """) | |||||
| f.close() | |||||
| """ | |||||
| ) | |||||
| f.close() | |||||
| @@ -181,6 +181,8 @@ if(MGE_WITH_CUDA) | |||||
| gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES) | |||||
| list(APPEND SOURCES ${CUTLASS_SOURCES}) | list(APPEND SOURCES ${CUTLASS_SOURCES}) | ||||
| list(APPEND SOURCES ${CUSOURCES}) | list(APPEND SOURCES ${CUSOURCES}) | ||||
| endif() | endif() | ||||
| @@ -92,6 +92,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& algo : int8_nchw4_dotprod) { | for (auto&& algo : int8_nchw4_dotprod) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| fill_dwconv_algos(); | |||||
| all_algos.push_back(&int8_chwn4_dotprod); | all_algos.push_back(&int8_chwn4_dotprod); | ||||
| all_algos.push_back(&fallback_nchw_qs8); | all_algos.push_back(&fallback_nchw_qs8); | ||||
| for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | ||||
| @@ -301,6 +302,32 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() { | |||||
| using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam; | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| for (auto&& algo : f32_implicit_bmm) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| #if CUDA_VERSION >= 10020 | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| for (auto&& algo : f16_implicit_bmm) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| #endif | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { | void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { | ||||
| using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; | using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; | ||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2}); | int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2}); | ||||
| @@ -84,7 +84,9 @@ public: | |||||
| CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8, | CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8, | ||||
| CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4, | CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4, | ||||
| CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4, | CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4, | ||||
| CUDA_FALLBACK_NCHW_INT4 | |||||
| CUDA_FALLBACK_NCHW_INT4, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, | |||||
| }; | }; | ||||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
| @@ -503,6 +505,8 @@ public: | |||||
| * +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm | * +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm | ||||
| * +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm | * +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm | ||||
| * + | * + | ||||
| * +--- AlgoFloat32NCHWImplicitBatchedGemm | |||||
| * +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm | |||||
| */ | */ | ||||
| /* | /* | ||||
| @@ -516,7 +520,13 @@ public: | |||||
| // corresponds to cutlass::conv::ConvType. we hope that algo.h does not | // corresponds to cutlass::conv::ConvType. we hope that algo.h does not | ||||
| // depend on cutlass headers | // depend on cutlass headers | ||||
| enum class ConvType { kConvolution, kBatchConvolution, kLocal, kLocalShare }; | |||||
| enum class ConvType { | |||||
| kConvolution, | |||||
| kBatchConvolution, | |||||
| kLocal, | |||||
| kLocalShare, | |||||
| kDepthwiseConvolution, | |||||
| }; | |||||
| // common parameters for operation selection | // common parameters for operation selection | ||||
| struct AlgoParam { | struct AlgoParam { | ||||
| @@ -558,7 +568,8 @@ public: | |||||
| size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, | size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, | ||||
| const void* alpha, const void* beta, const void* gamma, const void* delta, | const void* alpha, const void* beta, const void* gamma, const void* delta, | ||||
| const void* theta, const void* threshold, const void* dst_scale, | const void* theta, const void* threshold, const void* dst_scale, | ||||
| cudaStream_t stream, const void* extra_param = nullptr) const; | |||||
| cudaStream_t stream, const void* extra_param = nullptr, | |||||
| size_t groups = 1) const; | |||||
| protected: | protected: | ||||
| AlgoParam m_algo_param; | AlgoParam m_algo_param; | ||||
| @@ -992,6 +1003,54 @@ private: | |||||
| }; | }; | ||||
| #endif | #endif | ||||
| class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final | |||||
| : public AlgoCutlassConvolutionBase { | |||||
| public: | |||||
| AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : AlgoCutlassConvolutionBase(algo_param) { | |||||
| m_name = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| ssprintf( | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str()), | |||||
| ConvBias::DirectParam{}); | |||||
| } | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { | |||||
| return 0; | |||||
| } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); }; | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32); | |||||
| private: | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final | |||||
| : public AlgoCutlassConvolutionBase { | |||||
| public: | |||||
| AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : AlgoCutlassConvolutionBase(algo_param) { | |||||
| m_name = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| ssprintf( | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str()), | |||||
| ConvBias::DirectParam{}); | |||||
| } | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { | |||||
| return 0; | |||||
| } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); }; | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16); | |||||
| private: | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase { | class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase { | ||||
| public: | public: | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| @@ -1048,6 +1107,8 @@ public: | |||||
| std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | ||||
| std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | ||||
| #endif | #endif | ||||
| std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm; | |||||
| std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm; | |||||
| AlgoGroupConvGeneral group; | AlgoGroupConvGeneral group; | ||||
| AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
| @@ -1063,6 +1124,7 @@ private: | |||||
| #endif | #endif | ||||
| void fill_cudnn_algos(); | void fill_cudnn_algos(); | ||||
| void fill_dp4a_algos(); | void fill_dp4a_algos(); | ||||
| void fill_dwconv_algos(); | |||||
| }; | }; | ||||
| } // namespace cuda | } // namespace cuda | ||||
| @@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) { | |||||
| return cutlass::conv::ConvType::kLocal; | return cutlass::conv::ConvType::kLocal; | ||||
| case Base::ConvType::kLocalShare: | case Base::ConvType::kLocalShare: | ||||
| return cutlass::conv::ConvType::kLocalShare; | return cutlass::conv::ConvType::kLocalShare; | ||||
| case Base::ConvType::kDepthwiseConvolution: | |||||
| return cutlass::conv::ConvType::kDepthwiseConvolution; | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid conv type"); | megdnn_assert(0, "invalid conv type"); | ||||
| } | } | ||||
| } | } | ||||
| NumericTypeID convert_dtype(DTypeEnum dtype) { | |||||
| switch (dtype) { | |||||
| NumericTypeID convert_dtype(DType dtype) { | |||||
| // just make convolution with no bias happy | |||||
| if (!dtype.valid()) | |||||
| return NumericTypeID::kF32; | |||||
| switch (dtype.enumv()) { | |||||
| case DTypeEnum::Float32: | case DTypeEnum::Float32: | ||||
| return NumericTypeID::kF32; | return NumericTypeID::kF32; | ||||
| case DTypeEnum::Float16: | case DTypeEnum::Float16: | ||||
| @@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) { | |||||
| } | } | ||||
| } | } | ||||
| NumericTypeID get_accumulator_dtype( | |||||
| DType dtype, const param::ConvBias::ComputeMode comp_mode) { | |||||
| if (dtype.category() == DTypeCategory::QUANTIZED) { | |||||
| return NumericTypeID::kS32; | |||||
| } else { | |||||
| megdnn_assert(dtype.category() == DTypeCategory::FLOAT); | |||||
| if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) { | |||||
| return convert_dtype(dtype); | |||||
| } else { | |||||
| megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32); | |||||
| return NumericTypeID::kF32; | |||||
| } | |||||
| } | |||||
| } | |||||
| struct LayoutPack { | struct LayoutPack { | ||||
| LayoutTypeID src; | LayoutTypeID src; | ||||
| LayoutTypeID filter; | LayoutTypeID filter; | ||||
| @@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid access_type"); | megdnn_assert(0, "invalid access_type"); | ||||
| } | } | ||||
| case Format::NCHW: | |||||
| return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW, | |||||
| LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW}; | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid format"); | megdnn_assert(0, "invalid format"); | ||||
| } | } | ||||
| @@ -177,6 +200,93 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla | |||||
| megdnn_assert(0, "invalid nonlinear mode"); | megdnn_assert(0, "invalid nonlinear mode"); | ||||
| } | } | ||||
| std::pair<int, int> get_tensor_alignment( | |||||
| const param::ConvBias::Format format, const TensorLayout& src, | |||||
| const TensorLayout& filter, const Base::AlgoParam& algo_param, | |||||
| bool is_chanwise) { | |||||
| int alignment_src = 0; | |||||
| int alignment_filter = 0; | |||||
| using Format = param::ConvBias::Format; | |||||
| // get tensor alignment for tensor op operations | |||||
| // for tensor op operations, the alignment is determined by the size of a vector | |||||
| auto get_tensor_alignment_tensor_op = [&]() { | |||||
| switch (format) { | |||||
| /// case int8 | |||||
| case Format::NCHW32: | |||||
| case Format::NCHW32_NCHW4: | |||||
| alignment_src = 16; | |||||
| alignment_filter = 16; | |||||
| break; | |||||
| /// case int4 or uint4 | |||||
| case Format::NCHW64: | |||||
| alignment_src = 32; | |||||
| alignment_filter = 32; | |||||
| break; | |||||
| case Format::NHWC: | |||||
| alignment_src = alignment_filter = algo_param.access_size; | |||||
| break; | |||||
| default: | |||||
| megdnn_throw("invalid format"); | |||||
| }; | |||||
| }; | |||||
| // get tensor alignment for dot product operations | |||||
| // for integer dot product operations, alignment src is always 4 | |||||
| // and the alignment filter is determined by the threadblock shape | |||||
| auto get_tensor_alignment_dp4a = [&]() { | |||||
| megdnn_assert( | |||||
| format == Format::NCHW4 || format == Format::NCHW4_NCHW || | |||||
| format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32); | |||||
| alignment_src = 4; | |||||
| // determine alignment filter | |||||
| constexpr int warp_size = 32; | |||||
| int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n * | |||||
| algo_param.threadblock_k / | |||||
| (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k); | |||||
| int threadblock_loads = filter.dtype.size( | |||||
| algo_param.threadblock_m * algo_param.threadblock_n * | |||||
| algo_param.threadblock_k); | |||||
| int load_per_thread = threadblock_loads / threads; | |||||
| if (load_per_thread >= 16) | |||||
| alignment_filter = 16; | |||||
| else if (load_per_thread >= 8) | |||||
| alignment_filter = 8; | |||||
| else { | |||||
| megdnn_assert(load_per_thread >= 4); | |||||
| alignment_filter = 4; | |||||
| } | |||||
| }; | |||||
| // get tensor alignment for depthwise convolution | |||||
| auto get_tensor_alignment_dwconv2d_nchw = [&]() { | |||||
| alignment_filter = 1; | |||||
| size_t wi = src.dtype.size(src[3]); // width extent in bytes | |||||
| for (size_t candidate : {16, 4, 2}) { | |||||
| if (wi % candidate == 0) { | |||||
| alignment_src = candidate; | |||||
| break; | |||||
| } | |||||
| } | |||||
| alignment_src /= src.dtype.size(1); | |||||
| }; | |||||
| if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 || | |||||
| format == Format::NCHW64 || format == Format::NCHW64) { | |||||
| get_tensor_alignment_tensor_op(); | |||||
| } else if ( | |||||
| format == Format::NCHW4 || format == Format::NCHW4_NCHW || | |||||
| format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) { | |||||
| get_tensor_alignment_dp4a(); | |||||
| } else { | |||||
| /// the following is used for depthwise convolution | |||||
| megdnn_assert(format == Format::NCHW && is_chanwise); | |||||
| get_tensor_alignment_dwconv2d_nchw(); | |||||
| } | |||||
| megdnn_assert(alignment_src >= 1 && alignment_filter >= 1); | |||||
| return {alignment_src, alignment_filter}; | |||||
| } | |||||
| } // namespace | } // namespace | ||||
| const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | ||||
| @@ -185,23 +295,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co | |||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | ||||
| auto epilogue_type = get_epilogue_type( | auto epilogue_type = get_epilogue_type( | ||||
| param.nonlineMode, args.dst_layout->dtype.enumv() != DTypeEnum::Float32); | |||||
| param.nonlineMode, | |||||
| args.dst_layout->dtype.category() != DTypeCategory::FLOAT); | |||||
| cutlass::conv::SpecialOptimizeDesc special_optimization = | cutlass::conv::SpecialOptimizeDesc special_optimization = | ||||
| (use_conv_filter_unity_opt) | (use_conv_filter_unity_opt) | ||||
| ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY | ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY | ||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | : cutlass::conv::SpecialOptimizeDesc::NONE; | ||||
| int alignment_src, alignment_filter; | |||||
| auto&& fm = args.filter_meta; | |||||
| bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 && | |||||
| fm.ocpg == 1; | |||||
| std::tie(alignment_src, alignment_filter) = get_tensor_alignment( | |||||
| param.format, *args.src_layout, *args.filter_layout, m_algo_param, | |||||
| is_chanwise); | |||||
| auto accumulator_dtype = | |||||
| get_accumulator_dtype(args.src_layout->dtype, param.compute_mode); | |||||
| ConvolutionKey key{ | ConvolutionKey key{ | ||||
| convert_conv_op(conv_op), | convert_conv_op(conv_op), | ||||
| convert_dtype(args.src_layout->dtype.enumv()), | |||||
| convert_dtype(args.src_layout->dtype), | |||||
| layouts.src, | layouts.src, | ||||
| convert_dtype(args.filter_layout->dtype.enumv()), | |||||
| convert_dtype(args.filter_layout->dtype), | |||||
| layouts.filter, | layouts.filter, | ||||
| convert_dtype(args.dst_layout->dtype.enumv()), | |||||
| convert_dtype(args.dst_layout->dtype), | |||||
| layouts.dst, | layouts.dst, | ||||
| convert_dtype(args.bias_layout->dtype.enumv()), | |||||
| convert_dtype(args.bias_layout->dtype), | |||||
| layouts.bias, | layouts.bias, | ||||
| accumulator_dtype, | |||||
| convert_conv_type(conv_type), | convert_conv_type(conv_type), | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -215,6 +338,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co | |||||
| epilogue_type, | epilogue_type, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| alignment_src, | |||||
| alignment_filter, | |||||
| without_shared_load}; | without_shared_load}; | ||||
| return Singleton::get().operation_table.find_op(key); | return Singleton::get().operation_table.find_op(key); | ||||
| @@ -227,13 +352,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op( | |||||
| size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha, | size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha, | ||||
| const void* beta, const void* gamma, const void* delta, const void* theta, | const void* beta, const void* gamma, const void* delta, const void* theta, | ||||
| const void* threshold, const void* dst_scale, cudaStream_t stream, | const void* threshold, const void* dst_scale, cudaStream_t stream, | ||||
| const void* extra_param) const { | |||||
| const void* extra_param, size_t groups) const { | |||||
| // gcc prints warnings when size_t values are implicitly narrowed to int | // gcc prints warnings when size_t values are implicitly narrowed to int | ||||
| cutlass::conv::Conv2dProblemSize problem_size{ | cutlass::conv::Conv2dProblemSize problem_size{ | ||||
| int(n), int(hi), int(wi), int(ci), | |||||
| int(co), int(fh), int(fw), int(ho), | |||||
| int(wo), int(ph), int(pw), int(sh), | |||||
| int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation}; | |||||
| int(n), int(hi), int(wi), int(ci), | |||||
| int(co), int(fh), int(fw), int(ho), | |||||
| int(wo), int(ph), int(pw), int(sh), | |||||
| int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| int(groups), // groups | |||||
| }; | |||||
| ConvolutionArguments conv_args{ | ConvolutionArguments conv_args{ | ||||
| problem_size, src, filter, bias, z, dst, alpha, | problem_size, src, filter, bias, z, dst, alpha, | ||||
| @@ -0,0 +1,95 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | |||||
| using Param = param::ConvBias; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE( | |||||
| args.bias_layout->ndim <= 0 || | |||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| check_bias_share_in_channel(*args.bias_layout, param.format))); | |||||
| RETURN_IF_FALSE( | |||||
| args.z_layout->ndim <= 0 || | |||||
| args.z_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2), | |||||
| wi = args.src_layout->operator[](3); | |||||
| size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3); | |||||
| size_t co = fm.group; | |||||
| size_t ci = co; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr; | |||||
| float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr; | |||||
| // dummy parameters, used for quantization cases | |||||
| float theta = 0.f; | |||||
| float delta = 0.f; | |||||
| float threshold = 0.f; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| UNPACK_CONV_PARAMETER(fm, param); | |||||
| MARK_USED_VAR | |||||
| execute_cutlass_conv_op( | |||||
| op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr, | |||||
| z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, | |||||
| &threshold, nullptr, stream, nullptr, fm.group); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,95 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | |||||
| using Param = param::ConvBias; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE( | |||||
| args.bias_layout->ndim <= 0 || | |||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| check_bias_share_in_channel(*args.bias_layout, param.format))); | |||||
| RETURN_IF_FALSE( | |||||
| args.z_layout->ndim <= 0 || | |||||
| args.z_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2), | |||||
| wi = args.src_layout->operator[](3); | |||||
| size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3); | |||||
| size_t co = fm.group; | |||||
| size_t ci = co; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr; | |||||
| float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr; | |||||
| // dummy parameters, used for quantization cases | |||||
| float theta = 0.f; | |||||
| float delta = 0.f; | |||||
| float threshold = 0.f; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| UNPACK_CONV_PARAMETER(fm, param); | |||||
| MARK_USED_VAR | |||||
| execute_cutlass_conv_op( | |||||
| op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr, | |||||
| z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, | |||||
| &threshold, nullptr, stream, nullptr, fm.group); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -71,6 +71,9 @@ public: | |||||
| class AlgoInt4Int4NHWCIMMAImplicitGemm; | class AlgoInt4Int4NHWCIMMAImplicitGemm; | ||||
| class AlgoUInt4Int4NHWCIMMAImplicitGemm; | class AlgoUInt4Int4NHWCIMMAImplicitGemm; | ||||
| class AlgoBFloat16; | class AlgoBFloat16; | ||||
| // The following algorithms are suitable for channel wise convolution | |||||
| class AlgoFloat32NCHWFMAImplicitBatchedGemm; | |||||
| class AlgoFloat16NCHWHMMAImplicitBatchedGemm; | |||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| 4, | |||||
| 16, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm:: | |||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| 16, | 16, | ||||
| 64, | 64, | ||||
| @@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm:: | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| 2, | 2, | ||||
| special_optimization, | special_optimization, | ||||
| 4, | |||||
| 4, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail | |||||
| LayoutTypeID::kTensorNHWC, | LayoutTypeID::kTensorNHWC, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNHWC, | LayoutTypeID::kTensorNHWC, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| m_algo_param.access_size, | |||||
| m_algo_param.access_size, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -54,24 +54,28 @@ namespace library { | |||||
| void initialize_all_gemm_simt_operations(Manifest& manifest); | void initialize_all_gemm_simt_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_simt_operations(Manifest& manifest); | void initialize_all_conv2d_simt_operations(Manifest& manifest); | ||||
| void initialize_all_deconv_simt_operations(Manifest& manifest); | void initialize_all_deconv_simt_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | ||||
| void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | ||||
| void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | ||||
| void initialize_all_deconv_tensorop8816_operations(Manifest& manifest); | void initialize_all_deconv_tensorop8816_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); | |||||
| #endif | #endif | ||||
| void initialize_all(Manifest& manifest) { | void initialize_all(Manifest& manifest) { | ||||
| initialize_all_gemm_simt_operations(manifest); | initialize_all_gemm_simt_operations(manifest); | ||||
| initialize_all_conv2d_simt_operations(manifest); | initialize_all_conv2d_simt_operations(manifest); | ||||
| initialize_all_deconv_simt_operations(manifest); | initialize_all_deconv_simt_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_simt_operations(manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | ||||
| initialize_all_gemm_tensorop884_operations(manifest); | initialize_all_gemm_tensorop884_operations(manifest); | ||||
| initialize_all_gemm_tensorop1688_operations(manifest); | initialize_all_gemm_tensorop1688_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8816_operations(manifest); | initialize_all_conv2d_tensorop8816_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8832_operations(manifest); | initialize_all_conv2d_tensorop8832_operations(manifest); | ||||
| initialize_all_deconv_tensorop8816_operations(manifest); | initialize_all_deconv_tensorop8816_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID { | |||||
| kConvolutionFpropTrans, | kConvolutionFpropTrans, | ||||
| kConvolutionDgradNCxHWx, | kConvolutionDgradNCxHWx, | ||||
| kConvolutionDgradTrans, | kConvolutionDgradTrans, | ||||
| kDepthwiseConvolutionFprop, | |||||
| kDepthwiseConvolutionDgrad, | |||||
| kDepthwiseConvolutionWgrad, | |||||
| kInvalid | kInvalid | ||||
| }; | }; | ||||
| @@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap< | |||||
| ThreadblockSwizzleID::kConvolutionDgradTrans; | ThreadblockSwizzleID::kConvolutionDgradTrans; | ||||
| }; | }; | ||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionFprop; | |||||
| }; | |||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionDgrad; | |||||
| }; | |||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionWgrad; | |||||
| }; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template <typename Element, typename Layout> | template <typename Element, typename Layout> | ||||
| @@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) | |||||
| key.layout_dst = desc.dst.layout; | key.layout_dst = desc.dst.layout; | ||||
| key.element_bias = desc.bias.element; | key.element_bias = desc.bias.element; | ||||
| key.layout_bias = desc.bias.layout; | key.layout_bias = desc.bias.layout; | ||||
| key.element_accumulator = | |||||
| desc.tile_description.math_instruction.element_accumulator; | |||||
| key.convolution_type = desc.convolution_type; | key.convolution_type = desc.convolution_type; | ||||
| @@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) | |||||
| key.stages = desc.tile_description.threadblock_stages; | key.stages = desc.tile_description.threadblock_stages; | ||||
| key.special_optimization = desc.special_optimization; | key.special_optimization = desc.special_optimization; | ||||
| key.alignment_src = desc.src.alignment; | |||||
| key.alignment_filter = desc.filter.alignment; | |||||
| key.without_shared_load = desc.without_shared_load; | key.without_shared_load = desc.without_shared_load; | ||||
| return key; | return key; | ||||
| @@ -188,6 +188,7 @@ struct ConvolutionKey { | |||||
| library::LayoutTypeID layout_dst; | library::LayoutTypeID layout_dst; | ||||
| library::NumericTypeID element_bias; | library::NumericTypeID element_bias; | ||||
| library::LayoutTypeID layout_bias; | library::LayoutTypeID layout_bias; | ||||
| NumericTypeID element_accumulator; | |||||
| conv::ConvType convolution_type; | conv::ConvType convolution_type; | ||||
| @@ -206,6 +207,10 @@ struct ConvolutionKey { | |||||
| epilogue::EpilogueType epilogue_type; | epilogue::EpilogueType epilogue_type; | ||||
| int stages; | int stages; | ||||
| conv::SpecialOptimizeDesc special_optimization; | conv::SpecialOptimizeDesc special_optimization; | ||||
| int alignment_src; | |||||
| int alignment_filter; | |||||
| bool without_shared_load; | bool without_shared_load; | ||||
| inline bool operator==(ConvolutionKey const& rhs) const { | inline bool operator==(ConvolutionKey const& rhs) const { | ||||
| @@ -215,6 +220,7 @@ struct ConvolutionKey { | |||||
| (layout_filter == rhs.layout_filter) && | (layout_filter == rhs.layout_filter) && | ||||
| (element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) && | (element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) && | ||||
| (element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) && | (element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) && | ||||
| (element_accumulator == rhs.element_accumulator) && | |||||
| (convolution_type == rhs.convolution_type) && | (convolution_type == rhs.convolution_type) && | ||||
| (threadblock_shape_m == rhs.threadblock_shape_m) && | (threadblock_shape_m == rhs.threadblock_shape_m) && | ||||
| (threadblock_shape_n == rhs.threadblock_shape_n) && | (threadblock_shape_n == rhs.threadblock_shape_n) && | ||||
| @@ -227,6 +233,8 @@ struct ConvolutionKey { | |||||
| (instruction_shape_k == rhs.instruction_shape_k) && | (instruction_shape_k == rhs.instruction_shape_k) && | ||||
| (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | ||||
| (special_optimization == rhs.special_optimization) && | (special_optimization == rhs.special_optimization) && | ||||
| (alignment_src == rhs.alignment_src) && | |||||
| (alignment_filter == rhs.alignment_filter) && | |||||
| (without_shared_load == rhs.without_shared_load); | (without_shared_load == rhs.without_shared_load); | ||||
| } | } | ||||
| @@ -254,6 +262,7 @@ struct ConvolutionKey { | |||||
| "\n layout_dst: " + to_string(layout_dst) + | "\n layout_dst: " + to_string(layout_dst) + | ||||
| "\n element_bias: " + to_string(element_bias) + | "\n element_bias: " + to_string(element_bias) + | ||||
| "\n layout_bias: " + to_string(layout_bias) + | "\n layout_bias: " + to_string(layout_bias) + | ||||
| "\n element_accumulator: " + to_string(element_accumulator) + | |||||
| "\n convolution_type: " + to_string(convolution_type) + | "\n convolution_type: " + to_string(convolution_type) + | ||||
| "\n threadblock_shape: " + threadblock_shape_str + | "\n threadblock_shape: " + threadblock_shape_str + | ||||
| "\n warp_shape: " + warp_shape_str + | "\n warp_shape: " + warp_shape_str + | ||||
| @@ -261,6 +270,8 @@ struct ConvolutionKey { | |||||
| "\n epilogue_type: " + to_string(epilogue_type) + | "\n epilogue_type: " + to_string(epilogue_type) + | ||||
| "\n stages: " + std::to_string(stages) + | "\n stages: " + std::to_string(stages) + | ||||
| "\n special_optimization: " + to_string(special_optimization) + | "\n special_optimization: " + to_string(special_optimization) + | ||||
| "\n alignment_src: " + std::to_string(alignment_src) + | |||||
| "\n alignment_filter: " + std::to_string(alignment_filter) + | |||||
| "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; | "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; | ||||
| } | } | ||||
| }; | }; | ||||
| @@ -278,6 +289,7 @@ struct ConvolutionKeyHasher { | |||||
| .update(&key.layout_dst, sizeof(key.layout_dst)) | .update(&key.layout_dst, sizeof(key.layout_dst)) | ||||
| .update(&key.element_bias, sizeof(key.element_bias)) | .update(&key.element_bias, sizeof(key.element_bias)) | ||||
| .update(&key.layout_bias, sizeof(key.layout_bias)) | .update(&key.layout_bias, sizeof(key.layout_bias)) | ||||
| .update(&key.element_accumulator, sizeof(key.element_accumulator)) | |||||
| .update(&key.convolution_type, sizeof(key.convolution_type)) | .update(&key.convolution_type, sizeof(key.convolution_type)) | ||||
| .update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m)) | .update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m)) | ||||
| .update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n)) | .update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n)) | ||||
| @@ -291,6 +303,8 @@ struct ConvolutionKeyHasher { | |||||
| .update(&key.epilogue_type, sizeof(key.epilogue_type)) | .update(&key.epilogue_type, sizeof(key.epilogue_type)) | ||||
| .update(&key.stages, sizeof(key.stages)) | .update(&key.stages, sizeof(key.stages)) | ||||
| .update(&key.special_optimization, sizeof(key.special_optimization)) | .update(&key.special_optimization, sizeof(key.special_optimization)) | ||||
| .update(&key.alignment_src, sizeof(key.alignment_src)) | |||||
| .update(&key.alignment_filter, sizeof(key.alignment_filter)) | |||||
| .update(&key.without_shared_load, sizeof(key.without_shared_load)) | .update(&key.without_shared_load, sizeof(key.without_shared_load)) | ||||
| .digest(); | .digest(); | ||||
| } | } | ||||
| @@ -38,8 +38,10 @@ bool check_need_full_bench() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| Convolution::Param gconv_param(Convolution::Param p) { | |||||
| Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) { | |||||
| p.sparse = Convolution::Param::Sparse::GROUP; | p.sparse = Convolution::Param::Sparse::GROUP; | ||||
| if (io16xc32) | |||||
| p.compute_mode = Convolution::Param::ComputeMode::FLOAT32; | |||||
| return p; | return p; | ||||
| } | } | ||||
| @@ -421,6 +423,129 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) { | |||||
| } | } | ||||
| } | } | ||||
| namespace { | |||||
| template <typename Op> | |||||
| struct AlgoCheckerMaker { | |||||
| static auto make(const char* name, bool* require_algo) { | |||||
| return AlgoChecker<Op>(name, require_algo); | |||||
| } | |||||
| }; | |||||
| template <> | |||||
| struct AlgoCheckerMaker<ConvolutionForward> { | |||||
| static auto make(const char* name, bool* require_algo) { | |||||
| return AlgoChecker<ConvolutionForward>( | |||||
| ExecutionPolicyAlgoName{ | |||||
| "DEFAULT", | |||||
| {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||||
| name, {}) | |||||
| .c_str(), | |||||
| {}}}}, | |||||
| require_algo); | |||||
| } | |||||
| }; | |||||
| template <typename Op> | |||||
| void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) { | |||||
| Checker<Op> checker(handle); | |||||
| bool require_algo = false; | |||||
| checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo)); | |||||
| checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type); | |||||
| bool io16xc32 = false; | |||||
| if (io_type == dtype::Float16()) { | |||||
| if (comp_type == dtype::Float16()) { | |||||
| checker.set_epsilon(1e-1); | |||||
| } else { | |||||
| io16xc32 = true; | |||||
| } | |||||
| } | |||||
| // dispatch testcase by operation | |||||
| if (std::is_same<Op, ConvolutionForward>::value) { | |||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| } else if (std::is_same<Op, ConvolutionBackwardData>::value) { | |||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}}); | |||||
| } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) { | |||||
| } | |||||
| } | |||||
| } // namespace | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb) \ | |||||
| cb(1, 128, 128, 8, 32, 64, 8); \ | |||||
| cb(2, 128, 64, 8, 64, 32, 8); \ | |||||
| cb(3, 128, 32, 8, 64, 32, 8); \ | |||||
| cb(4, 64, 128, 8, 64, 32, 8); \ | |||||
| cb(5, 32, 128, 8, 32, 64, 8); \ | |||||
| cb(6, 64, 64, 8, 64, 32, 8); \ | |||||
| cb(7, 32, 64, 8, 32, 64, 8); \ | |||||
| cb(8, 32, 32, 8, 32, 32, 8); \ | |||||
| cb(9, 64, 32, 8, 64, 32, 8); | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_FMA_KERNEL | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb) \ | |||||
| cb(1, 128, 128, 32, 32, 32, 32); \ | |||||
| cb(2, 128, 256, 32, 64, 64, 32); \ | |||||
| cb(3, 128, 64, 32, 32, 32, 32); \ | |||||
| cb(4, 64, 128, 32, 32, 32, 32); \ | |||||
| cb(5, 64, 64, 32, 32, 32, 32); | |||||
| // check both ioc16 and io16xc32 | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float16(), dtype::Float16(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float16(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL | |||||
| #if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | ||||
| auto handle = handle_cuda(); | auto handle = handle_cuda(); | ||||
| @@ -1123,6 +1248,82 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) { | |||||
| // clang-format on | // clang-format on | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) { | |||||
| CUBenchmarker<ConvolutionForward> bencher(handle_cuda()); | |||||
| size_t RUNS = 100; | |||||
| bencher.set_display(false).set_times(RUNS); | |||||
| std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||||
| new OprProxy<ConvolutionForward>{true}}; | |||||
| bencher.set_proxy(proxy); | |||||
| Convolution::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| param.sparse = Convolution::Param::Sparse::GROUP; | |||||
| NormalRNG rng; | |||||
| auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) { | |||||
| param.pad_h = f / 2; | |||||
| param.pad_w = f / 2; | |||||
| param.stride_h = s; | |||||
| param.stride_w = s; | |||||
| param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}; | |||||
| TensorLayout dst_layout; | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout( | |||||
| {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout); | |||||
| float bandwith = static_cast<float>( | |||||
| src.total_nr_elems() + filter.total_nr_elems() + | |||||
| dst_layout.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| bencher.proxy()->target_execution_policy.algo.reset(); | |||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| bencher.set_param(param); | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | |||||
| "float16: %.2fms %.2fGB/s " | |||||
| "pseudo float16: %.2fms %.2fGB/s " | |||||
| "speedup: " | |||||
| "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n", | |||||
| s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32, | |||||
| bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16, | |||||
| bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16, | |||||
| bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16, | |||||
| time_in_ms_pseudo_fp16 / time_in_ms_fp16); | |||||
| }; | |||||
| // clang-format off | |||||
| for (size_t b : {32, 64}) | |||||
| for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) { | |||||
| run(b, 384, 32, 32, f, 1); | |||||
| run(b, 384, 64, 64, f, 1); | |||||
| } | |||||
| // clang-format on | |||||
| } | |||||
| #endif | #endif | ||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||