| @@ -13,6 +13,12 @@ genrule( | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type simt $(@D) | CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type simt $(@D) | ||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8816 $(@D) | CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8816 $(@D) | ||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8832 $(@D) | CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8832 $(@D) | ||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type simt $(@D) | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type tensorop884 $(@D) | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type simt $(@D) | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D) | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D) | |||||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D) | |||||
| """, | """, | ||||
| tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], | tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], | ||||
| visibility = ["//visibility:public"], | visibility = ["//visibility:public"], | ||||
| @@ -3,6 +3,9 @@ from generator import ( | |||||
| GenerateGemvOperations, | GenerateGemvOperations, | ||||
| GenerateConv2dOperations, | GenerateConv2dOperations, | ||||
| GenerateDeconvOperations, | GenerateDeconvOperations, | ||||
| GenerateDwconv2dFpropOperations, | |||||
| GenerateDwconv2dDgradOperations, | |||||
| GenerateDwconv2dWgradOperations, | |||||
| ) | ) | ||||
| @@ -21,6 +24,12 @@ def write_op_list(f, gen_op, gen_type): | |||||
| operations = GenerateConv2dOperations(GenArg(gen_op, gen_type)) | operations = GenerateConv2dOperations(GenArg(gen_op, gen_type)) | ||||
| elif gen_op == "deconv": | elif gen_op == "deconv": | ||||
| operations = GenerateDeconvOperations(GenArg(gen_op, gen_type)) | operations = GenerateDeconvOperations(GenArg(gen_op, gen_type)) | ||||
| elif gen_op == "dwconv2d_fprop": | |||||
| operations = GenerateDwconv2dFpropOperations(GenArg(gen_op, gen_type)) | |||||
| elif gen_op == "dwconv2d_dgrad": | |||||
| operations = GenerateDwconv2dDgradOperations(GenArg(gen_op, gen_type)) | |||||
| elif gen_op == "dwconv2d_wgrad": | |||||
| operations = GenerateDwconv2dWgradOperations(GenArg(gen_op, gen_type)) | |||||
| for op in operations: | for op in operations: | ||||
| f.write(' "%s.cu",\n' % op.procedural_name()) | f.write(' "%s.cu",\n' % op.procedural_name()) | ||||
| if gen_op != "gemv": | if gen_op != "gemv": | ||||
| @@ -40,4 +49,10 @@ if __name__ == "__main__": | |||||
| write_op_list(f, "conv2d", "simt") | write_op_list(f, "conv2d", "simt") | ||||
| write_op_list(f, "conv2d", "tensorop8816") | write_op_list(f, "conv2d", "tensorop8816") | ||||
| write_op_list(f, "conv2d", "tensorop8832") | write_op_list(f, "conv2d", "tensorop8832") | ||||
| write_op_list(f, "dwconv2d_fprop", "simt") | |||||
| write_op_list(f, "dwconv2d_fprop", "tensorop884") | |||||
| write_op_list(f, "dwconv2d_dgrad", "simt") | |||||
| write_op_list(f, "dwconv2d_dgrad", "tensorop884") | |||||
| write_op_list(f, "dwconv2d_wgrad", "simt") | |||||
| write_op_list(f, "dwconv2d_wgrad", "tensorop884") | |||||
| f.write("]") | f.write("]") | ||||
| @@ -10,24 +10,25 @@ import shutil | |||||
| from library import * | from library import * | ||||
| from gemm_operation import * | from gemm_operation import * | ||||
| from conv2d_operation import * | |||||
| from conv2d_operation import * | |||||
| ################################################################################################### | ################################################################################################### | ||||
| class EmitOperationKindLibrary: | class EmitOperationKindLibrary: | ||||
| def __init__(self, generated_path, kind, args): | |||||
| self.generated_path = generated_path | |||||
| self.kind = kind | |||||
| self.args = args | |||||
| def __init__(self, generated_path, kind, args): | |||||
| self.generated_path = generated_path | |||||
| self.kind = kind | |||||
| self.args = args | |||||
| self.emitters = { | |||||
| OperationKind.Gemm: EmitGemmConfigurationLibrary | |||||
| , OperationKind.Conv2d: EmitConv2dConfigurationLibrary | |||||
| } | |||||
| self.emitters = { | |||||
| OperationKind.Gemm: EmitGemmConfigurationLibrary, | |||||
| OperationKind.Conv2d: EmitConv2dConfigurationLibrary, | |||||
| } | |||||
| self.configurations = []; | |||||
| self.configurations = [] | |||||
| self.header_template =""" | |||||
| self.header_template = """ | |||||
| /* | /* | ||||
| Generated by manifest.py - Do not edit. | Generated by manifest.py - Do not edit. | ||||
| */ | */ | ||||
| @@ -42,17 +43,19 @@ namespace library { | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| """ | """ | ||||
| self.entry_template = """ | |||||
| self.entry_template = """ | |||||
| // | // | ||||
| // Entry point to construct operations | // Entry point to construct operations | ||||
| // | // | ||||
| void initialize_all_${operation_name}_operations(Manifest &manifest) { | void initialize_all_${operation_name}_operations(Manifest &manifest) { | ||||
| """ | """ | ||||
| self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" | |||||
| self.configuration_template =" initialize_${configuration_name}(manifest);\n" | |||||
| self.configuration_prototype_template = ( | |||||
| "void initialize_${configuration_name}(Manifest &manifest);\n" | |||||
| ) | |||||
| self.configuration_template = " initialize_${configuration_name}(manifest);\n" | |||||
| self.epilogue_template =""" | |||||
| self.epilogue_template = """ | |||||
| } | } | ||||
| @@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) { | |||||
| """ | """ | ||||
| # | |||||
| def __enter__(self): | |||||
| self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind]) | |||||
| os.mkdir(self.operation_path) | |||||
| # | |||||
| def __enter__(self): | |||||
| self.operation_path = os.path.join( | |||||
| self.generated_path, OperationKindNames[self.kind] | |||||
| ) | |||||
| os.mkdir(self.operation_path) | |||||
| self.top_level_path = os.path.join( | |||||
| self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind] | |||||
| ) | |||||
| self.top_level_file = open(self.top_level_path, "w") | |||||
| self.top_level_file.write(self.header_template) | |||||
| self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]) | |||||
| self.source_files = [self.top_level_path] | |||||
| self.top_level_file = open(self.top_level_path, "w") | |||||
| self.top_level_file.write(self.header_template) | |||||
| return self | |||||
| self.source_files = [self.top_level_path,] | |||||
| # | |||||
| def emit(self, configuration_name, operations): | |||||
| return self | |||||
| with self.emitters[self.kind]( | |||||
| self.operation_path, configuration_name | |||||
| ) as configuration_emitter: | |||||
| for operation in operations: | |||||
| configuration_emitter.emit(operation) | |||||
| # | |||||
| def emit(self, configuration_name, operations): | |||||
| self.source_files.append(configuration_emitter.configuration_path) | |||||
| with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter: | |||||
| for operation in operations: | |||||
| configuration_emitter.emit(operation) | |||||
| self.source_files.append(configuration_emitter.configuration_path) | |||||
| self.configurations.append(configuration_name) | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.configuration_prototype_template, | |||||
| {"configuration_name": configuration_name}, | |||||
| ) | |||||
| ) | |||||
| self.configurations.append(configuration_name) | |||||
| self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} )) | |||||
| # | |||||
| def __exit__(self, exception_type, exception_value, traceback): | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.entry_template, {"operation_name": OperationKindNames[self.kind]} | |||||
| ) | |||||
| ) | |||||
| # | |||||
| def __exit__(self, exception_type, exception_value, traceback): | |||||
| self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]})) | |||||
| for configuration_name in self.configurations: | |||||
| self.top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.configuration_template, | |||||
| {"configuration_name": configuration_name}, | |||||
| ) | |||||
| ) | |||||
| for configuration_name in self.configurations: | |||||
| self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name})) | |||||
| self.top_level_file.write(self.epilogue_template) | |||||
| self.top_level_file.close() | |||||
| self.top_level_file.write(self.epilogue_template) | |||||
| self.top_level_file.close() | |||||
| ################################################################################################### | ################################################################################################### | ||||
| ################################################################################################### | ################################################################################################### | ||||
| class Options: | class Options: | ||||
| def __init__(self): | |||||
| pass | |||||
| def __init__(self): | |||||
| pass | |||||
| ################################################################################################### | ################################################################################################### | ||||
| # | # | ||||
| class Manifest: | class Manifest: | ||||
| # | |||||
| def __init__(self, args): | |||||
| self.operations = {} | |||||
| self.args = args | |||||
| # | |||||
| def __init__(self, args): | |||||
| self.operations = {} | |||||
| self.args = args | |||||
| architectures = ( | |||||
| args.architectures.split(";") if len(args.architectures) else ["50"] | |||||
| ) | |||||
| self.compute_capabilities = [int(x) for x in architectures] | |||||
| architectures = args.architectures.split(';') if len(args.architectures) else ['50',] | |||||
| self.compute_capabilities = [int(x) for x in architectures] | |||||
| self.selected_kernels = [] | |||||
| if args.operations == 'all': | |||||
| self.operations_enabled = [] | |||||
| else: | |||||
| self.selected_kernels = [] | |||||
| operations_list = [ | |||||
| OperationKind.Gemm | |||||
| , OperationKind.Conv2d | |||||
| ] | |||||
| if args.operations == "all": | |||||
| self.operations_enabled = [] | |||||
| else: | |||||
| self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] | |||||
| operations_list = [OperationKind.Gemm, OperationKind.Conv2d] | |||||
| if args.kernels == 'all': | |||||
| self.kernel_names = [] | |||||
| else: | |||||
| self.kernel_names = [x for x in args.kernels.split(',') if x != ''] | |||||
| self.operations_enabled = [ | |||||
| x | |||||
| for x in operations_list | |||||
| if OperationKindNames[x] in args.operations.split(",") | |||||
| ] | |||||
| self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != ''] | |||||
| if args.kernels == "all": | |||||
| self.kernel_names = [] | |||||
| else: | |||||
| self.kernel_names = [x for x in args.kernels.split(",") if x != ""] | |||||
| if args.kernel_filter_file is None: | |||||
| self.kernel_filter_list = [] | |||||
| else: | |||||
| self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||||
| self.ignore_kernel_names = [ | |||||
| x for x in args.ignore_kernels.split(",") if x != "" | |||||
| ] | |||||
| if args.kernel_filter_file is None: | |||||
| self.kernel_filter_list = [] | |||||
| else: | |||||
| self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||||
| self.operation_count = 0 | |||||
| self.operations_by_name = {} | |||||
| self.top_level_prologue = ''' | |||||
| self.operation_count = 0 | |||||
| self.operations_by_name = {} | |||||
| self.top_level_prologue = """ | |||||
| #include "cutlass/library/library.h" | #include "cutlass/library/library.h" | ||||
| #include "cutlass/library/manifest.h" | #include "cutlass/library/manifest.h" | ||||
| @@ -159,208 +189,241 @@ ${prototypes} | |||||
| void initialize_all(Manifest &manifest) { | void initialize_all(Manifest &manifest) { | ||||
| ''' | |||||
| self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n' | |||||
| self.top_level_epilogue = ''' | |||||
| """ | |||||
| self.top_level_reserve = " manifest.reserve(${operation_count});\n\n" | |||||
| self.top_level_epilogue = """ | |||||
| } | } | ||||
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| ''' | |||||
| def get_kernel_filters (self, kernelListFile): | |||||
| if os.path.isfile(kernelListFile): | |||||
| with open(kernelListFile, 'r') as fileReader: | |||||
| lines = [line.rstrip() for line in fileReader if not line.startswith("#")] | |||||
| lines = [re.compile(line) for line in lines if line] | |||||
| return lines | |||||
| else: | |||||
| return [] | |||||
| """ | |||||
| def get_kernel_filters(self, kernelListFile): | |||||
| if os.path.isfile(kernelListFile): | |||||
| with open(kernelListFile, "r") as fileReader: | |||||
| lines = [ | |||||
| line.rstrip() for line in fileReader if not line.startswith("#") | |||||
| ] | |||||
| lines = [re.compile(line) for line in lines if line] | |||||
| return lines | |||||
| else: | |||||
| return [] | |||||
| def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||||
| def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||||
| for kernel_filter_re in kernel_filter_list: | |||||
| if kernel_filter_re.search(kernel_name) is not None: | |||||
| return True | |||||
| return False | |||||
| for kernel_filter_re in kernel_filter_list: | |||||
| if kernel_filter_re.search(kernel_name) is not None: | |||||
| return True | |||||
| # | |||||
| def _filter_string_matches(self, filter_string, haystack): | |||||
| ''' Returns true if all substrings appear in the haystack in order''' | |||||
| substrings = filter_string.split('*') | |||||
| for sub in substrings: | |||||
| idx = haystack.find(sub) | |||||
| if idx < 0: | |||||
| return False | return False | ||||
| haystack = haystack[idx + len(sub):] | |||||
| return True | |||||
| # | |||||
| def filter(self, operation): | |||||
| ''' Filtering operations based on various criteria''' | |||||
| # filter based on compute capability | |||||
| enabled = False | |||||
| for cc in self.compute_capabilities: | |||||
| if cc >= operation.tile_description.minimum_compute_capability and \ | |||||
| cc <= operation.tile_description.maximum_compute_capability: | |||||
| enabled = True | |||||
| break | |||||
| if not enabled: | |||||
| return False | |||||
| if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled: | |||||
| return False | |||||
| # eliminate duplicates | |||||
| if operation.procedural_name() in self.operations_by_name.keys(): | |||||
| return False | |||||
| # Filter based on list of valid substrings | |||||
| if len(self.kernel_names): | |||||
| name = operation.procedural_name() | |||||
| enabled = False | |||||
| # compare against the include list | |||||
| for name_substr in self.kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = True | |||||
| break | |||||
| # compare against the exclude list | |||||
| for name_substr in self.ignore_kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = False | |||||
| break | |||||
| if len(self.kernel_filter_list) > 0: | |||||
| # | |||||
| def _filter_string_matches(self, filter_string, haystack): | |||||
| """ Returns true if all substrings appear in the haystack in order""" | |||||
| substrings = filter_string.split("*") | |||||
| for sub in substrings: | |||||
| idx = haystack.find(sub) | |||||
| if idx < 0: | |||||
| return False | |||||
| haystack = haystack[idx + len(sub) :] | |||||
| return True | |||||
| # | |||||
| def filter(self, operation): | |||||
| """ Filtering operations based on various criteria""" | |||||
| # filter based on compute capability | |||||
| enabled = False | enabled = False | ||||
| if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): | |||||
| enabled = True | |||||
| for cc in self.compute_capabilities: | |||||
| if ( | |||||
| cc >= operation.tile_description.minimum_compute_capability | |||||
| and cc <= operation.tile_description.maximum_compute_capability | |||||
| ): | |||||
| enabled = True | |||||
| break | |||||
| if not enabled: | |||||
| return False | |||||
| if ( | |||||
| len(self.operations_enabled) | |||||
| and not operation.operation_kind in self.operations_enabled | |||||
| ): | |||||
| return False | |||||
| # eliminate duplicates | |||||
| if operation.procedural_name() in self.operations_by_name.keys(): | |||||
| return False | |||||
| # Filter based on list of valid substrings | |||||
| if len(self.kernel_names): | |||||
| name = operation.procedural_name() | |||||
| enabled = False | |||||
| # compare against the include list | |||||
| for name_substr in self.kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = True | |||||
| break | |||||
| # compare against the exclude list | |||||
| for name_substr in self.ignore_kernel_names: | |||||
| if self._filter_string_matches(name_substr, name): | |||||
| enabled = False | |||||
| break | |||||
| if len(self.kernel_filter_list) > 0: | |||||
| enabled = False | |||||
| if self.filter_out_kernels( | |||||
| operation.procedural_name(), self.kernel_filter_list | |||||
| ): | |||||
| enabled = True | |||||
| # todo: filter based on compute data type | |||||
| return enabled | |||||
| # | |||||
| # | |||||
| def append(self, operation): | |||||
| """ | |||||
| Inserts the operation. | |||||
| operation_kind -> configuration_name -> [] | |||||
| """ | |||||
| # todo: filter based on compute data type | |||||
| return enabled | |||||
| # | |||||
| if self.filter(operation): | |||||
| # | |||||
| def append(self, operation): | |||||
| ''' | |||||
| Inserts the operation. | |||||
| self.selected_kernels.append(operation.procedural_name()) | |||||
| operation_kind -> configuration_name -> [] | |||||
| ''' | |||||
| self.operations_by_name[operation.procedural_name()] = operation | |||||
| if self.filter(operation): | |||||
| self.selected_kernels.append(operation.procedural_name()) | |||||
| # add the configuration | |||||
| configuration_name = operation.configuration_name() | |||||
| self.operations_by_name[operation.procedural_name()] = operation | |||||
| if operation.operation_kind not in self.operations.keys(): | |||||
| self.operations[operation.operation_kind] = {} | |||||
| # add the configuration | |||||
| configuration_name = operation.configuration_name() | |||||
| if ( | |||||
| configuration_name | |||||
| not in self.operations[operation.operation_kind].keys() | |||||
| ): | |||||
| self.operations[operation.operation_kind][configuration_name] = [] | |||||
| if operation.operation_kind not in self.operations.keys(): | |||||
| self.operations[operation.operation_kind] = {} | |||||
| self.operations[operation.operation_kind][configuration_name].append( | |||||
| operation | |||||
| ) | |||||
| self.operation_count += 1 | |||||
| if configuration_name not in self.operations[operation.operation_kind].keys(): | |||||
| self.operations[operation.operation_kind][configuration_name] = [] | |||||
| # | |||||
| self.operations[operation.operation_kind][configuration_name].append(operation) | |||||
| self.operation_count += 1 | |||||
| # | |||||
| # | |||||
| def emit(self, target=GeneratorTarget.Library): | |||||
| # | |||||
| def emit(self, target = GeneratorTarget.Library): | |||||
| operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary} | |||||
| operation_emitters = { | |||||
| GeneratorTarget.Library: EmitOperationKindLibrary | |||||
| } | |||||
| generated_path = os.path.join(self.args.curr_build_dir, "generated") | |||||
| generated_path = os.path.join(self.args.curr_build_dir, 'generated') | |||||
| # create generated/ | |||||
| if os.path.exists(generated_path): | |||||
| shutil.rmtree(generated_path) | |||||
| # create generated/ | |||||
| if os.path.exists(generated_path): | |||||
| shutil.rmtree(generated_path) | |||||
| os.mkdir(generated_path) | |||||
| os.mkdir(generated_path) | |||||
| source_files = [] | |||||
| source_files = [] | |||||
| top_level_path = os.path.join(generated_path, "initialize_all.cpp") | |||||
| with open(top_level_path, "w") as top_level_file: | |||||
| top_level_path = os.path.join(generated_path, 'initialize_all.cpp') | |||||
| with open(top_level_path, 'w') as top_level_file: | |||||
| if target == GeneratorTarget.Library: | |||||
| source_files.append(top_level_path) | |||||
| if target == GeneratorTarget.Library: | |||||
| source_files.append(top_level_path) | |||||
| prototypes = [] | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| prototypes.append( | |||||
| SubstituteTemplate( | |||||
| "void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||||
| {"operation_kind": OperationKindNames[operation_kind]}, | |||||
| ) | |||||
| ) | |||||
| prototypes = [] | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| prototypes.append(SubstituteTemplate( | |||||
| "void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.top_level_prologue, {"prototypes": "\n".join(prototypes)} | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(SubstituteTemplate(self.top_level_prologue, | |||||
| {'prototypes': "\n".join(prototypes)})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| self.top_level_reserve, | |||||
| {"operation_count": str(self.operation_count)}, | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(SubstituteTemplate( | |||||
| self.top_level_reserve, {'operation_count': str(self.operation_count)})) | |||||
| # for each operation kind, emit initializer for all configurations | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| # for each operation kind, emit initializer for all configurations | |||||
| for operation_kind, configurations in self.operations.items(): | |||||
| with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: | |||||
| for configuration_name, operations in configurations.items(): | |||||
| operation_kind_emitter.emit(configuration_name, operations) | |||||
| with operation_emitters[target]( | |||||
| generated_path, operation_kind, self.args | |||||
| ) as operation_kind_emitter: | |||||
| for configuration_name, operations in configurations.items(): | |||||
| operation_kind_emitter.emit(configuration_name, operations) | |||||
| source_files += operation_kind_emitter.source_files | |||||
| source_files += operation_kind_emitter.source_files | |||||
| top_level_file.write(SubstituteTemplate( | |||||
| " initialize_all_${operation_kind}_operations(manifest);\n", | |||||
| {'operation_kind': OperationKindNames[operation_kind]})) | |||||
| top_level_file.write( | |||||
| SubstituteTemplate( | |||||
| " initialize_all_${operation_kind}_operations(manifest);\n", | |||||
| {"operation_kind": OperationKindNames[operation_kind]}, | |||||
| ) | |||||
| ) | |||||
| top_level_file.write(self.top_level_epilogue) | |||||
| top_level_file.write(self.top_level_epilogue) | |||||
| # write the manifest.cmake file containing paths from all targets | |||||
| manifest_path = os.path.join(generated_path, "manifest.cmake") | |||||
| with open(manifest_path, "w") as manifest_file: | |||||
| # write the manifest.cmake file containing paths from all targets | |||||
| manifest_path = os.path.join(generated_path, "manifest.cmake") | |||||
| with open(manifest_path, "w") as manifest_file: | |||||
| target_name = 'cutlass_library_objs' | |||||
| target_name = "cutlass_library_objs" | |||||
| target_text = SubstituteTemplate("""cutlass_target_sources( | |||||
| target_text = SubstituteTemplate( | |||||
| """cutlass_target_sources( | |||||
| ${target_name} | ${target_name} | ||||
| BATCH_SOURCES ON | BATCH_SOURCES ON | ||||
| PRIVATE | PRIVATE | ||||
| """, { 'target_name': target_name}) | |||||
| """, | |||||
| {"target_name": target_name}, | |||||
| ) | |||||
| manifest_file.write(target_text) | |||||
| manifest_file.write(target_text) | |||||
| for source_file in source_files: | |||||
| manifest_file.write(" %s\n" % str(source_file.replace("\\", "/"))) | |||||
| manifest_file.write(")") | |||||
| # | |||||
| for source_file in source_files: | |||||
| manifest_file.write(" %s\n" % str(source_file.replace('\\', '/'))) | |||||
| manifest_file.write(")") | |||||
| # | |||||
| ################################################################################################### | ################################################################################################### | ||||
| def GenerateManifest(args, operations, output_dir): | def GenerateManifest(args, operations, output_dir): | ||||
| assert isinstance(operations, list) | |||||
| if len(operations) == 0: | |||||
| return | |||||
| op = operations[0] | |||||
| required_cuda_ver_major = op.required_cuda_ver_major | |||||
| required_cuda_ver_minor = op.required_cuda_ver_minor | |||||
| manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)) | |||||
| f = open(manifest_path, "w") | |||||
| f.write(""" | |||||
| assert isinstance(operations, list) | |||||
| if len(operations) == 0: | |||||
| return | |||||
| op = operations[0] | |||||
| required_cuda_ver_major = op.required_cuda_ver_major | |||||
| required_cuda_ver_minor = op.required_cuda_ver_minor | |||||
| manifest_path = os.path.join( | |||||
| output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type) | |||||
| ) | |||||
| f = open(manifest_path, "w") | |||||
| f.write( | |||||
| """ | |||||
| /* | /* | ||||
| Generated by generator.py - Do not edit. | Generated by generator.py - Do not edit. | ||||
| */ | */ | ||||
| @@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir): | |||||
| namespace cutlass { | namespace cutlass { | ||||
| namespace library { | namespace library { | ||||
| """ % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor))) | |||||
| for op in operations: | |||||
| f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) | |||||
| f.write(""" | |||||
| """ | |||||
| % ( | |||||
| str(required_cuda_ver_major), | |||||
| str(required_cuda_ver_major), | |||||
| str(required_cuda_ver_minor), | |||||
| ) | |||||
| ) | |||||
| for op in operations: | |||||
| f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name()) | |||||
| f.write( | |||||
| """ | |||||
| void initialize_all_%s_%s_operations(Manifest &manifest) { | void initialize_all_%s_%s_operations(Manifest &manifest) { | ||||
| """ % (args.operations, args.type)) | |||||
| """ | |||||
| % (args.operations, args.type) | |||||
| ) | |||||
| for op in operations: | |||||
| f.write(" initialize_%s(manifest);\n" % op.procedural_name()) | |||||
| for op in operations: | |||||
| f.write(" initialize_%s(manifest);\n" % op.procedural_name()) | |||||
| f.write(""" | |||||
| f.write( | |||||
| """ | |||||
| } | } | ||||
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| #endif | #endif | ||||
| """) | |||||
| f.close() | |||||
| """ | |||||
| ) | |||||
| f.close() | |||||
| @@ -181,6 +181,12 @@ if(MGE_WITH_CUDA) | |||||
| gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) | gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES) | ||||
| gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_dgrad simt CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES) | |||||
| gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES) | |||||
| list(APPEND SOURCES ${CUTLASS_SOURCES}) | list(APPEND SOURCES ${CUTLASS_SOURCES}) | ||||
| list(APPEND SOURCES ${CUSOURCES}) | list(APPEND SOURCES ${CUSOURCES}) | ||||
| endif() | endif() | ||||
| @@ -19,6 +19,7 @@ using namespace cuda; | |||||
| ConvBiasForwardImpl::AlgoPack::AlgoPack() { | ConvBiasForwardImpl::AlgoPack::AlgoPack() { | ||||
| non_cudnn_algos.push_back(&chanwise); | non_cudnn_algos.push_back(&chanwise); | ||||
| non_cudnn_algos.push_back(&chanwise_small); | non_cudnn_algos.push_back(&chanwise_small); | ||||
| non_cudnn_algos.push_back(&depthwise_large_filter); | |||||
| non_cudnn_algos.push_back(&inplace_matmul); | non_cudnn_algos.push_back(&inplace_matmul); | ||||
| non_cudnn_algos.push_back(&matmul); | non_cudnn_algos.push_back(&matmul); | ||||
| @@ -34,6 +35,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
| std::vector<AlgoBase*> conv_algos; | std::vector<AlgoBase*> conv_algos; | ||||
| conv_algos.push_back(&chanwise); | conv_algos.push_back(&chanwise); | ||||
| conv_algos.push_back(&chanwise_small); | conv_algos.push_back(&chanwise_small); | ||||
| conv_algos.push_back(&depthwise_large_filter); | |||||
| conv_algos.push_back(&chanwise8x8x32); | conv_algos.push_back(&chanwise8x8x32); | ||||
| for (auto&& algo : cudnn_convs) { | for (auto&& algo : cudnn_convs) { | ||||
| conv_algos.push_back(&algo); | conv_algos.push_back(&algo); | ||||
| @@ -92,6 +94,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& algo : int8_nchw4_dotprod) { | for (auto&& algo : int8_nchw4_dotprod) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| fill_dwconv_algos(); | |||||
| all_algos.push_back(&int8_chwn4_dotprod); | all_algos.push_back(&int8_chwn4_dotprod); | ||||
| all_algos.push_back(&fallback_nchw_qs8); | all_algos.push_back(&fallback_nchw_qs8); | ||||
| for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | for (size_t i = all_algo_size; i < all_algos.size(); ++i) { | ||||
| @@ -301,6 +304,34 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() { | |||||
| using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam; | |||||
| /// preferred algo | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2}); | |||||
| f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2}); | |||||
| for (auto&& algo : f32_implicit_bmm) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| #if CUDA_VERSION >= 10010 | |||||
| /// preferred algo | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| for (auto&& algo : f16_implicit_bmm) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| #endif | |||||
| } | |||||
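// Editorial note (assumption, not stated in this diff): the ten AlgoParam fields above
// appear to use the same layout as fill_dp4a_algos() below, i.e.
//     {threadblock_m, threadblock_n, threadblock_k,
//      warp_m, warp_n, warp_k,
//      instruction_m, instruction_n, instruction_k, stage}.
// Read that way, AlgoParam{64, 128, 8, 32, 64, 8, 1, 1, 1, 2} is a 64x128x8 threadblock
// tile with a 32x64x8 warp tile, the 1x1x1 "instruction" shape of a SIMT FMA kernel, and
// a 2-stage pipeline, while the f16 entries use the 8x8x4 HMMA instruction shape.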
| void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { | void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() { | ||||
| using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; | using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; | ||||
| int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2}); | int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2}); | ||||
| @@ -22,7 +22,6 @@ | |||||
| #include "src/cuda/conv_bias/opr_impl.h" | #include "src/cuda/conv_bias/opr_impl.h" | ||||
| #include "src/cuda/convolution_helper/parameter.cuh" | #include "src/cuda/convolution_helper/parameter.cuh" | ||||
| #include "src/cuda/cudnn_wrapper.h" | #include "src/cuda/cudnn_wrapper.h" | ||||
| #include "src/cuda/handle.h" | |||||
| #include <cuda.h> | #include <cuda.h> | ||||
| #include <memory> | #include <memory> | ||||
| @@ -57,6 +56,7 @@ public: | |||||
| CUDA_CUDNN_CONVBIAS, | CUDA_CUDNN_CONVBIAS, | ||||
| CUDA_CHANWISE, | CUDA_CHANWISE, | ||||
| CUDA_CHANWISE_SMALL, | CUDA_CHANWISE_SMALL, | ||||
| CUDA_DEPTHWISE_LARGE_FILTER, | |||||
| CUDA_CHANWISE_INT8X8X32, | CUDA_CHANWISE_INT8X8X32, | ||||
| CUDA_CUDNN_CONV, | CUDA_CUDNN_CONV, | ||||
| CUDA_INPLACE_MATMUL, | CUDA_INPLACE_MATMUL, | ||||
| @@ -84,7 +84,9 @@ public: | |||||
| CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8, | CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8, | ||||
| CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4, | CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4, | ||||
| CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4, | CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4, | ||||
| CUDA_FALLBACK_NCHW_INT4 | |||||
| CUDA_FALLBACK_NCHW_INT4, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, | |||||
| }; | }; | ||||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
| @@ -255,6 +257,26 @@ private: | |||||
| mutable std::string m_name; | mutable std::string m_name; | ||||
| }; | }; | ||||
| class ConvBiasForwardImpl::AlgoDepthwiseLargeFilter final : public AlgoBase { | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { | |||||
| if (m_name.empty()) { | |||||
| m_name = ConvBiasForward::algo_name<DirectParam>( | |||||
| "DEPTHWISE_LARGE_FILTER", {}); | |||||
| } | |||||
| return m_name.c_str(); | |||||
| } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER) | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| private: | |||||
| mutable std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase { | class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase { | ||||
| public: | public: | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| @@ -503,6 +525,8 @@ public: | |||||
| * +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm | * +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm | ||||
| * +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm | * +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm | ||||
| * + | * + | ||||
| * +--- AlgoFloat32NCHWImplicitBatchedGemm | |||||
| * +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm | |||||
| */ | */ | ||||
| /* | /* | ||||
| @@ -516,7 +540,13 @@ public: | |||||
| // corresponds to cutlass::conv::ConvType. we hope that algo.h does not | // corresponds to cutlass::conv::ConvType. we hope that algo.h does not | ||||
| // depend on cutlass headers | // depend on cutlass headers | ||||
| enum class ConvType { kConvolution, kBatchConvolution, kLocal, kLocalShare }; | |||||
| enum class ConvType { | |||||
| kConvolution, | |||||
| kBatchConvolution, | |||||
| kLocal, | |||||
| kLocalShare, | |||||
| kDepthwiseConvolution, | |||||
| }; | |||||
| // common parameters for operation selection | // common parameters for operation selection | ||||
| struct AlgoParam { | struct AlgoParam { | ||||
| @@ -558,7 +588,8 @@ public: | |||||
| size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, | size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, | ||||
| const void* alpha, const void* beta, const void* gamma, const void* delta, | const void* alpha, const void* beta, const void* gamma, const void* delta, | ||||
| const void* theta, const void* threshold, const void* dst_scale, | const void* theta, const void* threshold, const void* dst_scale, | ||||
| cudaStream_t stream, const void* extra_param = nullptr) const; | |||||
| cudaStream_t stream, const void* extra_param = nullptr, | |||||
| size_t groups = 1) const; | |||||
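// Editorial note: the added `groups` argument defaults to 1, so existing call sites are
// unaffected; presumably the new depthwise (kDepthwiseConvolution) algos pass the group
// count here, but that is an assumption not shown in this diff.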
| protected: | protected: | ||||
| AlgoParam m_algo_param; | AlgoParam m_algo_param; | ||||
| @@ -992,6 +1023,54 @@ private: | |||||
| }; | }; | ||||
| #endif | #endif | ||||
| class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final | |||||
| : public AlgoCutlassConvolutionBase { | |||||
| public: | |||||
| AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : AlgoCutlassConvolutionBase(algo_param) { | |||||
| m_name = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| ssprintf( | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str()), | |||||
| ConvBias::DirectParam{}); | |||||
| } | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { | |||||
| return 0; | |||||
| } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); }; | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32); | |||||
| private: | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final | |||||
| : public AlgoCutlassConvolutionBase { | |||||
| public: | |||||
| AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : AlgoCutlassConvolutionBase(algo_param) { | |||||
| m_name = ConvBias::algo_name<ConvBias::DirectParam>( | |||||
| ssprintf( | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str()), | |||||
| ConvBias::DirectParam{}); | |||||
| } | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override { | |||||
| return 0; | |||||
| } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); }; | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16); | |||||
| private: | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase { | class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase { | ||||
| public: | public: | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| @@ -1025,6 +1104,7 @@ public: | |||||
| AlgoFallbackNCHWQS8 fallback_nchw_qs8; | AlgoFallbackNCHWQS8 fallback_nchw_qs8; | ||||
| AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
| AlgoChanwiseSmall chanwise_small; | AlgoChanwiseSmall chanwise_small; | ||||
| AlgoDepthwiseLargeFilter depthwise_large_filter; | |||||
| AlgoChanwise8x8x32 chanwise8x8x32; | AlgoChanwise8x8x32 chanwise8x8x32; | ||||
| AlgoInplaceMatmul inplace_matmul; | AlgoInplaceMatmul inplace_matmul; | ||||
| AlgoMatmul matmul; | AlgoMatmul matmul; | ||||
| @@ -1048,6 +1128,8 @@ public: | |||||
| std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma; | ||||
| std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma; | ||||
| #endif | #endif | ||||
| std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm; | |||||
| std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm; | |||||
| AlgoGroupConvGeneral group; | AlgoGroupConvGeneral group; | ||||
| AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
| @@ -1063,6 +1145,7 @@ private: | |||||
| #endif | #endif | ||||
| void fill_cudnn_algos(); | void fill_cudnn_algos(); | ||||
| void fill_dp4a_algos(); | void fill_dp4a_algos(); | ||||
| void fill_dwconv_algos(); | |||||
| }; | }; | ||||
| } // namespace cuda | } // namespace cuda | ||||
| @@ -0,0 +1,179 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| namespace { | |||||
| #define DIVUP(x, y) (((x) + (y)-1) / (y)) | |||||
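// Editorial example: DIVUP(x, y) rounds the quotient up, e.g. DIVUP(19, 2) == 10, so
// DIVUP(x, y) * y rounds x up to the next multiple of y (used below to keep smem_w even).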
| enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; | |||||
| template <typename ThreadConfig_, int oh_, int ow_> | |||||
| struct OutTileConfig { | |||||
| using ThreadConfig = ThreadConfig_; | |||||
| static int const unroll_h = oh_; | |||||
| static int const unroll_w = ThreadConfig::thread_x * ow_; | |||||
| static int const unroll_size = unroll_h * unroll_w; | |||||
| static int const block_h = unroll_h * ThreadConfig::thread_y; | |||||
| static int const block_w = unroll_w; | |||||
| }; | |||||
| template <int fh_, int fw_> | |||||
| struct FilterTileConfig { | |||||
| static int const unroll_h = fh_; | |||||
| static int const unroll_w = fw_; | |||||
| static int const unroll_size = unroll_h * unroll_w; | |||||
| }; | |||||
| template <int x_, int y_> | |||||
| struct ThreadConfig { | |||||
| static int const thread_x = x_; | |||||
| static_assert((thread_x & (thread_x - 1)) == 0, "thread_x must be pow of 2!"); | |||||
| static int const thread_y = y_; | |||||
| static int const nr_threads = x_ * y_; | |||||
| }; | |||||
| template < | |||||
| typename ldg_dtype, typename ThreadConfig_, typename OutTileConfig_, | |||||
| typename FilterTileConfig_, int stride_w, int stride_h> | |||||
| struct ConvTraitInner { | |||||
| using ThreadConfig = ThreadConfig_; | |||||
| using OutTileConfig = OutTileConfig_; | |||||
| using FilterTileConfig = FilterTileConfig_; | |||||
| using CompType = ldg_dtype; | |||||
| struct SrcTileConfig { | |||||
| static int const unroll_h = | |||||
| OutTileConfig::unroll_h + FilterTileConfig::unroll_h - 1; | |||||
| static int const unroll_w = | |||||
| (OutTileConfig::unroll_w - 1) * stride_w + FilterTileConfig::unroll_w; | |||||
| static int const unroll_size = unroll_h * unroll_w; | |||||
| }; | |||||
| struct SrcTileCount { | |||||
| static int const smem_src_h = | |||||
| (OutTileConfig::block_h - 1) * stride_h + FilterTileConfig::unroll_h; | |||||
| static int const smem_buff_h = FilterTileConfig::unroll_h; | |||||
| static int const smem_load_h = smem_src_h + smem_buff_h; | |||||
| static int const smem_h = smem_load_h + smem_buff_h; | |||||
| static int const smem_w = | |||||
| DIVUP((OutTileConfig::block_w - 1) * stride_w + | |||||
| FilterTileConfig::unroll_w * ThreadConfig::thread_x, | |||||
| 2) * | |||||
| 2; | |||||
| static int const smem_size = smem_h * smem_w; | |||||
| static int const load_w = | |||||
| smem_w > ThreadConfig::nr_threads ? ThreadConfig::nr_threads : smem_w; | |||||
| static int const load_h = 1; | |||||
| static int const reg_h = 1; | |||||
| static int const reg_w = DIVUP(smem_w, load_w); | |||||
| static bool constexpr check_bounds_h = smem_h % load_h != 0; | |||||
| static bool constexpr check_bounds_w = smem_w % load_w != 0; | |||||
| }; | |||||
| struct FilterTileCount { | |||||
| static int const smem_flt_h = FilterTileConfig::unroll_h; | |||||
| static int const smem_buff_h = FilterTileConfig::unroll_h; | |||||
| static int const smem_load_h = smem_flt_h + smem_buff_h; | |||||
| static int const smem_h = smem_load_h + smem_buff_h; | |||||
| static int const smem_w = FilterTileConfig::unroll_w * ThreadConfig::thread_x; | |||||
| static int const smem_size = smem_h * smem_w; | |||||
| static int const load_w = smem_w > 32 ? 32 : smem_w; | |||||
| static int const load_h = ThreadConfig::nr_threads / load_w; | |||||
| static int const reg_h = 1; | |||||
| static int const reg_w = DIVUP(smem_w, load_w); | |||||
| static bool constexpr check_bounds_h = smem_h % load_h != 0; | |||||
| static bool constexpr check_bounds_w = smem_w % load_w != 0; | |||||
| }; | |||||
| }; | |||||
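// Editorial sketch (not part of the original patch): a worked instantiation of the traits
// above, with illustrative parameters ThreadConfig<4, 32>, FilterTileConfig<3, 3>,
// OutTileConfig<..., 2, 2> and stride 1x1 chosen purely for the example. The asserted
// values follow directly from the formulas in OutTileConfig / ConvTraitInner.
namespace traits_example {
using TC = ThreadConfig<4, 32>;      // 4 x 32 = 128 threads per block
using FTC = FilterTileConfig<3, 3>;  // 3x3 filter tile per iteration
using OTC = OutTileConfig<TC, 2, 2>;
using Trait = ConvTraitInner<float, TC, OTC, FTC, 1, 1>;
static_assert(OTC::unroll_w == 8, "unroll_w = thread_x * ow_ = 4 * 2");
static_assert(OTC::block_h == 64, "block_h = unroll_h * thread_y = 2 * 32");
static_assert(Trait::SrcTileConfig::unroll_h == 4, "oh + fh - 1 = 2 + 3 - 1");
static_assert(Trait::SrcTileConfig::unroll_w == 10, "(ow - 1) * stride_w + fw = 7 + 3");
static_assert(Trait::SrcTileCount::smem_h == 72, "(64 - 1) * 1 + 3 + 3 + 3");
static_assert(Trait::SrcTileCount::smem_w == 20, "DIVUP((8 - 1) * 1 + 3 * 4, 2) * 2");
static_assert(Trait::FilterTileCount::smem_w == 12, "fw * thread_x = 3 * 4");
}  // namespace traits_example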
| #define CHECK_AB_FWD(a, b) \ | |||||
| if (param.out_w > b * 4) { \ | |||||
| if (param.stride_h == 1 && param.stride_w == 1) { \ | |||||
| using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \ | |||||
| using ThreadConfig_ = ThreadConfig<4, 32>; \ | |||||
| using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \ | |||||
| using IConvTrait = ConvTraitInner< \ | |||||
| float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>; \ | |||||
| using SrcTileConfig = typename IConvTrait::SrcTileConfig; \ | |||||
| using SrcTileCount = typename IConvTrait::SrcTileCount; \ | |||||
| using FilterTileCount = typename IConvTrait::FilterTileCount; \ | |||||
| \ | |||||
| if (device_prop.regsPerBlock < \ | |||||
| 4 * 32 * \ | |||||
| (FilterTileConfig_::unroll_h * \ | |||||
| FilterTileConfig_::unroll_w * 2 + \ | |||||
| SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \ | |||||
| device_prop.sharedMemPerBlock < \ | |||||
| static_cast<size_t>( \ | |||||
| (SrcTileCount::smem_size + \ | |||||
| FilterTileCount::smem_size))) { \ | |||||
| return false; \ | |||||
| } \ | |||||
| return true; \ | |||||
| } else if (param.stride_h == 2 && param.stride_w == 2) { \ | |||||
| using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \ | |||||
| using ThreadConfig_ = ThreadConfig<4, 32>; \ | |||||
| using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \ | |||||
| using IConvTrait = ConvTraitInner< \ | |||||
| float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 2, 2>; \ | |||||
| using SrcTileConfig = typename IConvTrait::SrcTileConfig; \ | |||||
| using SrcTileCount = typename IConvTrait::SrcTileCount; \ | |||||
| using FilterTileCount = typename IConvTrait::FilterTileCount; \ | |||||
| \ | |||||
| if (device_prop.regsPerBlock < \ | |||||
| 4 * 32 * \ | |||||
| (FilterTileConfig_::unroll_h * \ | |||||
| FilterTileConfig_::unroll_w * 2 + \ | |||||
| SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \ | |||||
| device_prop.sharedMemPerBlock < \ | |||||
| static_cast<size_t>( \ | |||||
| (SrcTileCount::smem_size + \ | |||||
| FilterTileCount::smem_size))) { \ | |||||
| return false; \ | |||||
| } \ | |||||
| return true; \ | |||||
| } \ | |||||
| } | |||||
| #define CHECK_AB_BWD(a, b) \ | |||||
| if (param.out_w > b * 4) { \ | |||||
| using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>; \ | |||||
| using ThreadConfig_ = ThreadConfig<4, 32>; \ | |||||
| using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>; \ | |||||
| using IConvTrait = ConvTraitInner< \ | |||||
| float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>; \ | |||||
| using SrcTileConfig = typename IConvTrait::SrcTileConfig; \ | |||||
| using SrcTileCount = typename IConvTrait::SrcTileCount; \ | |||||
| using FilterTileCount = typename IConvTrait::FilterTileCount; \ | |||||
| \ | |||||
| if (device_prop.regsPerBlock < \ | |||||
| 4 * 32 * \ | |||||
| (FilterTileConfig_::unroll_h * \ | |||||
| FilterTileConfig_::unroll_w * 2 + \ | |||||
| SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \ | |||||
| device_prop.sharedMemPerBlock < \ | |||||
| static_cast<size_t>( \ | |||||
| (SrcTileCount::smem_size + FilterTileCount::smem_size))) { \ | |||||
| return false; \ | |||||
| } \ | |||||
| return true; \ | |||||
| } | |||||
| #define CHECK_A(a, cb) \ | |||||
| if (param.flt_w > a * 4) { \ | |||||
| CHECK_AB_##cb( \ | |||||
| a, \ | |||||
| 15) else CHECK_AB_##cb(a, 14) else CHECK_AB_##cb(a, 13) else CHECK_AB_##cb(a, 12) else CHECK_AB_##cb(a, 11) else CHECK_AB_##cb(a, 10) else CHECK_AB_##cb(a, 9) else CHECK_AB_##cb(a, 8) else CHECK_AB_##cb(a, 7) else CHECK_AB_##cb(a, 6) else CHECK_AB_##cb(a, 5) else CHECK_AB_##cb(a, 4) else CHECK_AB_##cb(a, 3) else CHECK_AB_##cb(a, 2) else CHECK_AB_##cb(a, 1) else CHECK_AB_##cb(a, 0) \ | |||||
| } | |||||
| #define CHECK(cb) \ | |||||
| CHECK_A(6, cb) \ | |||||
| else CHECK_A(4, cb) else CHECK_A(2, cb) else CHECK_A(0, cb) | |||||
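// Editorial note on the dispatch above: CHECK(FWD) / CHECK(BWD) expand into an if/else
// cascade over the problem size. CHECK_A picks the filter-tile width from param.flt_w
// (e.g. flt_w = 31 satisfies flt_w > 6 * 4, giving FilterTileConfig<unroll_fh, 8>), the
// CHECK_AB_* branch picks the output-tile width from param.out_w (e.g. out_w = 50 first
// satisfies out_w > 12 * 4, giving OutTileConfig<..., unroll_oh, 13>), and the selected
// branch returns true only if the implied register and shared-memory budgets fit within
// device_prop.regsPerBlock and device_prop.sharedMemPerBlock. The example numbers are
// illustrative only.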
| } // namespace | |||||
| @@ -0,0 +1,914 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #pragma once | |||||
| #include "depthwise_large_filter.cuh" | |||||
| #include "src/cuda/cuda_shfl_compat.cuh" | |||||
| namespace { | |||||
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| struct Global2SharedMem { | |||||
| using TileCount = TileCount_; | |||||
| using ThreadConfig = ThreadConfig_; | |||||
| T reg[TileCount::reg_w]; | |||||
| const int tidx = threadIdx.x; | |||||
| const int tidy = threadIdx.y; | |||||
| const int tid = tidy * ThreadConfig::thread_x + tidx; | |||||
| const int gl_load_y = tid / TileCount::load_w; | |||||
| const int gl_load_x = tid - gl_load_y * TileCount::load_w; | |||||
| const bool is_fwd = (kDirection == DIRECTION_FORWARD); | |||||
| int w_offset; | |||||
| T* smem; | |||||
| int stride; | |||||
| int start_h, start_w, bound_h, bound_w, ring_smem_h, ring_src_h; | |||||
| // just used in backward src data | |||||
| int stride_h, stride_w; | |||||
| const T* g_ptr; | |||||
| __device__ __forceinline__ Global2SharedMem( | |||||
| T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_, | |||||
| int stride_w_); | |||||
| __device__ __forceinline__ void first_copy(); | |||||
| __device__ __forceinline__ void copy(); | |||||
| __device__ __forceinline__ void commit(); | |||||
| __device__ __forceinline__ void iter_forward(); | |||||
| __device__ __forceinline__ T* sh_ptr(int y, int x) { | |||||
| return &smem[y * TileCount::smem_w + x]; | |||||
| } | |||||
| __device__ __forceinline__ T* sh_ptr_as_copy_t(int y, int x) { | |||||
| return reinterpret_cast<T*>(sh_ptr(y, x)); | |||||
| } | |||||
| }; | |||||
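// Editorial sketch (assumption, inferred only from the declarations above): a kernel is
// expected to drive this visitor roughly as
//
//     gl2sh.first_copy();           // prime the first smem_load_h rows of the tile
//     __syncthreads();
//     for (;;) {                    // per output-row iteration
//         gl2sh.copy();             // stage the next global-memory row into registers
//         /* compute on the rows already resident in shared memory */
//         __syncthreads();
//         gl2sh.commit();           // write the staged row at ring_smem_h
//         gl2sh.iter_forward();     // advance ring_src_h and ring_smem_h (mod smem_h)
//         __syncthreads();
//     }
//
// The exact interleaving in the real kernels below may differ; this only illustrates how
// first_copy / copy / commit / iter_forward cooperate around the shared-memory ring buffer.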
| template < | |||||
| typename ldg_dtype, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename OutTileConfig_, typename FilterTileConfig_, int stride_w, int stride_h> | |||||
| struct ConvTrait { | |||||
| using ThreadConfig = ThreadConfig_; | |||||
| using OutTileConfig = OutTileConfig_; | |||||
| using FilterTileConfig = FilterTileConfig_; | |||||
| using CompType = ldg_dtype; | |||||
| using CI = ConvTraitInner< | |||||
| ldg_dtype, ThreadConfig_, OutTileConfig_, FilterTileConfig_, stride_w, | |||||
| stride_h>; | |||||
| using SrcTileConfig = typename CI::SrcTileConfig; | |||||
| using SrcTileCount = typename CI::SrcTileCount; | |||||
| using FilterTileCount = typename CI::FilterTileCount; | |||||
| using SrcGlobal2ShareVisitor = Global2SharedMem< | |||||
| CompType, DepthwiseConv2dDirection::DIRECTION_FORWARD, ThreadConfig, | |||||
| SrcTileCount>; | |||||
| using FilterGlobal2ShareVisitor = | |||||
| Global2SharedMem<CompType, kDirection, ThreadConfig, FilterTileCount>; | |||||
| }; | |||||
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| __device__ __forceinline__ | |||||
| Global2SharedMem<T, kDirection, ThreadConfig_, TileCount_>::Global2SharedMem( | |||||
| T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_, | |||||
| int stride_w_) | |||||
| : smem(smem_), | |||||
| stride(stride_), | |||||
| start_h(s_h), | |||||
| start_w(s_w), | |||||
| bound_h(b_h), | |||||
| bound_w(b_w), | |||||
| ring_smem_h(TileCount::smem_load_h), | |||||
| stride_h(stride_h_), | |||||
| stride_w(stride_w_) { | |||||
| if (is_fwd) { | |||||
| ring_src_h = s_h + TileCount::smem_load_h; | |||||
| w_offset = 0; | |||||
| } else { | |||||
| ring_src_h = s_h - 1; | |||||
| w_offset = TileCount::smem_w - b_w; | |||||
| // stride_h and stride_w just used in backward src data. | |||||
| stride_h = stride_w = 1; | |||||
| } | |||||
| } | |||||
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| __device__ __forceinline__ void Global2SharedMem< | |||||
| T, kDirection, ThreadConfig_, TileCount_>::first_copy() { | |||||
| static int const load_w = TileCount::smem_w > 32 ? 32 : TileCount::smem_w; | |||||
| static int const load_h = ThreadConfig::nr_threads / load_w; | |||||
| static int const h_per_thread = DIVUP(TileCount::smem_load_h, load_h); | |||||
| static int const w_per_thread = DIVUP(TileCount::smem_w, load_w); | |||||
| static bool constexpr check_bounds_h = TileCount::smem_load_h % load_h != 0; | |||||
| static bool constexpr check_bounds_w = TileCount::smem_w % load_w != 0; | |||||
| const int y_base_idx = tid / load_w; | |||||
| const int x_base_idx = tid - y_base_idx * load_w; | |||||
| #pragma unroll | |||||
| for (int i = 0; i < h_per_thread; ++i) { | |||||
| int smem_h_idx = y_base_idx + i * load_h; | |||||
| int src_h_idx; | |||||
| if (is_fwd) { | |||||
| src_h_idx = start_h + smem_h_idx; | |||||
| } else { | |||||
| src_h_idx = start_h + TileCount::smem_load_h - smem_h_idx - 1; | |||||
| } | |||||
| if (check_bounds_h && smem_h_idx >= TileCount::smem_load_h) | |||||
| continue; | |||||
| #pragma unroll | |||||
| for (int j = 0; j < w_per_thread; ++j) { | |||||
| int smem_w_idx = x_base_idx + j * load_w; | |||||
| int src_w_idx; | |||||
| if (is_fwd) { | |||||
| src_w_idx = start_w + smem_w_idx; | |||||
| } else { | |||||
| src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1; | |||||
| } | |||||
| if (check_bounds_w && smem_w_idx >= TileCount::smem_w) | |||||
| continue; | |||||
| T val = 0.0f; | |||||
| if (src_h_idx >= 0 && src_h_idx < bound_h && src_w_idx >= 0 && | |||||
| src_w_idx < bound_w && | |||||
| ((is_fwd && src_h_idx % stride_h == 0 && src_w_idx % stride_w == 0) || | |||||
| (!is_fwd && TileCount::smem_load_h - smem_h_idx - 1 >= 0 && | |||||
| TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) { | |||||
| val = g_ptr[src_h_idx / stride_h * stride + src_w_idx / stride_w]; | |||||
| } | |||||
| *(sh_ptr_as_copy_t(smem_h_idx, smem_w_idx)) = val; | |||||
| } | |||||
| } | |||||
| } | |||||
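| // copy() reads the next source row (ring_src_h) into registers so the global load | |||||
| // can overlap with computation; for the backward direction the column index is | |||||
| // mirrored and rows are visited in reverse order, which realizes the 180° filter | |||||
| // rotation. | |||||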
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| __device__ __forceinline__ void Global2SharedMem< | |||||
| T, kDirection, ThreadConfig_, TileCount_>::copy() { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < TileCount::reg_w; ++j) { | |||||
| int smem_w_idx = gl_load_x + j * TileCount::load_w; | |||||
| int src_w_idx; | |||||
| if (is_fwd) { | |||||
| src_w_idx = start_w + smem_w_idx; | |||||
| } else { | |||||
| src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1; | |||||
| } | |||||
| if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w) | |||||
| continue; | |||||
| T val = 0.0f; | |||||
| if (ring_src_h >= 0 && ring_src_h < bound_h && src_w_idx >= 0 && | |||||
| src_w_idx < bound_w && | |||||
| ((is_fwd && ring_src_h % stride_h == 0 && src_w_idx % stride_w == 0) || | |||||
| (!is_fwd && TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) { | |||||
| val = g_ptr[ring_src_h / stride_h * stride + src_w_idx / stride_w]; | |||||
| } | |||||
| reg[j] = val; | |||||
| } | |||||
| } | |||||
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| __device__ __forceinline__ void Global2SharedMem< | |||||
| T, kDirection, ThreadConfig_, TileCount_>::commit() { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < TileCount::reg_w; ++j) { | |||||
| int smem_w_idx = gl_load_x + j * TileCount::load_w; | |||||
| if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w) | |||||
| continue; | |||||
| *(sh_ptr_as_copy_t(ring_smem_h, smem_w_idx)) = reg[j]; | |||||
| } | |||||
| } | |||||
| template < | |||||
| typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_, | |||||
| typename TileCount_> | |||||
| __device__ __forceinline__ void Global2SharedMem< | |||||
| T, kDirection, ThreadConfig_, TileCount_>::iter_forward() { | |||||
| if (is_fwd) { | |||||
| ring_src_h++; | |||||
| } else { | |||||
| ring_src_h--; | |||||
| } | |||||
| ring_smem_h = (ring_smem_h + 1) % TileCount::smem_h; | |||||
| } | |||||
| // CUDA kernel to compute the depthwise convolution forward pass in NCHW format, | |||||
| // tailored for small images up to 32x32. The depth multiplier must be 1; strides | |||||
| // of 1 and 2 are supported. Padding must be 'SAME', which allows the index | |||||
| // computation to be reused. Only | |||||
| // use this kernel if CanLaunchDepthwiseConv2dGPU(args) returns true. | |||||
| // Tiles of the input and filter tensors are loaded into shared memory before | |||||
| // performing the convolution. Each thread handles two elements per iteration, | |||||
| // one each in the lower and upper half of a tile. | |||||
| // Backprop input direction is the same as forward direction with the filter | |||||
| // rotated by 180°. | |||||
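| // The fp16 kernel below packs neighbouring filter/source values into __half2 and | |||||
| // accumulates with fma2. reg_flt keeps two copies of each filter row, one aligned | |||||
| // and one shifted by a single element, so both even and odd output columns can | |||||
| // reuse the same packed source loads. | |||||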
| #if CUDA_VERSION >= 9000 | |||||
| template <typename ConvTrait, DepthwiseConv2dDirection kDirection> | |||||
| __global__ void DepthwiseConv2dGPUKernelNCHW( | |||||
| const Param param, const __half* input, const __half* filter, __half* output) { | |||||
| using T = __half; | |||||
| using T2 = __half2; | |||||
| using ThreadConfig = typename ConvTrait::ThreadConfig; | |||||
| using SrcTileConfig = typename ConvTrait::SrcTileConfig; | |||||
| using FilterTileConfig = typename ConvTrait::FilterTileConfig; | |||||
| using OutTileConfig = typename ConvTrait::OutTileConfig; | |||||
| using SrcTileCount = typename ConvTrait::SrcTileCount; | |||||
| using FilterTileCount = typename ConvTrait::FilterTileCount; | |||||
| using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor; | |||||
| using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor; | |||||
| const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD); | |||||
| int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z, | |||||
| off_oh = threadIdx.y, off_ow = threadIdx.x; | |||||
| constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2; | |||||
| constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2; | |||||
| constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2; | |||||
| extern __shared__ __align__(8) unsigned char smem[]; | |||||
| static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); | |||||
| T* smem_src = reinterpret_cast<T*>(smem); | |||||
| T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]); | |||||
| int stride_h = is_fwd ? param.stride_h : 1; | |||||
| int stride_w = is_fwd ? param.stride_w : 1; | |||||
| int off_ichannel = off_ochannel / param.chl_mul, | |||||
| off_fchannel = off_ichannel % param.src_chl, | |||||
| out_start_h = off_obh * OutTileConfig::block_h, | |||||
| out_start_w = off_obw * OutTileConfig::block_w, | |||||
| src_start_h = out_start_h * stride_h - param.pad_h, | |||||
| src_start_w = out_start_w * stride_w - param.pad_w, | |||||
| out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h; | |||||
| T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w; | |||||
| T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w; | |||||
| T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w; | |||||
| SrcGlobal2ShareVisitor gl2sh_src = { | |||||
| smem_src, | |||||
| param.src_w, | |||||
| is_fwd ? src_start_h | |||||
| : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h - | |||||
| param.src_h * param.stride_h / 2), | |||||
| is_fwd ? src_start_w | |||||
| : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w - | |||||
| param.src_w * param.stride_w / 2), | |||||
| is_fwd ? param.src_h : param.src_h * param.stride_h, | |||||
| is_fwd ? param.src_w : param.src_w * param.stride_w, | |||||
| is_fwd ? 1 : param.stride_h, | |||||
| is_fwd ? 1 : param.stride_w}; | |||||
| FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt, | |||||
| param.flt_w, | |||||
| is_fwd ? 0 : param.flt_h - 2, | |||||
| 0, | |||||
| param.flt_h, | |||||
| param.flt_w, | |||||
| 1, | |||||
| 1}; | |||||
| gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w; | |||||
| gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w; | |||||
| gl2sh_src.first_copy(); | |||||
| gl2sh_flt.first_copy(); | |||||
| __syncthreads(); | |||||
| T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w], | |||||
| reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w]; | |||||
| T2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}}; | |||||
| for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) { | |||||
| gl2sh_src.copy(); | |||||
| gl2sh_flt.copy(); | |||||
| #pragma unroll | |||||
| for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) { | |||||
| #pragma unroll | |||||
| for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) { | |||||
| int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h * | |||||
| SrcTileCount::smem_w + | |||||
| s_w * 2; | |||||
| reg_src[s_h * t2_src_unroll_w + s_w] = | |||||
| *reinterpret_cast<T2*>(smem_src_ptr + src_offset); | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) { | |||||
| #pragma unroll | |||||
| for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) { | |||||
| int flt_offset = | |||||
| (fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w + | |||||
| f_w * 2; | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + f_w] = | |||||
| *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset); | |||||
| if (f_w > 0) { | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + f_w] = | |||||
| T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y, | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + f_w].x}; | |||||
| } else { | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + f_w] = | |||||
| T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x}; | |||||
| } | |||||
| } | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0}; | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = | |||||
| T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0}; | |||||
| } | |||||
| #pragma unroll | |||||
| for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) { | |||||
| #pragma unroll | |||||
| for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) { | |||||
| #pragma unroll | |||||
| for (int fw = 0; fw < t2_flt_unroll_w; ++fw) { | |||||
| #pragma unroll | |||||
| for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) { | |||||
| sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2( | |||||
| reg_flt[ow * stride_w % 2] | |||||
| [inner_fh * t2_flt_unroll_w + fw], | |||||
| reg_src[(inner_fh + oh) * t2_src_unroll_w + fw + | |||||
| ow * stride_w / 2], | |||||
| sum[oh * t2_out_unroll_w + ow]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| __syncthreads(); | |||||
| gl2sh_src.commit(); | |||||
| gl2sh_flt.commit(); | |||||
| gl2sh_src.iter_forward(); | |||||
| gl2sh_flt.iter_forward(); | |||||
| __syncthreads(); | |||||
| } | |||||
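| // Butterfly (XOR) shuffle reduction across the threadIdx.x lanes that cooperated | |||||
| // on this output tile; thread x == 0 then writes out the reduced sums. | |||||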
| for (int o = 0; o < OutTileConfig::unroll_size; ++o) { | |||||
| for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) { | |||||
| sum[o] = megdnn::cuda::hadd2(sum[o], __shfl_xor(sum[o], i, 32)); | |||||
| } | |||||
| } | |||||
| if (threadIdx.x == 0) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < OutTileConfig::unroll_h; ++i) { | |||||
| int out_h_idx = out_base_h_idx + i; | |||||
| if (out_h_idx < param.out_h) { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < OutTileConfig::unroll_w; ++j) { | |||||
| int out_w_idx = out_start_w + j; | |||||
| if (out_w_idx >= param.out_w) | |||||
| return; | |||||
| out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half( | |||||
| __half2float(sum[i * OutTileConfig::unroll_w + j].x) + | |||||
| __half2float(sum[i * OutTileConfig::unroll_w + j].y)); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| template <typename ConvTrait, DepthwiseConv2dDirection kDirection> | |||||
| __global__ void DepthwiseConv2dGPUKernelNCHWC32( | |||||
| const Param param, const __half* input, const __half* filter, __half* output) { | |||||
| using T = __half; | |||||
| using T2 = __half2; | |||||
| using ThreadConfig = typename ConvTrait::ThreadConfig; | |||||
| using SrcTileConfig = typename ConvTrait::SrcTileConfig; | |||||
| using FilterTileConfig = typename ConvTrait::FilterTileConfig; | |||||
| using OutTileConfig = typename ConvTrait::OutTileConfig; | |||||
| using SrcTileCount = typename ConvTrait::SrcTileCount; | |||||
| using FilterTileCount = typename ConvTrait::FilterTileCount; | |||||
| using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor; | |||||
| using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor; | |||||
| const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD); | |||||
| int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z, | |||||
| off_oh = threadIdx.y, off_ow = threadIdx.x; | |||||
| constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2; | |||||
| constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2; | |||||
| constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2; | |||||
| extern __shared__ __align__(8) unsigned char smem[]; | |||||
| static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); | |||||
| T* smem_src = reinterpret_cast<T*>(smem); | |||||
| T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]); | |||||
| int stride_h = is_fwd ? param.stride_h : 1; | |||||
| int stride_w = is_fwd ? param.stride_w : 1; | |||||
| int off_ichannel = off_ochannel / param.chl_mul, | |||||
| off_fchannel = off_ichannel % param.src_chl, | |||||
| out_start_h = off_obh * OutTileConfig::block_h, | |||||
| out_start_w = off_obw * OutTileConfig::block_w, | |||||
| src_start_h = out_start_h * stride_h - param.pad_h, | |||||
| src_start_w = out_start_w * stride_w - param.pad_w, | |||||
| out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h; | |||||
| T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w; | |||||
| T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w; | |||||
| T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w; | |||||
| SrcGlobal2ShareVisitor gl2sh_src = { | |||||
| smem_src, | |||||
| param.src_w, | |||||
| is_fwd ? src_start_h | |||||
| : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h - | |||||
| param.src_h * param.stride_h / 2), | |||||
| is_fwd ? src_start_w | |||||
| : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w - | |||||
| param.src_w * param.stride_w / 2), | |||||
| is_fwd ? param.src_h : param.src_h * param.stride_h, | |||||
| is_fwd ? param.src_w : param.src_w * param.stride_w, | |||||
| is_fwd ? 1 : param.stride_h, | |||||
| is_fwd ? 1 : param.stride_w}; | |||||
| FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt, | |||||
| param.flt_w, | |||||
| is_fwd ? 0 : param.flt_h - 2, | |||||
| 0, | |||||
| param.flt_h, | |||||
| param.flt_w, | |||||
| 1, | |||||
| 1}; | |||||
| gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w; | |||||
| gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w; | |||||
| gl2sh_src.first_copy(); | |||||
| gl2sh_flt.first_copy(); | |||||
| __syncthreads(); | |||||
| T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w], | |||||
| reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w]; | |||||
| float2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}}; | |||||
| for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) { | |||||
| gl2sh_src.copy(); | |||||
| gl2sh_flt.copy(); | |||||
| #pragma unroll | |||||
| for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) { | |||||
| #pragma unroll | |||||
| for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) { | |||||
| int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h * | |||||
| SrcTileCount::smem_w + | |||||
| s_w * 2; | |||||
| reg_src[s_h * t2_src_unroll_w + s_w] = | |||||
| *reinterpret_cast<T2*>(smem_src_ptr + src_offset); | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) { | |||||
| #pragma unroll | |||||
| for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) { | |||||
| int flt_offset = | |||||
| (fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w + | |||||
| f_w * 2; | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + f_w] = | |||||
| *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset); | |||||
| if (f_w > 0) { | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + f_w] = | |||||
| T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y, | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + f_w].x}; | |||||
| } else { | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + f_w] = | |||||
| T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x}; | |||||
| } | |||||
| } | |||||
| reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0}; | |||||
| reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = | |||||
| T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0}; | |||||
| } | |||||
| #pragma unroll | |||||
| for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) { | |||||
| #pragma unroll | |||||
| for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) { | |||||
| #pragma unroll | |||||
| for (int fw = 0; fw < t2_flt_unroll_w; ++fw) { | |||||
| #pragma unroll | |||||
| for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) { | |||||
| sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2( | |||||
| reg_flt[ow * stride_w % 2] | |||||
| [inner_fh * t2_flt_unroll_w + fw], | |||||
| reg_src[(inner_fh + oh) * t2_src_unroll_w + fw + | |||||
| ow * stride_w / 2], | |||||
| sum[oh * t2_out_unroll_w + ow]); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| __syncthreads(); | |||||
| gl2sh_src.commit(); | |||||
| gl2sh_flt.commit(); | |||||
| gl2sh_src.iter_forward(); | |||||
| gl2sh_flt.iter_forward(); | |||||
| __syncthreads(); | |||||
| } | |||||
| for (int o = 0; o < OutTileConfig::unroll_size; ++o) { | |||||
| for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) { | |||||
| sum[o].x += __shfl_xor(sum[o].x, i, 32); | |||||
| sum[o].y += __shfl_xor(sum[o].y, i, 32); | |||||
| } | |||||
| } | |||||
| if (threadIdx.x == 0) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < OutTileConfig::unroll_h; ++i) { | |||||
| int out_h_idx = out_base_h_idx + i; | |||||
| if (out_h_idx < param.out_h) { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < OutTileConfig::unroll_w; ++j) { | |||||
| int out_w_idx = out_start_w + j; | |||||
| if (out_w_idx >= param.out_w) | |||||
| return; | |||||
| out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half( | |||||
| sum[i * OutTileConfig::unroll_w + j].x + | |||||
| sum[i * OutTileConfig::unroll_w + j].y); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| #endif | |||||
| template <typename ConvTrait, DepthwiseConv2dDirection kDirection> | |||||
| __global__ void DepthwiseConv2dGPUKernelNCHW( | |||||
| const Param param, const float* input, const float* filter, float* output) { | |||||
| using T = float; | |||||
| using T2 = float2; | |||||
| using ThreadConfig = typename ConvTrait::ThreadConfig; | |||||
| using SrcTileConfig = typename ConvTrait::SrcTileConfig; | |||||
| using FilterTileConfig = typename ConvTrait::FilterTileConfig; | |||||
| using OutTileConfig = typename ConvTrait::OutTileConfig; | |||||
| using SrcTileCount = typename ConvTrait::SrcTileCount; | |||||
| using FilterTileCount = typename ConvTrait::FilterTileCount; | |||||
| using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor; | |||||
| using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor; | |||||
| const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD); | |||||
| int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z, | |||||
| off_oh = threadIdx.y, off_ow = threadIdx.x; | |||||
| extern __shared__ __align__(8) unsigned char smem[]; | |||||
| static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); | |||||
| T* smem_src = reinterpret_cast<T*>(smem); | |||||
| T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]); | |||||
| int stride_h = is_fwd ? param.stride_h : 1; | |||||
| int stride_w = is_fwd ? param.stride_w : 1; | |||||
| int off_ichannel = off_ochannel / param.chl_mul, | |||||
| off_fchannel = off_ichannel % param.src_chl, | |||||
| out_start_h = off_obh * OutTileConfig::block_h, | |||||
| out_start_w = off_obw * OutTileConfig::block_w, | |||||
| src_start_h = out_start_h * stride_h - param.pad_h, | |||||
| src_start_w = out_start_w * stride_w - param.pad_w, | |||||
| out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h; | |||||
| T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w; | |||||
| T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w; | |||||
| T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w; | |||||
| SrcGlobal2ShareVisitor gl2sh_src = { | |||||
| smem_src, | |||||
| param.src_w, | |||||
| is_fwd ? src_start_h | |||||
| : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h - | |||||
| param.src_h * param.stride_h / 2), | |||||
| is_fwd ? src_start_w | |||||
| : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w - | |||||
| param.src_w * param.stride_w / 2), | |||||
| is_fwd ? param.src_h : param.src_h * param.stride_h, | |||||
| is_fwd ? param.src_w : param.src_w * param.stride_w, | |||||
| is_fwd ? 1 : param.stride_h, | |||||
| is_fwd ? 1 : param.stride_w}; | |||||
| FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt, | |||||
| param.flt_w, | |||||
| is_fwd ? 0 : param.flt_h - 2, | |||||
| 0, | |||||
| param.flt_h, | |||||
| param.flt_w, | |||||
| 1, | |||||
| 1}; | |||||
| gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w; | |||||
| gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w; | |||||
| gl2sh_src.first_copy(); | |||||
| gl2sh_flt.first_copy(); | |||||
| __syncthreads(); | |||||
| T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w], | |||||
| reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w]; | |||||
| T sum[OutTileConfig::unroll_size] = {0.0}; | |||||
| for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) { | |||||
| gl2sh_src.copy(); | |||||
| gl2sh_flt.copy(); | |||||
| #pragma unroll | |||||
| for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) { | |||||
| #pragma unroll | |||||
| for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) { | |||||
| reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr | |||||
| [(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h * | |||||
| SrcTileCount::smem_w + | |||||
| s_w]; | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) { | |||||
| #pragma unroll | |||||
| for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) { | |||||
| reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr | |||||
| [(fh + f_h) % FilterTileCount::smem_h * | |||||
| FilterTileCount::smem_w + | |||||
| f_w]; | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) { | |||||
| #pragma unroll | |||||
| for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) { | |||||
| #pragma unroll | |||||
| for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) { | |||||
| #pragma unroll | |||||
| for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) { | |||||
| sum[oh * OutTileConfig::unroll_w + ow] += | |||||
| reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] * | |||||
| reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw + | |||||
| ow * stride_w]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| __syncthreads(); | |||||
| gl2sh_src.commit(); | |||||
| gl2sh_flt.commit(); | |||||
| gl2sh_src.iter_forward(); | |||||
| gl2sh_flt.iter_forward(); | |||||
| __syncthreads(); | |||||
| } | |||||
| for (int o = 0; o < OutTileConfig::unroll_size; ++o) { | |||||
| for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) { | |||||
| sum[o] += __shfl_xor(sum[o], i, 32); | |||||
| } | |||||
| } | |||||
| if (threadIdx.x == 0) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < OutTileConfig::unroll_h; ++i) { | |||||
| int out_h_idx = out_base_h_idx + i; | |||||
| if (out_h_idx < param.out_h) { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < OutTileConfig::unroll_w; ++j) { | |||||
| int out_w_idx = out_start_w + j; | |||||
| if (out_w_idx >= param.out_w) | |||||
| return; | |||||
| out_base_ptr[out_h_idx * param.out_w + out_w_idx] = | |||||
| sum[i * OutTileConfig::unroll_w + j]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| template <typename ConvTrait, DepthwiseConv2dDirection kDirection> | |||||
| __global__ void DepthwiseConv2dGPUKernelNCHWC32( | |||||
| const Param param, const float* input, const float* filter, float* output) { | |||||
| using T = float; | |||||
| using T2 = float2; | |||||
| using ThreadConfig = typename ConvTrait::ThreadConfig; | |||||
| using SrcTileConfig = typename ConvTrait::SrcTileConfig; | |||||
| using FilterTileConfig = typename ConvTrait::FilterTileConfig; | |||||
| using OutTileConfig = typename ConvTrait::OutTileConfig; | |||||
| using SrcTileCount = typename ConvTrait::SrcTileCount; | |||||
| using FilterTileCount = typename ConvTrait::FilterTileCount; | |||||
| using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor; | |||||
| using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor; | |||||
| const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD); | |||||
| int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z, | |||||
| off_oh = threadIdx.y, off_ow = threadIdx.x; | |||||
| extern __shared__ __align__(8) unsigned char smem[]; | |||||
| static_assert(sizeof(T) <= 8, "Insufficient alignment detected"); | |||||
| T* smem_src = reinterpret_cast<T*>(smem); | |||||
| T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]); | |||||
| int stride_h = is_fwd ? param.stride_h : 1; | |||||
| int stride_w = is_fwd ? param.stride_w : 1; | |||||
| int off_ichannel = off_ochannel / param.chl_mul, | |||||
| off_fchannel = off_ichannel % param.src_chl, | |||||
| out_start_h = off_obh * OutTileConfig::block_h, | |||||
| out_start_w = off_obw * OutTileConfig::block_w, | |||||
| src_start_h = out_start_h * stride_h - param.pad_h, | |||||
| src_start_w = out_start_w * stride_w - param.pad_w, | |||||
| out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h; | |||||
| T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w; | |||||
| T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w; | |||||
| T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w; | |||||
| SrcGlobal2ShareVisitor gl2sh_src = { | |||||
| smem_src, | |||||
| param.src_w, | |||||
| is_fwd ? src_start_h | |||||
| : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h - | |||||
| param.src_h * param.stride_h / 2), | |||||
| is_fwd ? src_start_w | |||||
| : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w - | |||||
| param.src_w * param.stride_w / 2), | |||||
| is_fwd ? param.src_h : param.src_h * param.stride_h, | |||||
| is_fwd ? param.src_w : param.src_w * param.stride_w, | |||||
| is_fwd ? 1 : param.stride_h, | |||||
| is_fwd ? 1 : param.stride_w}; | |||||
| FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt, | |||||
| param.flt_w, | |||||
| is_fwd ? 0 : param.flt_h - 2, | |||||
| 0, | |||||
| param.flt_h, | |||||
| param.flt_w, | |||||
| 1, | |||||
| 1}; | |||||
| gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w; | |||||
| gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w; | |||||
| gl2sh_src.first_copy(); | |||||
| gl2sh_flt.first_copy(); | |||||
| __syncthreads(); | |||||
| T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w], | |||||
| reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w]; | |||||
| T sum[OutTileConfig::unroll_size] = {0.0}; | |||||
| for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) { | |||||
| gl2sh_src.copy(); | |||||
| gl2sh_flt.copy(); | |||||
| #pragma unroll | |||||
| for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) { | |||||
| #pragma unroll | |||||
| for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) { | |||||
| reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr | |||||
| [(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h * | |||||
| SrcTileCount::smem_w + | |||||
| s_w]; | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) { | |||||
| #pragma unroll | |||||
| for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) { | |||||
| reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr | |||||
| [(fh + f_h) % FilterTileCount::smem_h * | |||||
| FilterTileCount::smem_w + | |||||
| f_w]; | |||||
| } | |||||
| } | |||||
| #pragma unroll | |||||
| for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) { | |||||
| #pragma unroll | |||||
| for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) { | |||||
| #pragma unroll | |||||
| for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) { | |||||
| #pragma unroll | |||||
| for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) { | |||||
| sum[oh * OutTileConfig::unroll_w + ow] += | |||||
| reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] * | |||||
| reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw + | |||||
| ow * stride_w]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| __syncthreads(); | |||||
| gl2sh_src.commit(); | |||||
| gl2sh_flt.commit(); | |||||
| gl2sh_src.iter_forward(); | |||||
| gl2sh_flt.iter_forward(); | |||||
| __syncthreads(); | |||||
| } | |||||
| for (int o = 0; o < OutTileConfig::unroll_size; ++o) { | |||||
| for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) { | |||||
| sum[o] += __shfl_xor(sum[o], i, 32); | |||||
| } | |||||
| } | |||||
| if (threadIdx.x == 0) { | |||||
| #pragma unroll | |||||
| for (int i = 0; i < OutTileConfig::unroll_h; ++i) { | |||||
| int out_h_idx = out_base_h_idx + i; | |||||
| if (out_h_idx < param.out_h) { | |||||
| #pragma unroll | |||||
| for (int j = 0; j < OutTileConfig::unroll_w; ++j) { | |||||
| int out_w_idx = out_start_w + j; | |||||
| if (out_w_idx >= param.out_w) | |||||
| return; | |||||
| out_base_ptr[out_h_idx * param.out_w + out_w_idx] = | |||||
| sum[i * OutTileConfig::unroll_w + j]; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
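| // Launcher: each 128-thread block (ThreadConfig<4, 32>) computes one output tile of | |||||
| // one channel; grid.x covers batch * channels, grid.y/grid.z tile the output width | |||||
| // and height, and the dynamic shared memory holds one source tile plus one filter | |||||
| // tile. | |||||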
| template < | |||||
| typename T, typename T2, DepthwiseConv2dDirection kDirection, int unroll_fw, | |||||
| int unroll_ow, int stride> | |||||
| void LaunchDepthwiseConv2dGPU( | |||||
| const Param& param, const T* input, const T* filter, T* output, | |||||
| cudaStream_t stream) { | |||||
| static int const unroll_oh = 1, unroll_fh = 1; | |||||
| using FilterTileConfig = FilterTileConfig<unroll_fh, unroll_fw>; | |||||
| using ThreadConfig = ThreadConfig<4, 32>; | |||||
| using OutTileConfig = OutTileConfig<ThreadConfig, unroll_oh, unroll_ow>; | |||||
| using IConvTrait = ConvTrait< | |||||
| T, kDirection, ThreadConfig, OutTileConfig, FilterTileConfig, stride, | |||||
| stride>; | |||||
| using SrcTileCount = typename IConvTrait::SrcTileCount; | |||||
| using FilterTileCount = typename IConvTrait::FilterTileCount; | |||||
| dim3 block(ThreadConfig::thread_x, ThreadConfig::thread_y); | |||||
| dim3 grid; | |||||
| grid.x = param.batch * param.src_chl * param.chl_mul; | |||||
| grid.y = DIVUP(param.out_w, OutTileConfig::block_w); | |||||
| grid.z = DIVUP(param.out_h, OutTileConfig::block_h); | |||||
| const int shared_storage = | |||||
| (SrcTileCount::smem_size + FilterTileCount::smem_size) * sizeof(T); | |||||
| void (*kernel)(const Param, const T*, const T*, T*); | |||||
| if (param.is_compute_deafult) { | |||||
| kernel = DepthwiseConv2dGPUKernelNCHW<IConvTrait, kDirection>; | |||||
| } else { | |||||
| kernel = DepthwiseConv2dGPUKernelNCHWC32<IConvTrait, kDirection>; | |||||
| } | |||||
| kernel<<<grid, block, shared_storage, stream>>>(param, input, filter, output); | |||||
| after_kernel_launch(); | |||||
| } | |||||
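| // The INSTANCE* macros form an if/else-if chain that dispatches on the runtime | |||||
| // filter width and output width to a kernel instantiation with matching | |||||
| // compile-time unroll factors (unroll_fw = a + 2, unroll_ow = b + 1) and a stride | |||||
| // of 1 or 2. | |||||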
| #define INSTANCE_AB(type1, type2, a, b, direction) \ | |||||
| if (param.out_w > b * 4) { \ | |||||
| if (direction == DepthwiseConv2dDirection::DIRECTION_BACKWARD || \ | |||||
| (param.stride_h == 1 && param.stride_w == 1)) { \ | |||||
| LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 1>( \ | |||||
| param, src, flt, dst, stream); \ | |||||
| } else if (param.stride_h == 2 && param.stride_w == 2) { \ | |||||
| LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 2>( \ | |||||
| param, src, flt, dst, stream); \ | |||||
| } \ | |||||
| } | |||||
| #define INSTANCE_A(type1, type2, a, direction) \ | |||||
| if (param.flt_w > a * 4) { \ | |||||
| INSTANCE_AB(type1, type2, a, 15, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 14, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 13, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 12, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 11, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 10, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 9, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 8, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 7, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 6, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 5, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 4, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 3, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 2, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 1, direction) \ | |||||
| else INSTANCE_AB(type1, type2, a, 0, direction) \ | |||||
| } | |||||
| #define INSTANCE(type1, type2, direction) \ | |||||
| INSTANCE_A(type1, type2, 6, direction) \ | |||||
| else INSTANCE_A(type1, type2, 4, direction) \ | |||||
| else INSTANCE_A(type1, type2, 2, direction) \ | |||||
| else INSTANCE_A(type1, type2, 0, direction) | |||||
| } // anonymous namespace | |||||
| @@ -0,0 +1,56 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/chanwise/fwd_depthwise_large_filter.cu | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "cuda.h" | |||||
| #include "cuda_fp16.h" | |||||
| #include "src/cuda/conv_bias/chanwise/kern.cuh" | |||||
| #include "src/cuda/conv_bias/chanwise/kern_helper.cuh" | |||||
| #include "src/cuda/conv_bias/chanwise/launch_config.cuh" | |||||
| #include "src/cuda/fp16_help.cuh" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace conv_bias; | |||||
| using namespace chanwise; | |||||
| #include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh" | |||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace conv_bias { | |||||
| namespace chanwise { | |||||
| // =====================================fwd===================================== | |||||
| #define check | |||||
| template <> | |||||
| void run_fwd_depthwise_large_filter( | |||||
| float* dst, const float* src, const float* flt, const Param& param, | |||||
| cudaStream_t stream) { | |||||
| INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_FORWARD) | |||||
| } | |||||
| #if CUDA_VERSION >= 9000 | |||||
| template <> | |||||
| void run_fwd_depthwise_large_filter( | |||||
| __half* dst, const __half* src, const __half* flt, const Param& param, | |||||
| cudaStream_t stream) { | |||||
| INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_FORWARD) | |||||
| } | |||||
| #endif | |||||
| } // namespace chanwise | |||||
| } // namespace conv_bias | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -27,8 +27,10 @@ namespace chanwise { | |||||
| struct Param { | struct Param { | ||||
| uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, | uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, | ||||
| pad_w, stride_h, stride_w, dilation_h, dilation_w; | pad_w, stride_h, stride_w, dilation_h, dilation_w; | ||||
| bool is_compute_deafult; | |||||
| #if MEGDNN_CC_HOST | #if MEGDNN_CC_HOST | ||||
| static Param from_fwd_args(const BiasForwardSizeArgs& args) { | |||||
| static Param from_fwd_args( | |||||
| const BiasForwardSizeArgs& args, bool is_compute_deafult_ = true) { | |||||
| #define U(v) static_cast<uint32_t>(v) | #define U(v) static_cast<uint32_t>(v) | ||||
| auto&& src = args.src_layout->shape; | auto&& src = args.src_layout->shape; | ||||
| auto&& dst = args.dst_layout->shape; | auto&& dst = args.dst_layout->shape; | ||||
| @@ -42,11 +44,12 @@ struct Param { | |||||
| hw_pos = 1; | hw_pos = 1; | ||||
| } | } | ||||
| return { | return { | ||||
| U(src[0]), U(src[c_pos]), U(src[hw_pos]), | |||||
| U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), | |||||
| U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), | |||||
| U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), | |||||
| U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), | |||||
| U(src[0]), U(src[c_pos]), U(src[hw_pos]), | |||||
| U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), | |||||
| U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), | |||||
| U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), | |||||
| U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), | |||||
| is_compute_deafult_, | |||||
| }; | }; | ||||
| #undef U | #undef U | ||||
| } | } | ||||
| @@ -61,6 +64,10 @@ template <typename T> | |||||
| void run_fwd_small( | void run_fwd_small( | ||||
| T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream); | T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream); | ||||
| template <typename T> | |||||
| void run_fwd_depthwise_large_filter( | |||||
| T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream); | |||||
| // implemented in fwd_8x8x32.cu | // implemented in fwd_8x8x32.cu | ||||
| void run_fwd_8x8x32( | void run_fwd_8x8x32( | ||||
| int32_t* dst, const int8_t* src, const int8_t* flt, const Param& param, | int32_t* dst, const int8_t* src, const int8_t* flt, const Param& param, | ||||
| @@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) { | |||||
| return cutlass::conv::ConvType::kLocal; | return cutlass::conv::ConvType::kLocal; | ||||
| case Base::ConvType::kLocalShare: | case Base::ConvType::kLocalShare: | ||||
| return cutlass::conv::ConvType::kLocalShare; | return cutlass::conv::ConvType::kLocalShare; | ||||
| case Base::ConvType::kDepthwiseConvolution: | |||||
| return cutlass::conv::ConvType::kDepthwiseConvolution; | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid conv type"); | megdnn_assert(0, "invalid conv type"); | ||||
| } | } | ||||
| } | } | ||||
| NumericTypeID convert_dtype(DTypeEnum dtype) { | |||||
| switch (dtype) { | |||||
| NumericTypeID convert_dtype(DType dtype) { | |||||
| // a convolution without bias has an invalid bias dtype; treat it as f32 | |||||
| if (!dtype.valid()) | |||||
| return NumericTypeID::kF32; | |||||
| switch (dtype.enumv()) { | |||||
| case DTypeEnum::Float32: | case DTypeEnum::Float32: | ||||
| return NumericTypeID::kF32; | return NumericTypeID::kF32; | ||||
| case DTypeEnum::Float16: | case DTypeEnum::Float16: | ||||
| @@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) { | |||||
| } | } | ||||
| } | } | ||||
| NumericTypeID get_accumulator_dtype( | |||||
| DType dtype, const param::ConvBias::ComputeMode comp_mode) { | |||||
| if (dtype.category() == DTypeCategory::QUANTIZED) { | |||||
| return NumericTypeID::kS32; | |||||
| } else { | |||||
| megdnn_assert(dtype.category() == DTypeCategory::FLOAT); | |||||
| if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) { | |||||
| return convert_dtype(dtype); | |||||
| } else { | |||||
| megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32); | |||||
| return NumericTypeID::kF32; | |||||
| } | |||||
| } | |||||
| } | |||||
| struct LayoutPack { | struct LayoutPack { | ||||
| LayoutTypeID src; | LayoutTypeID src; | ||||
| LayoutTypeID filter; | LayoutTypeID filter; | ||||
| @@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid access_type"); | megdnn_assert(0, "invalid access_type"); | ||||
| } | } | ||||
| case Format::NCHW: | |||||
| return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW, | |||||
| LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW}; | |||||
| default: | default: | ||||
| megdnn_assert(0, "invalid format"); | megdnn_assert(0, "invalid format"); | ||||
| } | } | ||||
| @@ -177,6 +200,94 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla | |||||
| megdnn_assert(0, "invalid nonlinear mode"); | megdnn_assert(0, "invalid nonlinear mode"); | ||||
| } | } | ||||
| std::pair<int, int> get_tensor_alignment( | |||||
| const param::ConvBias::Format format, const TensorLayout& src, | |||||
| const TensorLayout& filter, const Base::AlgoParam& algo_param, | |||||
| bool is_chanwise) { | |||||
| int alignment_src = 0; | |||||
| int alignment_filter = 0; | |||||
| using Format = param::ConvBias::Format; | |||||
| // get tensor alignment for tensor op operations | |||||
| // for tensor op operations, the alignment is determined by the size of a vector | |||||
| auto get_tensor_alignment_tensor_op = [&]() { | |||||
| switch (format) { | |||||
| /// case int8 | |||||
| case Format::NCHW32: | |||||
| case Format::NCHW32_NCHW4: | |||||
| alignment_src = 16; | |||||
| alignment_filter = 16; | |||||
| break; | |||||
| /// case int4 or uint4 | |||||
| case Format::NCHW64: | |||||
| alignment_src = 32; | |||||
| alignment_filter = 32; | |||||
| break; | |||||
| case Format::NHWC: | |||||
| alignment_src = alignment_filter = algo_param.access_size; | |||||
| break; | |||||
| default: | |||||
| megdnn_throw("invalid format"); | |||||
| }; | |||||
| }; | |||||
| // get tensor alignment for dot product operations | |||||
| // for integer dot product operations, alignment src is always 4 | |||||
| // and the alignment filter is determined by the threadblock shape | |||||
| auto get_tensor_alignment_dp4a = [&]() { | |||||
| megdnn_assert( | |||||
| format == Format::NCHW4 || format == Format::NCHW4_NCHW || | |||||
| format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32); | |||||
| alignment_src = 4; | |||||
| // determine alignment filter | |||||
| constexpr int warp_size = 32; | |||||
| int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n * | |||||
| algo_param.threadblock_k / | |||||
| (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k); | |||||
| int threadblock_loads = | |||||
| filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k); | |||||
| int load_per_thread = threadblock_loads / threads; | |||||
| if (load_per_thread >= 16) | |||||
| alignment_filter = 16; | |||||
| else if (load_per_thread >= 8) | |||||
| alignment_filter = 8; | |||||
| else { | |||||
| megdnn_assert(load_per_thread >= 4); | |||||
| alignment_filter = 4; | |||||
| } | |||||
| }; | |||||
| // get tensor alignment for depthwise convolution | |||||
| auto get_tensor_alignment_dwconv2d_nchw = [&]() { | |||||
| alignment_filter = 1; | |||||
| size_t wi = src.dtype.size(src[3]); // width extent in bytes | |||||
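| // choose the largest byte alignment (16, 4 or 2) that divides the row width in | |||||
| // bytes; it is converted from bytes to elements just below | |||||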
| for (size_t candidate : {16, 4, 2}) { | |||||
| if (wi % candidate == 0) { | |||||
| alignment_src = candidate; | |||||
| break; | |||||
| } | |||||
| } | |||||
| alignment_src /= src.dtype.size(1); | |||||
| }; | |||||
| /// TODO: need a better way to check whether tensor core instruction is used | |||||
| if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 || | |||||
| format == Format::NCHW64 || format == Format::NHWC) { | |||||
| get_tensor_alignment_tensor_op(); | |||||
| } else if ( | |||||
| format == Format::NCHW4 || format == Format::NCHW4_NCHW || | |||||
| format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) { | |||||
| get_tensor_alignment_dp4a(); | |||||
| } else { | |||||
| /// the following is used for depthwise convolution | |||||
| megdnn_assert(format == Format::NCHW && is_chanwise); | |||||
| get_tensor_alignment_dwconv2d_nchw(); | |||||
| } | |||||
| megdnn_assert(alignment_src >= 1 && alignment_filter >= 1); | |||||
| return {alignment_src, alignment_filter}; | |||||
| } | |||||
| } // namespace | } // namespace | ||||
| const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op( | ||||
| @@ -185,23 +296,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co | |||||
| auto&& param = args.opr->param(); | auto&& param = args.opr->param(); | ||||
| auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | auto layouts = get_layout_pack(param.format, m_algo_param.access_size); | ||||
| auto epilogue_type = get_epilogue_type( | auto epilogue_type = get_epilogue_type( | ||||
| param.nonlineMode, args.dst_layout->dtype.enumv() != DTypeEnum::Float32); | |||||
| param.nonlineMode, | |||||
| args.dst_layout->dtype.category() != DTypeCategory::FLOAT); | |||||
| cutlass::conv::SpecialOptimizeDesc special_optimization = | cutlass::conv::SpecialOptimizeDesc special_optimization = | ||||
| (use_conv_filter_unity_opt) | (use_conv_filter_unity_opt) | ||||
| ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY | ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY | ||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | : cutlass::conv::SpecialOptimizeDesc::NONE; | ||||
| int alignment_src, alignment_filter; | |||||
| auto&& fm = args.filter_meta; | |||||
| bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 && | |||||
| fm.ocpg == 1; | |||||
| std::tie(alignment_src, alignment_filter) = get_tensor_alignment( | |||||
| param.format, *args.src_layout, *args.filter_layout, m_algo_param, | |||||
| is_chanwise); | |||||
| auto accumulator_dtype = | |||||
| get_accumulator_dtype(args.src_layout->dtype, param.compute_mode); | |||||
| ConvolutionKey key{ | ConvolutionKey key{ | ||||
| convert_conv_op(conv_op), | convert_conv_op(conv_op), | ||||
| convert_dtype(args.src_layout->dtype.enumv()), | |||||
| convert_dtype(args.src_layout->dtype), | |||||
| layouts.src, | layouts.src, | ||||
| convert_dtype(args.filter_layout->dtype.enumv()), | |||||
| convert_dtype(args.filter_layout->dtype), | |||||
| layouts.filter, | layouts.filter, | ||||
| convert_dtype(args.dst_layout->dtype.enumv()), | |||||
| convert_dtype(args.dst_layout->dtype), | |||||
| layouts.dst, | layouts.dst, | ||||
| convert_dtype(args.bias_layout->dtype.enumv()), | |||||
| convert_dtype(args.bias_layout->dtype), | |||||
| layouts.bias, | layouts.bias, | ||||
| accumulator_dtype, | |||||
| convert_conv_type(conv_type), | convert_conv_type(conv_type), | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -215,6 +339,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co | |||||
| epilogue_type, | epilogue_type, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| alignment_src, | |||||
| alignment_filter, | |||||
| without_shared_load}; | without_shared_load}; | ||||
| return Singleton::get().operation_table.find_op(key); | return Singleton::get().operation_table.find_op(key); | ||||
| @@ -227,13 +353,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op( | |||||
| size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha, | size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha, | ||||
| const void* beta, const void* gamma, const void* delta, const void* theta, | const void* beta, const void* gamma, const void* delta, const void* theta, | ||||
| const void* threshold, const void* dst_scale, cudaStream_t stream, | const void* threshold, const void* dst_scale, cudaStream_t stream, | ||||
| const void* extra_param) const { | |||||
| const void* extra_param, size_t groups) const { | |||||
| // gcc prints warnings when size_t values are implicitly narrowed to int | // gcc prints warnings when size_t values are implicitly narrowed to int | ||||
| cutlass::conv::Conv2dProblemSize problem_size{ | cutlass::conv::Conv2dProblemSize problem_size{ | ||||
| int(n), int(hi), int(wi), int(ci), | |||||
| int(co), int(fh), int(fw), int(ho), | |||||
| int(wo), int(ph), int(pw), int(sh), | |||||
| int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation}; | |||||
| int(n), int(hi), int(wi), int(ci), | |||||
| int(co), int(fh), int(fw), int(ho), | |||||
| int(wo), int(ph), int(pw), int(sh), | |||||
| int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| int(groups), // groups | |||||
| }; | |||||
| ConvolutionArguments conv_args{ | ConvolutionArguments conv_args{ | ||||
| problem_size, src, filter, bias, z, dst, alpha, | problem_size, src, filter, bias, z, dst, alpha, | ||||
| @@ -0,0 +1,111 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/depthwise_large_filter.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh" | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/conv_bias/chanwise/kern.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace conv_bias; | |||||
| namespace { | |||||
| inline bool is_available_depthwise_large_filter(const chanwise::Param& param) { | |||||
| if ((param.stride_h == 1 && param.stride_w == 1) || | |||||
| (param.stride_h == 2 && param.stride_w == 2)) { | |||||
| auto&& device_prop = cuda::current_device_prop(); | |||||
| static int const unroll_oh = 1, unroll_fh = 1; | |||||
| CHECK(FWD) | |||||
| } | |||||
| return false; | |||||
| } | |||||
| } // anonymous namespace | |||||
| bool ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::is_available( | |||||
| const SizeArgs& args) const { | |||||
| if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) { | |||||
| return false; | |||||
| } | |||||
| if (args.src_layout->dtype != args.filter_layout->dtype || | |||||
| (args.src_layout->dtype != dtype::Float32() | |||||
| #if CUDA_VERSION >= 9000 | |||||
| && args.src_layout->dtype != dtype::Float16() | |||||
| #endif | |||||
| )) { | |||||
| return false; | |||||
| } | |||||
| if (args.z_layout->ndim > 0) | |||||
| return false; | |||||
| auto param = chanwise::Param::from_fwd_args( | |||||
| args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT); | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW && | |||||
| args.src_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && | |||||
| fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && | |||||
| fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && | |||||
| is_available_depthwise_large_filter(param); | |||||
| } | |||||
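| // A workspace is only needed when the convolution result dtype differs from the | |||||
| // bias dtype: the conv output is then written to a temporary buffer and the bias | |||||
| // and nonlinearity are applied afterwards by handle_bias_and_nonlinear. | |||||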
| size_t ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| auto dst_layout = *args.dst_layout; | |||||
| if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) { | |||||
| dst_layout.dtype = DType(); | |||||
| args.opr->check_or_deduce_dtype_fwd( | |||||
| args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype); | |||||
| return dst_layout.span().dist_byte(); | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::exec(const ExecArgs& args) const { | |||||
| WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}}; | |||||
| TensorND conv_dst_tensor = *args.dst_tensor; | |||||
| if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) { | |||||
| conv_dst_tensor = TensorND{bundle.get(0), conv_dst_tensor.layout}; | |||||
| conv_dst_tensor.layout.dtype = DType(); | |||||
| args.opr->check_or_deduce_dtype_fwd( | |||||
| args.src_layout->dtype, args.filter_layout->dtype, | |||||
| conv_dst_tensor.layout.dtype); | |||||
| } | |||||
| { | |||||
| auto kparam = chanwise::Param::from_fwd_args( | |||||
| args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT); | |||||
| auto stream = cuda_stream(args.handle); | |||||
| switch (args.src_layout->dtype.enumv()) { | |||||
| case DTypeEnum::Float32: | |||||
| chanwise::run_fwd_depthwise_large_filter( | |||||
| conv_dst_tensor.ptr<float>(), args.src_tensor->ptr<float>(), | |||||
| args.filter_tensor->ptr<float>(), kparam, stream); | |||||
| break; | |||||
| #if CUDA_VERSION >= 9000 | |||||
| case DTypeEnum::Float16: | |||||
| chanwise::run_fwd_depthwise_large_filter( | |||||
| static_cast<half*>(conv_dst_tensor.raw_ptr()), | |||||
| static_cast<half*>(args.src_tensor->raw_ptr()), | |||||
| static_cast<half*>(args.filter_tensor->raw_ptr()), kparam, | |||||
| stream); | |||||
| break; | |||||
| #endif | |||||
| default: | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| } | |||||
| handle_bias_and_nonlinear( | |||||
| args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor, | |||||
| args.bias_tensor); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,99 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(7, 0)); | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | |||||
| using Param = param::ConvBias; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| using NonlineMode = Param::NonlineMode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID); | |||||
| RETURN_IF_FALSE( | |||||
| args.bias_layout->ndim <= 0 || | |||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| check_bias_share_in_channel(*args.bias_layout, param.format))); | |||||
| RETURN_IF_FALSE( | |||||
| args.z_layout->ndim <= 0 || | |||||
| args.z_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2), | |||||
| wi = args.src_layout->operator[](3); | |||||
| size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3); | |||||
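| // For channelwise convolution each group has one input and one output channel, so | |||||
| // both channel counts equal the group count; fm.group is forwarded to | |||||
| // execute_cutlass_conv_op below. | |||||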
| size_t co = fm.group; | |||||
| size_t ci = co; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr; | |||||
| float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr; | |||||
| // dummy parameters, used for quantization cases | |||||
| float theta = 0.f; | |||||
| float delta = 0.f; | |||||
| float threshold = 0.f; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| UNPACK_CONV_PARAMETER(fm, param); | |||||
| MARK_USED_VAR | |||||
| execute_cutlass_conv_op( | |||||
| op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr, | |||||
| z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, | |||||
| &threshold, nullptr, stream, nullptr, fm.group); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,99 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/common/conv_bias.h" | |||||
| #include "src/cuda/conv_bias/algo.h" | |||||
| #include "src/cuda/convolution_helper/parameter.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(6, 1)); | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.dst_layout->is_contiguous()); | |||||
| using Param = param::ConvBias; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| using NonlineMode = Param::NonlineMode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.dst_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID); | |||||
| RETURN_IF_FALSE( | |||||
| args.bias_layout->ndim <= 0 || | |||||
| (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| check_bias_share_in_channel(*args.bias_layout, param.format))); | |||||
| RETURN_IF_FALSE( | |||||
| args.z_layout->ndim <= 0 || | |||||
| args.z_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2), | |||||
| wi = args.src_layout->operator[](3); | |||||
| size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3); | |||||
| size_t co = fm.group; | |||||
| size_t ci = co; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr; | |||||
| float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f; | |||||
| void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr; | |||||
| // dummy parameters, used for quantization cases | |||||
| float theta = 0.f; | |||||
| float delta = 0.f; | |||||
| float threshold = 0.f; | |||||
| const auto* op = get_cutlass_conv_op( | |||||
| args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false); | |||||
| UNPACK_CONV_PARAMETER(fm, param); | |||||
| MARK_USED_VAR | |||||
| execute_cutlass_conv_op( | |||||
| op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr, | |||||
| z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta, | |||||
| &threshold, nullptr, stream, nullptr, fm.group); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
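Both forward algorithms above enable the bias and residual terms purely through the epilogue coefficients: beta is 1 only when a bias tensor is present and gamma is 1 only when a z tensor is present. A minimal per-element sketch of that combination, assuming the BiasAddLinearCombination epilogue reduces to the linear form suggested by the coefficient handling above (activation and quantization terms omitted):

    // Illustrative only: per-output-element view of the coefficients set in exec().
    // alpha scales the convolution result, beta gates the bias, gamma gates the
    // residual (z) input; theta/delta/threshold stay 0 for these float kernels.
    static float epilogue_sketch(
            float conv_acc, float bias, float z, float alpha, float beta, float gamma) {
        return alpha * conv_acc + beta * bias + gamma * z;
    }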
| @@ -145,9 +145,22 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic( | |||||
| const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl || | const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl || | ||||
| args.filter_meta.stride[0] != 1 || | args.filter_meta.stride[0] != 1 || | ||||
| args.filter_meta.stride[1] != 1 || hw_size < 512; | args.filter_meta.stride[1] != 1 || hw_size < 512; | ||||
| //! prefer the implicit batched gemm algos for large-kernel cases | |||||
| size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; | |||||
| size_t hi = src[2], wi = src[3]; | |||||
| const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw; | |||||
| //! avoid bad case in cudnn, check dnn chanwise impl first | //! avoid bad case in cudnn, check dnn chanwise impl first | ||||
| if (is_chanwise) { | if (is_chanwise) { | ||||
| if (prefer_dnn_chanwise) { | |||||
| if (prefer_dnn_lk_implbmm) { | |||||
| #if CUDA_VERSION >= 10020 | |||||
| if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.f16_implicit_bmm[0]; | |||||
| #endif | |||||
| if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.f32_implicit_bmm[0]; | |||||
| } else if (prefer_dnn_chanwise) { | |||||
| if (sm_algo_pack.chanwise.is_available_attribute( | if (sm_algo_pack.chanwise.is_available_attribute( | ||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | args, positive_attr, negative_attr, workspace_limit_in_bytes)) | ||||
| return &sm_algo_pack.chanwise; | return &sm_algo_pack.chanwise; | ||||
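The new branch above routes channelwise convolutions to the implicit batched gemm algorithms only when the input is at most twice the filter size in each spatial dimension, i.e. the "large kernel" regime. A tiny sketch of that predicate (the helper name and example sizes are illustrative, not taken from the diff):

    // Sketch of the large-kernel heuristic added above.
    static bool prefer_large_kernel_implbmm(size_t hi, size_t wi, size_t fh, size_t fw) {
        // e.g. a 31x31 filter on a 32x32 input: 32 <= 62 && 32 <= 62 -> true
        return hi <= 2 * fh && wi <= 2 * fw;
    }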
| @@ -45,6 +45,7 @@ public: | |||||
| class AlgoCUDNNConvBiasActivation; | class AlgoCUDNNConvBiasActivation; | ||||
| class AlgoChanwise; | class AlgoChanwise; | ||||
| class AlgoChanwiseSmall; | class AlgoChanwiseSmall; | ||||
| class AlgoDepthwiseLargeFilter; | |||||
| class AlgoChanwise8x8x32; | class AlgoChanwise8x8x32; | ||||
| class AlgoCUDNNConv; | class AlgoCUDNNConv; | ||||
| class AlgoFallbackNCHWQS8; | class AlgoFallbackNCHWQS8; | ||||
| @@ -71,6 +72,9 @@ public: | |||||
| class AlgoInt4Int4NHWCIMMAImplicitGemm; | class AlgoInt4Int4NHWCIMMAImplicitGemm; | ||||
| class AlgoUInt4Int4NHWCIMMAImplicitGemm; | class AlgoUInt4Int4NHWCIMMAImplicitGemm; | ||||
| class AlgoBFloat16; | class AlgoBFloat16; | ||||
| // The following algorithms are suitable for channelwise (depthwise) convolution | |||||
| class AlgoFloat32NCHWFMAImplicitBatchedGemm; | |||||
| class AlgoFloat16NCHWHMMAImplicitBatchedGemm; | |||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -19,10 +19,12 @@ using namespace cuda; | |||||
| ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | ||||
| non_cudnn_algos.push_back(&chanwise); | non_cudnn_algos.push_back(&chanwise); | ||||
| non_cudnn_algos.push_back(&chanwise_small); | non_cudnn_algos.push_back(&chanwise_small); | ||||
| non_cudnn_algos.push_back(&depthwise_large_filter); | |||||
| non_cudnn_algos.push_back(&matmul); | non_cudnn_algos.push_back(&matmul); | ||||
| all_algos.push_back(&chanwise); // prefer chanwise | all_algos.push_back(&chanwise); // prefer chanwise | ||||
| all_algos.push_back(&chanwise_small); // prefer small chanwise | all_algos.push_back(&chanwise_small); // prefer small chanwise | ||||
| all_algos.push_back(&depthwise_large_filter); | |||||
| fill_cudnn_algos(); | fill_cudnn_algos(); | ||||
| for (auto&& i : cudnn) { | for (auto&& i : cudnn) { | ||||
| @@ -41,6 +43,7 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | |||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| int8_algos.push_back(&algo); | int8_algos.push_back(&algo); | ||||
| } | } | ||||
| fill_dwconv_algos(); | |||||
| int8_algos.push_back(&int8_nchw_dotprod); | int8_algos.push_back(&int8_nchw_dotprod); | ||||
| all_algos.push_back(&int8_nchw_dotprod); | all_algos.push_back(&int8_nchw_dotprod); | ||||
| @@ -54,6 +57,39 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { | |||||
| } | } | ||||
| } | } | ||||
| void ConvolutionBackwardDataImpl::AlgoPack::fill_dwconv_algos() { | |||||
| { | |||||
| using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam; | |||||
| /// preferred algo | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2}); | |||||
| for (auto&& algo : implbmm_nchw_fma) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| } | |||||
| #if CUDA_VERSION >= 10010 | |||||
| { | |||||
| using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam; | |||||
| /// preferred algo | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| for (auto&& algo : implbmm_nchw_hmma) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| } | |||||
| MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl) | MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl) | ||||
| ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack:: | ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack:: | ||||
| @@ -37,11 +37,14 @@ public: | |||||
| CUDA_MATMUL, | CUDA_MATMUL, | ||||
| CUDA_CHANWISE, | CUDA_CHANWISE, | ||||
| CUDA_CHANWISE_SMALL, | CUDA_CHANWISE_SMALL, | ||||
| CUDA_DEPTHWISE_LARGE_FILTER, | |||||
| CUDA_BFLOAT16, | CUDA_BFLOAT16, | ||||
| CUDA_GROUP_CONV_GENERAL, | CUDA_GROUP_CONV_GENERAL, | ||||
| CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8, | CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8, | ||||
| CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8, | CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8, | ||||
| CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8 | |||||
| CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, | |||||
| }; | }; | ||||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
| @@ -190,6 +193,20 @@ public: | |||||
| } | } | ||||
| }; | }; | ||||
| class ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter final : public AlgoBase { | |||||
| public: | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return "DEPTHWISE_LARGE_FILTER"; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER) | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| private: | |||||
| mutable std::string m_name; | |||||
| }; | |||||
| class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { | class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { | ||||
| public: | public: | ||||
| bool is_available(const SizeArgs& args) const override; | bool is_available(const SizeArgs& args) const override; | ||||
| @@ -315,6 +332,82 @@ private: | |||||
| std::string m_name; | std::string m_name; | ||||
| }; | }; | ||||
| class ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final | |||||
| : public AlgoBase { | |||||
| public: | |||||
| struct AlgoParam { | |||||
| int threadblock_m; | |||||
| int threadblock_n; | |||||
| int threadblock_k; | |||||
| int warp_m; | |||||
| int warp_n; | |||||
| int warp_k; | |||||
| int stage; | |||||
| std::string to_string() { | |||||
| return ssprintf( | |||||
| "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n, | |||||
| threadblock_k, warp_m, warp_n, warp_k, stage); | |||||
| } | |||||
| }; | |||||
| AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : m_algo_param{algo_param}, | |||||
| m_name{ssprintf( | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str())} {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); } | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32) | |||||
| private: | |||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| AlgoParam m_algo_param; | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final | |||||
| : public AlgoBase { | |||||
| public: | |||||
| /// the instruction shape is part of the algo param, because the f16 tensor core | |||||
| /// supports 2 different mma shapes (i.e. mma.884 and mma.1688) | |||||
| struct AlgoParam { | |||||
| int threadblock_m; | |||||
| int threadblock_n; | |||||
| int threadblock_k; | |||||
| int warp_m; | |||||
| int warp_n; | |||||
| int warp_k; | |||||
| int instruction_m; | |||||
| int instruction_n; | |||||
| int instruction_k; | |||||
| int stage; | |||||
| std::string to_string() { | |||||
| return ssprintf( | |||||
| "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m, | |||||
| threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m, | |||||
| instruction_n, instruction_k, stage); | |||||
| } | |||||
| }; | |||||
| AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : m_algo_param{algo_param}, | |||||
| m_name{ssprintf( | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str())} {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); } | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16) | |||||
| private: | |||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| AlgoParam m_algo_param; | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { | class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { | ||||
| // defined in cudnn.cpp | // defined in cudnn.cpp | ||||
| void fill_cudnn_algos(); | void fill_cudnn_algos(); | ||||
| @@ -322,6 +415,7 @@ class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { | |||||
| void fill_int8_dp4a_algos(); | void fill_int8_dp4a_algos(); | ||||
| // defined in implicit_gemm_int8_nhwc_imma.cpp | // defined in implicit_gemm_int8_nhwc_imma.cpp | ||||
| void fill_int8_imma_algos(); | void fill_int8_imma_algos(); | ||||
| void fill_dwconv_algos(); | |||||
| AlgoBase::Mapper m_all_algos_map; | AlgoBase::Mapper m_all_algos_map; | ||||
| @@ -332,11 +426,14 @@ public: | |||||
| AlgoMatmul matmul; | AlgoMatmul matmul; | ||||
| AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
| AlgoChanwiseSmall chanwise_small; | AlgoChanwiseSmall chanwise_small; | ||||
| AlgoDepthwiseLargeFilter depthwise_large_filter; | |||||
| AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
| AlgoGroupConvGeneral group; | AlgoGroupConvGeneral group; | ||||
| std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod; | ||||
| AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod; | AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod; | ||||
| std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma; | std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma; | ||||
| std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma; | |||||
| std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma; | |||||
| std::vector<AlgoBase*> | std::vector<AlgoBase*> | ||||
| //! all algorithms | //! all algorithms | ||||
| @@ -0,0 +1,89 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh" | |||||
| #include "src/cuda/convolution/backward_data/algo.h" | |||||
| #include "src/cuda/convolution/chanwise/kern.cuh" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace convolution; | |||||
| namespace { | |||||
| inline bool is_available_depthwise_large_filter(const chanwise::Param& param) { | |||||
| if ((param.stride_h == 1 && param.stride_w == 1) || | |||||
| (param.stride_h == 2 && param.stride_w == 2)) { | |||||
| auto&& device_prop = cuda::current_device_prop(); | |||||
| static int const unroll_oh = 1, unroll_fh = 1; | |||||
| CHECK(BWD) | |||||
| } | |||||
| return false; | |||||
| } | |||||
| } // anonymous namespace | |||||
| bool ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::is_available( | |||||
| const SizeArgs& args) const { | |||||
| if (!args.grad_layout->is_contiguous() || !args.diff_layout->is_contiguous()) { | |||||
| return false; | |||||
| } | |||||
| if (args.diff_layout->dtype != args.filter_layout->dtype || | |||||
| (args.diff_layout->dtype != dtype::Float32() | |||||
| #if CUDA_VERSION >= 9000 | |||||
| && args.diff_layout->dtype != dtype::Float16() | |||||
| #endif | |||||
| )) { | |||||
| return false; | |||||
| } | |||||
| auto param = chanwise::Param::from_fwd_args( | |||||
| {args.handle, args.diff_layout, args.filter_layout, args.filter_meta, | |||||
| args.grad_layout}); | |||||
| auto&& fm = args.filter_meta; | |||||
| return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW && | |||||
| args.diff_layout->dtype.category() == DTypeCategory::FLOAT && | |||||
| args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && | |||||
| fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && | |||||
| fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && | |||||
| is_available_depthwise_large_filter(param); | |||||
| } | |||||
| size_t ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes( | |||||
| const SizeArgs& args) const { | |||||
| return 0; | |||||
| } | |||||
| void ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto kparam = chanwise::Param::from_fwd_args( | |||||
| {args.handle, args.diff_layout, args.filter_layout, args.filter_meta, | |||||
| args.grad_layout}); | |||||
| auto stream = cuda_stream(args.handle); | |||||
| switch (args.diff_layout->dtype.enumv()) { | |||||
| case DTypeEnum::Float32: | |||||
| chanwise::run_bwd_depthwise_large_filter( | |||||
| args.grad_tensor->ptr<float>(), args.diff_tensor->ptr<float>(), | |||||
| args.filter_tensor->ptr<float>(), kparam, stream); | |||||
| break; | |||||
| #if CUDA_VERSION >= 9000 | |||||
| case DTypeEnum::Float16: | |||||
| chanwise::run_bwd_depthwise_large_filter( | |||||
| static_cast<half*>(args.grad_tensor->raw_ptr()), | |||||
| static_cast<half*>(args.diff_tensor->raw_ptr()), | |||||
| static_cast<half*>(args.filter_tensor->raw_ptr()), kparam, stream); | |||||
| break; | |||||
| #endif | |||||
| default: | |||||
| megdnn_assert_internal(0); | |||||
| } | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,149 @@ | |||||
| /** | |||||
| * \file | |||||
| * dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/cuda/convolution/backward_data/algo.h" | |||||
| #include "src/cuda/cutlass/singleton.h" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass::library; | |||||
| const void* ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm:: | |||||
| get_available_op(const SizeArgs& args) const { | |||||
| int alignment_diff = 0; | |||||
| int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3)); | |||||
| for (int candidate : {16, 4, 2}) { | |||||
| if (wo % candidate == 0) { | |||||
| alignment_diff = candidate; | |||||
| break; | |||||
| } | |||||
| } | |||||
| alignment_diff /= args.diff_layout->dtype.size(1); | |||||
| NumericTypeID accumulator_dtype = | |||||
| args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT | |||||
| ? NumericTypeID::kF16 | |||||
| : NumericTypeID::kF32; | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kF16, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF16, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF16, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF16, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| accumulator_dtype, | |||||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| m_algo_param.instruction_m, | |||||
| m_algo_param.instruction_n, | |||||
| m_algo_param.instruction_k, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombination, | |||||
| m_algo_param.stage, | |||||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||||
| alignment_diff, | |||||
| 1, | |||||
| false}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(7, 0)); | |||||
| RETURN_IF_FALSE( | |||||
| args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous()); | |||||
| using Param = param::Convolution; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.diff_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.grad_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_available_op(args); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2), | |||||
| wo = args.diff_layout->operator[](3); | |||||
| int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3); | |||||
| int co = fm.group, ci = co, groups = co; | |||||
| int fh = fm.spatial[0], fw = fm.spatial[1]; | |||||
| int sh = fm.stride[0], sw = fm.stride[1]; | |||||
| int ph = fm.padding[0], pw = fm.padding[1]; | |||||
| int dh = param.dilate_h, dw = param.dilate_w; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = 0.f; | |||||
| float gamma = 0.f; | |||||
| float delta = 0.f; | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| cutlass::conv::Conv2dProblemSize problem_size{ | |||||
| n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| groups, // groups | |||||
| }; | |||||
| cutlass::library::ConvolutionArguments conv_args{ | |||||
| problem_size, | |||||
| args.diff_tensor->raw_ptr(), | |||||
| args.filter_tensor->raw_ptr(), | |||||
| nullptr, | |||||
| nullptr, | |||||
| args.grad_tensor->raw_ptr(), | |||||
| &alpha, | |||||
| &beta, | |||||
| &gamma, | |||||
| &delta, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr}; | |||||
| cutlass_check(op->run(&conv_args, nullptr, stream)); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
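get_available_op above derives the memory-access alignment from the byte width of one row of the diff tensor: it picks the largest of 16/4/2 bytes that evenly divides the row (16/4 for the f32 variant below) and then converts that to an element count. A standalone sketch of the same selection; the helper name and the example numbers are illustrative:

    // row_bytes: bytes in one W row of the diff tensor; elem_bytes: dtype size in bytes.
    // Returns the alignment in elements, or 0 if no candidate divides the row.
    static int pick_alignment(int row_bytes, int elem_bytes) {
        int alignment_bytes = 0;
        for (int candidate : {16, 4, 2}) {
            if (row_bytes % candidate == 0) {
                alignment_bytes = candidate;
                break;
            }
        }
        // e.g. f16 with wo = 28: row_bytes = 56 -> 4-byte access -> alignment of 2 elements
        return alignment_bytes / elem_bytes;
    }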
| @@ -0,0 +1,145 @@ | |||||
| /** | |||||
| * \file | |||||
| * dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/cuda/convolution/backward_data/algo.h" | |||||
| #include "src/cuda/cutlass/singleton.h" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass::library; | |||||
| const void* ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm:: | |||||
| get_available_op(const SizeArgs& args) const { | |||||
| int alignment_diff = 0; | |||||
| int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3)); | |||||
| for (int candidate : {16, 4}) { | |||||
| if (wo % candidate == 0) { | |||||
| alignment_diff = candidate; | |||||
| break; | |||||
| } | |||||
| } | |||||
| alignment_diff /= args.diff_layout->dtype.size(1); | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kDgrad, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF32, | |||||
| LayoutTypeID::kTensorNCHW, | |||||
| NumericTypeID::kF32, | |||||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombination, | |||||
| m_algo_param.stage, | |||||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||||
| alignment_diff, | |||||
| 1, | |||||
| false}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(6, 1)); | |||||
| RETURN_IF_FALSE( | |||||
| args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous()); | |||||
| using Param = param::Convolution; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.diff_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.filter_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.grad_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_available_op(args); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.filter_meta; | |||||
| int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2), | |||||
| wo = args.diff_layout->operator[](3); | |||||
| int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3); | |||||
| int co = fm.group, ci = co, groups = co; | |||||
| int fh = fm.spatial[0], fw = fm.spatial[1]; | |||||
| int sh = fm.stride[0], sw = fm.stride[1]; | |||||
| int ph = fm.padding[0], pw = fm.padding[1]; | |||||
| int dh = param.dilate_h, dw = param.dilate_w; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = 0.f; | |||||
| float gamma = 0.f; | |||||
| float delta = 0.f; | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| cutlass::conv::Conv2dProblemSize problem_size{ | |||||
| n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| groups, // groups | |||||
| }; | |||||
| cutlass::library::ConvolutionArguments conv_args{ | |||||
| problem_size, | |||||
| args.diff_tensor->raw_ptr(), | |||||
| args.filter_tensor->raw_ptr(), | |||||
| nullptr, | |||||
| nullptr, | |||||
| args.grad_tensor->raw_ptr(), | |||||
| &alpha, | |||||
| &beta, | |||||
| &gamma, | |||||
| &delta, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr}; | |||||
| cutlass_check(op->run(&conv_args, nullptr, stream)); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
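In both dgrad kernels the CUTLASS problem size mixes dimensions from two layouts: n/ho/wo come from the diff (forward output) layout and hi/wi from the grad (forward input) layout, so the pair must satisfy the ordinary forward convolution relation. A one-line sketch of that relation for the non-dilated case these kernels accept (numbers are illustrative):

    // Forward spatial relation assumed by the dgrad problem sizes above.
    static int conv_out_size(int in, int filter, int pad, int stride) {
        // e.g. in = 32, filter = 31, pad = 15, stride = 1 -> (32 + 30 - 31) / 1 + 1 = 32
        return (in + 2 * pad - filter) / stride + 1;
    }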
| @@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| (sh == 2 && sw == 2) | (sh == 2 && sw == 2) | ||||
| ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING | ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING | ||||
| : cutlass::conv::SpecialOptimizeDesc::NONE; | : cutlass::conv::SpecialOptimizeDesc::NONE; | ||||
| int alignment_filter = 4; | |||||
| constexpr int warp_size = 32; | |||||
| int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n * | |||||
| m_algo_param.threadblock_k / | |||||
| (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k); | |||||
| int threadblock_loads = args.filter_layout->dtype.size( | |||||
| m_algo_param.threadblock_m * m_algo_param.threadblock_k); | |||||
| int load_per_thread = threadblock_loads / threads; | |||||
| if (load_per_thread >= 16) | |||||
| alignment_filter = 16; | |||||
| else if (load_per_thread >= 8) | |||||
| alignment_filter = 8; | |||||
| megdnn_assert(load_per_thread >= 4); | |||||
| ConvolutionKey key{ | ConvolutionKey key{ | ||||
| cutlass::conv::Operator::kDgrad, | cutlass::conv::Operator::kDgrad, | ||||
| NumericTypeID::kS8, | NumericTypeID::kS8, | ||||
| @@ -39,6 +52,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -52,6 +66,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| 4, | |||||
| alignment_filter, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
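The alignment_filter heuristic in the hunk above scales the filter access width with how many bytes each thread loads from the threadblock's filter tile: the thread count follows from the threadblock and warp shapes, and the tile's bytes are divided across those threads. A sketch with one hypothetical configuration worked through in the comments:

    // Sketch of the filter-alignment heuristic above (int8 filter, 1 byte/element).
    static int pick_filter_alignment(
            int tb_m, int tb_n, int tb_k, int warp_m, int warp_n, int warp_k,
            int filter_elem_bytes) {
        const int warp_size = 32;
        int threads = warp_size * tb_m * tb_n * tb_k / (warp_m * warp_n * warp_k);
        int load_per_thread = tb_m * tb_k * filter_elem_bytes / threads;
        // e.g. threadblock 64x128x32, warp 32x64x32: threads = 32 * 4 = 128,
        // load_per_thread = 64 * 32 / 128 = 16 -> 16-byte (128-bit) accesses
        if (load_per_thread >= 16) return 16;
        if (load_per_thread >= 8) return 8;
        return 4;
    }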
| @@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm:: | |||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNC4HW4, | LayoutTypeID::kTensorNC4HW4, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| 16, | 16, | ||||
| 64, | 64, | ||||
| @@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm:: | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| 2, | 2, | ||||
| special_optimization, | special_optimization, | ||||
| 4, | |||||
| 4, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail | |||||
| LayoutTypeID::kTensorNHWC, | LayoutTypeID::kTensorNHWC, | ||||
| NumericTypeID::kS32, | NumericTypeID::kS32, | ||||
| LayoutTypeID::kTensorNHWC, | LayoutTypeID::kTensorNHWC, | ||||
| NumericTypeID::kS32, | |||||
| cutlass::conv::ConvType::kConvolution, | cutlass::conv::ConvType::kConvolution, | ||||
| m_algo_param.threadblock_m, | m_algo_param.threadblock_m, | ||||
| m_algo_param.threadblock_n, | m_algo_param.threadblock_n, | ||||
| @@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail | |||||
| cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp, | ||||
| m_algo_param.stage, | m_algo_param.stage, | ||||
| special_optimization, | special_optimization, | ||||
| m_algo_param.access_size, | |||||
| m_algo_param.access_size, | |||||
| false}; | false}; | ||||
| return (void*)Singleton::get().operation_table.find_op(key); | return (void*)Singleton::get().operation_table.find_op(key); | ||||
| } | } | ||||
| @@ -25,6 +25,7 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& i : cudnn) { | for (auto&& i : cudnn) { | ||||
| all_algos.push_back(&i); | all_algos.push_back(&i); | ||||
| } | } | ||||
| fill_dwconv_algos(); | |||||
| all_algos.push_back(&matmul); | all_algos.push_back(&matmul); | ||||
| all_algos.push_back(&group); | all_algos.push_back(&group); | ||||
| @@ -48,6 +49,39 @@ ConvolutionBackwardFilterImpl::AlgoCUDNN* ConvolutionBackwardFilterImpl::AlgoPac | |||||
| "can not find cudnn bwd_filter algorithm %d", static_cast<int>(algo))); | "can not find cudnn bwd_filter algorithm %d", static_cast<int>(algo))); | ||||
| } | } | ||||
| void ConvolutionBackwardFilterImpl::AlgoPack::fill_dwconv_algos() { | |||||
| { | |||||
| using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam; | |||||
| /// preferred algo | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2}); | |||||
| implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2}); | |||||
| for (auto&& algo : implbmm_nchw_fma) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| } | |||||
| #if CUDA_VERSION >= 10010 | |||||
| { | |||||
| using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam; | |||||
| /// preferred algo | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2}); | |||||
| for (auto&& algo : implbmm_nchw_hmma) { | |||||
| all_algos.push_back(&algo); | |||||
| } | |||||
| } | |||||
| #endif | |||||
| } | |||||
| ConvolutionBackwardFilterImpl::AlgoPack ConvolutionBackwardFilterImpl::sm_algo_pack; | ConvolutionBackwardFilterImpl::AlgoPack ConvolutionBackwardFilterImpl::sm_algo_pack; | ||||
| ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs( | ||||
| @@ -37,6 +37,8 @@ public: | |||||
| CUDA_CHANWISE, | CUDA_CHANWISE, | ||||
| CUDA_BFLOAT16, | CUDA_BFLOAT16, | ||||
| CUDA_GROUP_CONV_GENERAL, | CUDA_GROUP_CONV_GENERAL, | ||||
| CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32, | |||||
| CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16, | |||||
| }; | }; | ||||
| using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>; | ||||
| @@ -210,9 +212,86 @@ private: | |||||
| WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; | ||||
| }; | }; | ||||
| class ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final | |||||
| : public AlgoBase { | |||||
| public: | |||||
| struct AlgoParam { | |||||
| int threadblock_m; | |||||
| int threadblock_n; | |||||
| int threadblock_k; | |||||
| int warp_m; | |||||
| int warp_n; | |||||
| int warp_k; | |||||
| int stage; | |||||
| std::string to_string() { | |||||
| return ssprintf( | |||||
| "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n, | |||||
| threadblock_k, warp_m, warp_n, warp_k, stage); | |||||
| } | |||||
| }; | |||||
| AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : m_algo_param{algo_param}, | |||||
| m_name{ssprintf( | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str())} {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; } | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); } | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32) | |||||
| private: | |||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| AlgoParam m_algo_param; | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final | |||||
| : public AlgoBase { | |||||
| public: | |||||
| /// the instruction shape is part of the algo param, because the f16 tensor core | |||||
| /// supports 2 different mma shapes (i.e. mma.884 and mma.1688) | |||||
| struct AlgoParam { | |||||
| int threadblock_m; | |||||
| int threadblock_n; | |||||
| int threadblock_k; | |||||
| int warp_m; | |||||
| int warp_n; | |||||
| int warp_k; | |||||
| int instruction_m; | |||||
| int instruction_n; | |||||
| int instruction_k; | |||||
| int stage; | |||||
| std::string to_string() { | |||||
| return ssprintf( | |||||
| "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m, | |||||
| threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m, | |||||
| instruction_n, instruction_k, stage); | |||||
| } | |||||
| }; | |||||
| AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param) | |||||
| : m_algo_param{algo_param}, | |||||
| m_name{ssprintf( | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s", | |||||
| m_algo_param.to_string().c_str())} {} | |||||
| bool is_available(const SizeArgs& args) const override; | |||||
| size_t get_workspace_in_bytes(const SizeArgs& args) const override; | |||||
| void exec(const ExecArgs& args) const override; | |||||
| const char* name() const override { return m_name.c_str(); } | |||||
| AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; } | |||||
| MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16) | |||||
| private: | |||||
| const void* get_available_op(const SizeArgs& args) const; | |||||
| AlgoParam m_algo_param; | |||||
| std::string m_name; | |||||
| }; | |||||
| class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj { | class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj { | ||||
| // defined in cudnn.cpp | // defined in cudnn.cpp | ||||
| void fill_cudnn_algos(); | void fill_cudnn_algos(); | ||||
| void fill_dwconv_algos(); | |||||
| AlgoBase::Mapper m_all_algos_map; | AlgoBase::Mapper m_all_algos_map; | ||||
| @@ -224,6 +303,8 @@ public: | |||||
| AlgoChanwise chanwise; | AlgoChanwise chanwise; | ||||
| AlgoGroupConvGeneral group; | AlgoGroupConvGeneral group; | ||||
| AlgoBFloat16 bfloat16; | AlgoBFloat16 bfloat16; | ||||
| std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma; | |||||
| std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma; | |||||
| std::vector<AlgoBase*> | std::vector<AlgoBase*> | ||||
| //! all algorithms | //! all algorithms | ||||
| @@ -0,0 +1,172 @@ | |||||
| /** | |||||
| * \file | |||||
| * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/cuda/convolution/backward_filter/algo.h" | |||||
| #include "src/cuda/cutlass/singleton.h" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass::library; | |||||
| const void* ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm:: | |||||
| get_available_op(const SizeArgs& args) const { | |||||
| auto get_alignment = [](const TensorLayout& layout) { | |||||
| int alignment = 0; | |||||
| int width = layout.dtype.size(layout[3]); | |||||
| for (int candidate : {16, 4, 2}) { | |||||
| if (width % candidate == 0) { | |||||
| alignment = candidate; | |||||
| break; | |||||
| } | |||||
| } | |||||
| alignment /= layout.dtype.size(1); | |||||
| return alignment; | |||||
| }; | |||||
| int alignment_src = get_alignment(*args.src_layout); | |||||
| int alignment_diff = get_alignment(*args.diff_layout); | |||||
| megdnn_assert(alignment_src >= 1 && alignment_diff >= 1); | |||||
| NumericTypeID accumulator_dtype = | |||||
| args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT | |||||
| ? NumericTypeID::kF16 | |||||
| : NumericTypeID::kF32; | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kWgrad, | |||||
| NumericTypeID::kF16, // src tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // src tensor layout | |||||
| NumericTypeID::kF16, // diff tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // diff tensor layout | |||||
| NumericTypeID::kF32, // grad tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // grad tensor layout | |||||
| NumericTypeID::kF32, // dummy argument, not used. | |||||
| LayoutTypeID::kTensorNCHW, // dummy argument, not used | |||||
| accumulator_dtype, | |||||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| m_algo_param.instruction_m, | |||||
| m_algo_param.instruction_n, | |||||
| m_algo_param.instruction_k, | |||||
| cutlass::epilogue::EpilogueType::kLinearCombination, // no bias | |||||
| m_algo_param.stage, | |||||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||||
| alignment_src, | |||||
| alignment_diff, | |||||
| true}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm:: | |||||
| is_available(const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(7, 0)); | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() && | |||||
| args.grad_layout->is_contiguous()); | |||||
| using Param = param::Convolution; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| using ComputeMode = Param::ComputeMode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| RETURN_IF_FALSE(param.compute_mode == ComputeMode::FLOAT32); | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.diff_layout->dtype.enumv() == DTypeEnum::Float16 && | |||||
| args.grad_layout->dtype.enumv() == DTypeEnum::Float16); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_available_op(args); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| size_t ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm:: | |||||
| get_workspace_in_bytes(const SizeArgs& args) const { | |||||
| auto layout = *args.grad_layout; | |||||
| // modify data type | |||||
| layout.modify_dtype_inplace(dtype::Float32()); | |||||
| return layout.span().dist_byte(); | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3); | |||||
| int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2), | |||||
| wo = args.diff_layout->operator[](3); | |||||
| int co = fm.group, ci = co, groups = co; | |||||
| int fh = fm.spatial[0], fw = fm.spatial[1]; | |||||
| int sh = fm.stride[0], sw = fm.stride[1]; | |||||
| int ph = fm.padding[0], pw = fm.padding[1]; | |||||
| int dh = param.dilate_h, dw = param.dilate_w; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = 0.f; | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| cutlass::conv::Conv2dProblemSize problem_size{ | |||||
| n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| groups, // groups | |||||
| }; | |||||
| cutlass::library::ConvolutionArguments conv_args{ | |||||
| problem_size, | |||||
| args.src_tensor->raw_ptr(), | |||||
| args.diff_tensor->raw_ptr(), | |||||
| nullptr, | |||||
| nullptr, | |||||
| args.workspace.raw_ptr, | |||||
| &alpha, | |||||
| &beta, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr}; | |||||
| cutlass_check(op->run(&conv_args, nullptr, stream)); | |||||
| after_kernel_launch(); | |||||
| auto&& typecvt = args.opr->handle()->create_operator<TypeCvt>(); | |||||
| auto f32_grad_layout = *args.grad_layout; | |||||
| // modify data type | |||||
| f32_grad_layout.modify_dtype_inplace(dtype::Float32()); | |||||
| TensorND src{args.workspace.raw_ptr, f32_grad_layout}, | |||||
| dst{args.grad_tensor->raw_ptr(), *args.grad_layout}; | |||||
| typecvt->exec(src, dst); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
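Unlike the dgrad kernels, the f16 wgrad kernel above accumulates the filter gradient in a float32 workspace and then narrows it back to f16 with a TypeCvt pass, which is why get_workspace_in_bytes returns the grad layout re-typed to Float32. A rough sketch of that sizing, assuming the channelwise filter layout (group, 1, 1, fh, fw); the helper and the numbers are illustrative:

    // Size of the f32 accumulation buffer used by the f16 wgrad algo above.
    static size_t wgrad_f32_workspace_bytes(size_t group, size_t fh, size_t fw) {
        // channelwise filter holds group * 1 * 1 * fh * fw elements,
        // e.g. group = 64, fh = fw = 31 -> 64 * 31 * 31 * 4 = 246016 bytes
        return group * fh * fw * sizeof(float);
    }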
| @@ -0,0 +1,135 @@ | |||||
| /** | |||||
| * \file | |||||
| * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
| * implied. | |||||
| */ | |||||
| #include "src/cuda/convolution/backward_filter/algo.h" | |||||
| #include "src/cuda/cutlass/singleton.h" | |||||
| #include "src/cuda/utils.h" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace cutlass::library; | |||||
| const void* ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm:: | |||||
| get_available_op(const SizeArgs& args) const { | |||||
| ConvolutionKey key{ | |||||
| cutlass::conv::Operator::kWgrad, | |||||
| NumericTypeID::kF32, // src tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // src tensor layout | |||||
| NumericTypeID::kF32, // diff tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // diff tensor layout | |||||
| NumericTypeID::kF32, // grad tensor data type | |||||
| LayoutTypeID::kTensorNCHW, // grad tensor layout | |||||
| NumericTypeID::kF32, // dummy argument, not used. | |||||
| LayoutTypeID::kTensorNCHW, // dummy argument, not used | |||||
| NumericTypeID::kF32, | |||||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||||
| m_algo_param.threadblock_m, | |||||
| m_algo_param.threadblock_n, | |||||
| m_algo_param.threadblock_k, | |||||
| m_algo_param.warp_m, | |||||
| m_algo_param.warp_n, | |||||
| m_algo_param.warp_k, | |||||
| 1, | |||||
| 1, | |||||
| 1, | |||||
| cutlass::epilogue::EpilogueType::kLinearCombination, // no bias | |||||
| m_algo_param.stage, | |||||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||||
| 1, | |||||
| 1, | |||||
| true}; | |||||
| return (void*)Singleton::get().operation_table.find_op(key); | |||||
| } | |||||
| bool ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available( | |||||
| const SizeArgs& args) const { | |||||
| #define RETURN_IF_FALSE(stmt_) \ | |||||
| if (!(stmt_)) \ | |||||
| return false; | |||||
| RETURN_IF_FALSE(is_compute_capability_required(6, 1)); | |||||
| RETURN_IF_FALSE( | |||||
| args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() && | |||||
| args.grad_layout->is_contiguous()); | |||||
| using Param = param::Convolution; | |||||
| using Format = Param::Format; | |||||
| using Sparse = Param::Sparse; | |||||
| using Mode = Param::Mode; | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| RETURN_IF_FALSE( | |||||
| param.format == Format::NCHW && | |||||
| args.src_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.diff_layout->dtype.enumv() == DTypeEnum::Float32 && | |||||
| args.grad_layout->dtype.enumv() == DTypeEnum::Float32); | |||||
| RETURN_IF_FALSE(param.sparse == Sparse::GROUP); | |||||
| RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION); | |||||
| // check if channelwise convolution | |||||
| RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1); | |||||
| RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1); | |||||
| const auto* op = get_available_op(args); | |||||
| RETURN_IF_FALSE(op != nullptr); | |||||
| return true; | |||||
| #undef RETURN_IF_FALSE | |||||
| } | |||||
| void ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec( | |||||
| const ExecArgs& args) const { | |||||
| auto&& param = args.opr->param(); | |||||
| auto&& fm = args.grad_filter_meta; | |||||
| int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3); | |||||
| int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2), | |||||
| wo = args.diff_layout->operator[](3); | |||||
| int co = fm.group, ci = co, groups = co; | |||||
| int fh = fm.spatial[0], fw = fm.spatial[1]; | |||||
| int sh = fm.stride[0], sw = fm.stride[1]; | |||||
| int ph = fm.padding[0], pw = fm.padding[1]; | |||||
| int dh = param.dilate_h, dw = param.dilate_w; | |||||
| // check if channelwise convolution | |||||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||||
| auto&& stream = cuda_stream(args.opr->handle()); | |||||
| float alpha = 1.f; | |||||
| float beta = 0.f; | |||||
| const Operation* op = (const Operation*)get_available_op(args); | |||||
| cutlass::conv::Conv2dProblemSize problem_size{ | |||||
| n, hi, wi, ci, co, fh, fw, ho, | |||||
| wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, | |||||
| 1, // split k slices, always 1 | |||||
| groups, // groups | |||||
| }; | |||||
| cutlass::library::ConvolutionArguments conv_args{ | |||||
| problem_size, | |||||
| args.src_tensor->raw_ptr(), | |||||
| args.diff_tensor->raw_ptr(), | |||||
| nullptr, | |||||
| nullptr, | |||||
| args.grad_tensor->raw_ptr(), | |||||
| &alpha, | |||||
| &beta, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr, | |||||
| nullptr}; | |||||
| cutlass_check(op->run(&conv_args, nullptr, stream)); | |||||
| after_kernel_launch(); | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | |||||
| @@ -0,0 +1,54 @@ | |||||
| /** | |||||
| * \file dnn/src/cuda/conv_bias/chanwise/bwd_large_filter.cu | |||||
| * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
| * | |||||
| * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, | |||||
| * software distributed under the License is distributed on an | |||||
| * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| */ | |||||
| #include "./kern.cuh" | |||||
| #include "./kern_helper.cuh" | |||||
| #include "cuda.h" | |||||
| #include "cuda_fp16.h" | |||||
| #include "src/cuda/convolution/chanwise/launch_config.cuh" | |||||
| #include "src/cuda/fp16_help.cuh" | |||||
| using namespace megdnn; | |||||
| using namespace cuda; | |||||
| using namespace convolution; | |||||
| using namespace chanwise; | |||||
| #include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh" | |||||
| namespace megdnn { | |||||
| namespace cuda { | |||||
| namespace convolution { | |||||
| namespace chanwise { | |||||
| // =====================================bwd===================================== | |||||
| template <> | |||||
| void run_bwd_depthwise_large_filter( | |||||
| float* dst, const float* src, const float* flt, const Param& param, | |||||
| cudaStream_t stream) { | |||||
| INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_BACKWARD) | |||||
| } | |||||
| #if CUDA_VERSION >= 9000 | |||||
| template <> | |||||
| void run_bwd_depthwise_large_filter( | |||||
| __half* dst, const __half* src, const __half* flt, const Param& param, | |||||
| cudaStream_t stream) { | |||||
| INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_BACKWARD) | |||||
| } | |||||
| #endif | |||||
| } // namespace chanwise | |||||
| } // namespace convolution | |||||
| } // namespace cuda | |||||
| } // namespace megdnn | |||||
| // vim: syntax=cuda.doxygen | |||||
| @@ -27,8 +27,10 @@ namespace chanwise { | |||||
| struct Param { | struct Param { | ||||
| uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, | uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, | ||||
| pad_w, stride_h, stride_w, dilation_h, dilation_w; | pad_w, stride_h, stride_w, dilation_h, dilation_w; | ||||
| bool is_compute_deafult; | |||||
| #if MEGDNN_CC_HOST | #if MEGDNN_CC_HOST | ||||
| static Param from_fwd_args(const ForwardSizeArgs& args) { | |||||
| static Param from_fwd_args( | |||||
| const ForwardSizeArgs& args, bool is_compute_deafult_ = true) { | |||||
| #define U(v) static_cast<uint32_t>(v) | #define U(v) static_cast<uint32_t>(v) | ||||
| auto&& src = args.src_layout->shape; | auto&& src = args.src_layout->shape; | ||||
| auto&& dst = args.dst_layout->shape; | auto&& dst = args.dst_layout->shape; | ||||
| @@ -42,11 +44,12 @@ struct Param { | |||||
| hw_pos = 1; | hw_pos = 1; | ||||
| } | } | ||||
| return { | return { | ||||
| U(src[0]), U(src[c_pos]), U(src[hw_pos]), | |||||
| U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), | |||||
| U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), | |||||
| U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), | |||||
| U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), | |||||
| U(src[0]), U(src[c_pos]), U(src[hw_pos]), | |||||
| U(src[hw_pos + 1]), U(fm.ocpg), U(fm.spatial[0]), | |||||
| U(fm.spatial[1]), U(dst[hw_pos]), U(dst[hw_pos + 1]), | |||||
| U(fm.padding[0]), U(fm.padding[1]), U(fm.stride[0]), | |||||
| U(fm.stride[1]), U(fm.dilation[0]), U(fm.dilation[1]), | |||||
| is_compute_deafult_, | |||||
| }; | }; | ||||
| #undef U | #undef U | ||||
| } | } | ||||
| @@ -63,6 +66,10 @@ void run_bwd_data( | |||||
| T* src_grad, const T* dst_grad, const T* flt, const Param& param, | T* src_grad, const T* dst_grad, const T* flt, const Param& param, | ||||
| cudaStream_t stream); | cudaStream_t stream); | ||||
| template <typename T> | |||||
| void run_bwd_depthwise_large_filter( | |||||
| T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream); | |||||
| template <typename T> | template <typename T> | ||||
| void run_bwd_filter( | void run_bwd_filter( | ||||
| T* filter_grad, const T* src, const T* dst_grad, const Param& param, | T* filter_grad, const T* src, const T* dst_grad, const Param& param, | ||||
| @@ -115,6 +115,22 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl:: | |||||
| const AlgoAttribute& negative_attr) { | const AlgoAttribute& negative_attr) { | ||||
| AlgoBase::SizeArgs args(this, filter, diff, grad); | AlgoBase::SizeArgs args(this, filter, diff, grad); | ||||
| //! choose for large kernel cases | |||||
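| //! the implicit batched GEMM (large-kernel) algos are preferred when the output | |||||
| //! spatial size is at most twice the filter size | |||||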
| size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1]; | |||||
| size_t ho = diff[2], wo = diff[3]; | |||||
| const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW && | |||||
| ho <= 2 * fh && wo <= 2 * fw; | |||||
| if (prefer_dnn_lk_implbmm) { | |||||
| #if CUDA_VERSION >= 10020 | |||||
| if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.implbmm_nchw_hmma[0]; | |||||
| #endif | |||||
| if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.implbmm_nchw_fma[0]; | |||||
| } | |||||
| if (args.filter_meta.group > 1 && | if (args.filter_meta.group > 1 && | ||||
| sm_algo_pack.chanwise.is_available_attribute( | sm_algo_pack.chanwise.is_available_attribute( | ||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | ||||
| @@ -242,6 +258,23 @@ ConvolutionBackwardFilterImpl::Algorithm* ConvolutionBackwardFilterImpl:: | |||||
| const AlgoAttribute& negative_attr) { | const AlgoAttribute& negative_attr) { | ||||
| AlgoBase::SizeArgs args(this, src, diff, grad); | AlgoBase::SizeArgs args(this, src, diff, grad); | ||||
| //! choose for large kernel cases | |||||
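| //! same large-kernel heuristic as backward data: prefer the implicit batched GEMM | |||||
| //! algos when ho <= 2 * fh and wo <= 2 * fw | |||||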
| size_t fh = args.grad_filter_meta.spatial[0], fw = args.grad_filter_meta.spatial[1]; | |||||
| size_t ho = diff[2], wo = diff[3]; | |||||
| const bool prefer_dnn_lk_implbmm = | |||||
| args.grad_filter_meta.format == Param::Format::NCHW && ho <= 2 * fh && | |||||
| wo <= 2 * fw; | |||||
| if (prefer_dnn_lk_implbmm) { | |||||
| #if CUDA_VERSION >= 10020 | |||||
| if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.implbmm_nchw_hmma[0]; | |||||
| #endif | |||||
| if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute( | |||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) | |||||
| return &sm_algo_pack.implbmm_nchw_fma[0]; | |||||
| } | |||||
| if (args.grad_filter_meta.group > 1 && | if (args.grad_filter_meta.group > 1 && | ||||
| sm_algo_pack.chanwise.is_available_attribute( | sm_algo_pack.chanwise.is_available_attribute( | ||||
| args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | args, positive_attr, negative_attr, workspace_limit_in_bytes)) { | ||||
| @@ -97,11 +97,14 @@ public: | |||||
| class AlgoMatmul; | class AlgoMatmul; | ||||
| class AlgoChanwise; | class AlgoChanwise; | ||||
| class AlgoChanwiseSmall; | class AlgoChanwiseSmall; | ||||
| class AlgoDepthwiseLargeFilter; | |||||
| class AlgoGroupConvGeneral; | class AlgoGroupConvGeneral; | ||||
| class AlgoBFloat16; | class AlgoBFloat16; | ||||
| class AlgoInt8NCHW4DotProdImplicitGemm; | class AlgoInt8NCHW4DotProdImplicitGemm; | ||||
| class AlgoInt8NCHWDotProdImplicitGemm; | class AlgoInt8NCHWDotProdImplicitGemm; | ||||
| class AlgoInt8NHWCIMMAImplicitGemm; | class AlgoInt8NHWCIMMAImplicitGemm; | ||||
| class AlgoFloat32NCHWFMAImplicitBatchedGemm; | |||||
| class AlgoFloat16NCHWHMMAImplicitBatchedGemm; | |||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -154,6 +157,8 @@ public: | |||||
| class AlgoChanwise; | class AlgoChanwise; | ||||
| class AlgoGroupConvGeneral; | class AlgoGroupConvGeneral; | ||||
| class AlgoBFloat16; | class AlgoBFloat16; | ||||
| class AlgoFloat32NCHWFMAImplicitBatchedGemm; | |||||
| class AlgoFloat16NCHWHMMAImplicitBatchedGemm; | |||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -135,6 +135,15 @@ namespace detail { | |||||
| template <typename EpilogueOp, epilogue::EpilogueType type> | template <typename EpilogueOp, epilogue::EpilogueType type> | ||||
| struct init_epilogue_param_; | struct init_epilogue_param_; | ||||
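| /// LinearCombination epilogue (no bias): only the alpha and beta scalars are read | |||||
| /// from the convolution arguments | |||||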
| template <typename EpilogueOp> | |||||
| struct init_epilogue_param_<EpilogueOp, epilogue::EpilogueType::kLinearCombination> { | |||||
| using ElementCompute = typename EpilogueOp::ElementCompute; | |||||
| typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) { | |||||
| return {*static_cast<ElementCompute const*>(conv_args->alpha), | |||||
| *static_cast<ElementCompute const*>(conv_args->beta)}; | |||||
| } | |||||
| }; | |||||
| template <typename EpilogueOp> | template <typename EpilogueOp> | ||||
| struct init_epilogue_param_< | struct init_epilogue_param_< | ||||
| EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> { | EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> { | ||||
| @@ -290,6 +299,159 @@ public: | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| /// We add a new template class to handle the convolution backward filter operation, | |||||
| /// because its device-level convolution operator differs from the others | |||||
| /// (convolution forward and convolution backward data). | |||||
| /// The description object is reused in this backward filter wrapper, however, because | |||||
| /// we do not want to introduce another unnecessary structure. | |||||
| /// TODO: maybe the device-level operators in cutlass for convolution forward, backward | |||||
| /// data and backward filter should be combined. | |||||
| template <typename Operator_> | |||||
| class ConvolutionBackwardFilterOperationBase : public Operation { | |||||
| public: | |||||
| using Operator = Operator_; | |||||
| using ElementSrc = typename Operator::ElementSrc; | |||||
| using LayoutSrc = typename Operator::LayoutSrc; | |||||
| using ElementDiff = typename Operator::ElementDiff; | |||||
| using LayoutDiff = typename Operator::LayoutDiff; | |||||
| using ElementGrad = typename Operator::ElementGrad; | |||||
| using LayoutGrad = typename Operator::LayoutGrad; | |||||
| using ElementAccumulator = typename Operator::ElementAccumulator; | |||||
| ConvolutionBackwardFilterOperationBase(char const* name = "unknown_convolution") { | |||||
| m_description.name = name; | |||||
| m_description.provider = Provider::kCUTLASS; | |||||
| m_description.kind = OperationKind::kConvolution; | |||||
| m_description.conv_op = Operator::kConvolutionalOperator; | |||||
| m_description.tile_description.threadblock_shape = make_Coord( | |||||
| Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN, | |||||
| Operator::ThreadblockShape::kK); | |||||
| m_description.tile_description.threadblock_stages = Operator::kStages; | |||||
| m_description.tile_description.warp_count = make_Coord( | |||||
| Operator::ConvolutionKernel::WarpCount::kM, | |||||
| Operator::ConvolutionKernel::WarpCount::kN, | |||||
| Operator::ConvolutionKernel::WarpCount::kK); | |||||
| m_description.tile_description.math_instruction.instruction_shape = make_Coord( | |||||
| Operator::InstructionShape::kM, Operator::InstructionShape::kN, | |||||
| Operator::InstructionShape::kK); | |||||
| m_description.tile_description.math_instruction.element_accumulator = | |||||
| NumericTypeMap<ElementAccumulator>::kId; | |||||
| m_description.tile_description.math_instruction.opcode_class = | |||||
| OpcodeClassMap<typename Operator::OperatorClass>::kId; | |||||
| m_description.tile_description.math_instruction.math_operation = | |||||
| MathOperationMap<typename Operator::Operator>::kId; | |||||
| m_description.tile_description.minimum_compute_capability = | |||||
| ArchMap<typename Operator::ArchTag, | |||||
| typename Operator::OperatorClass>::kMin; | |||||
| m_description.tile_description.maximum_compute_capability = | |||||
| ArchMap<typename Operator::ArchTag, | |||||
| typename Operator::OperatorClass>::kMax; | |||||
| /// src in description -> src in C++ template | |||||
| m_description.src = | |||||
| make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc); | |||||
| /// filter in description -> diff in C++ template | |||||
| m_description.filter = make_TensorDescription<ElementDiff, LayoutDiff>( | |||||
| Operator::kAlignmentDiff); | |||||
| /// dst in description -> grad in C++ template | |||||
| m_description.dst = make_TensorDescription<ElementGrad, LayoutGrad>( | |||||
| Operator::kAlignmentGrad); | |||||
| /// because the bias tensor is not used in the ConvolutionBackwardFilter operation, | |||||
| /// the following tensor description is a dummy argument | |||||
| m_description.bias = make_TensorDescription<ElementGrad, LayoutGrad>( | |||||
| Operator::kAlignmentGrad); | |||||
| m_description.convolution_type = Operator::kConvolutionType; | |||||
| m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId; | |||||
| m_description.epilogue_type = Operator::EpilogueOutputOp::kType; | |||||
| m_description.epilogue_count = Operator::EpilogueOutputOp::kCount; | |||||
| m_description.threadblock_swizzle = | |||||
| ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId; | |||||
| m_description.special_optimization = Operator::kSpecialOpt; | |||||
| m_description.gemm_mode = Operator::kGemmMode; | |||||
| /// the ConvolutionBackwardFilter operation is only used for depthwise convolution, | |||||
| /// so the option without_shared_load is always true | |||||
| m_description.without_shared_load = true; | |||||
| } | |||||
| virtual OperationDescription const& description() const { return m_description; } | |||||
| protected: | |||||
| ConvolutionDescription m_description; | |||||
| }; | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| template <typename Operator_> | |||||
| class ConvolutionBackwardFilterOperation | |||||
| : public ConvolutionBackwardFilterOperationBase<Operator_> { | |||||
| public: | |||||
| using Operator = Operator_; | |||||
| using ElementSrc = typename Operator::ElementSrc; | |||||
| using LayoutSrc = typename Operator::LayoutSrc; | |||||
| using ElementDiff = typename Operator::ElementDiff; | |||||
| using LayoutDiff = typename Operator::LayoutDiff; | |||||
| using ElementGrad = typename Operator::ElementGrad; | |||||
| using LayoutGrad = typename Operator::LayoutGrad; | |||||
| using ElementAccumulator = typename Operator::ElementAccumulator; | |||||
| using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; | |||||
| using OperatorArguments = typename Operator::Arguments; | |||||
| ConvolutionBackwardFilterOperation(char const* name = "unknown_gemm") | |||||
| : ConvolutionBackwardFilterOperationBase<Operator_>(name) {} | |||||
| virtual Status run( | |||||
| void const* arguments_ptr, void* device_workspace = nullptr, | |||||
| cudaStream_t stream = nullptr) const { | |||||
| cutlass::conv::Operator conv_op = this->m_description.conv_op; | |||||
| ConvolutionArguments const* conv_args = | |||||
| reinterpret_cast<ConvolutionArguments const*>(arguments_ptr); | |||||
| const auto& ps = conv_args->problem_size; | |||||
| OperatorArguments args; | |||||
| args.problem_size = ps; | |||||
| /// src in convolution arguments -> ref_src | |||||
| args.ref_src = { | |||||
| static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)), | |||||
| LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))}; | |||||
| /// filter in convolution arguments -> ref_diff | |||||
| args.ref_diff = { | |||||
| static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)), | |||||
| LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))}; | |||||
| /// dst in convolution arguments -> ref_grad | |||||
| args.ref_grad = { | |||||
| static_cast<ElementGrad*>(conv_args->dst), | |||||
| LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))}; | |||||
| args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get( | |||||
| conv_args); | |||||
| Operator op; | |||||
| Status status = op.initialize(args, device_workspace); | |||||
| if (status != Status::kSuccess) { | |||||
| return status; | |||||
| } | |||||
| return op.run(stream); | |||||
| } | |||||
| }; | |||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
| } // namespace library | } // namespace library | ||||
| } // namespace cutlass | } // namespace cutlass | ||||
| @@ -44,6 +44,11 @@ namespace cutlass { | |||||
| namespace library { | namespace library { | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #if ((__CUDACC_VER_MAJOR__ > 10) || \ | |||||
| (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1)) | |||||
| #define CUTLASS_ARCH_MMA_SM70_SUPPORTED 1 | |||||
| #endif | |||||
| #if ((__CUDACC_VER_MAJOR__ > 10) || \ | #if ((__CUDACC_VER_MAJOR__ > 10) || \ | ||||
| (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) | (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) | ||||
| #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 | #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 | ||||
| @@ -54,8 +59,16 @@ namespace library { | |||||
| void initialize_all_gemm_simt_operations(Manifest& manifest); | void initialize_all_gemm_simt_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_simt_operations(Manifest& manifest); | void initialize_all_conv2d_simt_operations(Manifest& manifest); | ||||
| void initialize_all_deconv_simt_operations(Manifest& manifest); | void initialize_all_deconv_simt_operations(Manifest& manifest); | ||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||||
| void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | ||||
| void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_dgrad_tensorop884_operations(Manifest& manifest); | |||||
| void initialize_all_dwconv2d_wgrad_tensorop884_operations(Manifest& manifest); | |||||
| #endif | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | void initialize_all_gemm_tensorop1688_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest); | ||||
| void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest); | ||||
| @@ -66,8 +79,16 @@ void initialize_all(Manifest& manifest) { | |||||
| initialize_all_gemm_simt_operations(manifest); | initialize_all_gemm_simt_operations(manifest); | ||||
| initialize_all_conv2d_simt_operations(manifest); | initialize_all_conv2d_simt_operations(manifest); | ||||
| initialize_all_deconv_simt_operations(manifest); | initialize_all_deconv_simt_operations(manifest); | ||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| initialize_all_dwconv2d_fprop_simt_operations(manifest); | |||||
| initialize_all_dwconv2d_dgrad_simt_operations(manifest); | |||||
| initialize_all_dwconv2d_wgrad_simt_operations(manifest); | |||||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||||
| initialize_all_gemm_tensorop884_operations(manifest); | initialize_all_gemm_tensorop884_operations(manifest); | ||||
| initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); | |||||
| initialize_all_dwconv2d_dgrad_tensorop884_operations(manifest); | |||||
| initialize_all_dwconv2d_wgrad_tensorop884_operations(manifest); | |||||
| #endif | |||||
| #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED | |||||
| initialize_all_gemm_tensorop1688_operations(manifest); | initialize_all_gemm_tensorop1688_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8816_operations(manifest); | initialize_all_conv2d_tensorop8816_operations(manifest); | ||||
| initialize_all_conv2d_tensorop8832_operations(manifest); | initialize_all_conv2d_tensorop8832_operations(manifest); | ||||
| @@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID { | |||||
| kConvolutionFpropTrans, | kConvolutionFpropTrans, | ||||
| kConvolutionDgradNCxHWx, | kConvolutionDgradNCxHWx, | ||||
| kConvolutionDgradTrans, | kConvolutionDgradTrans, | ||||
| kDepthwiseConvolutionFprop, | |||||
| kDepthwiseConvolutionDgrad, | |||||
| kDepthwiseConvolutionWgrad, | |||||
| kInvalid | kInvalid | ||||
| }; | }; | ||||
| @@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap< | |||||
| ThreadblockSwizzleID::kConvolutionDgradTrans; | ThreadblockSwizzleID::kConvolutionDgradTrans; | ||||
| }; | }; | ||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionFprop; | |||||
| }; | |||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionDgrad; | |||||
| }; | |||||
| template <> | |||||
| struct ThreadblockSwizzleMap< | |||||
| conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> { | |||||
| static ThreadblockSwizzleID const kId = | |||||
| ThreadblockSwizzleID::kDepthwiseConvolutionWgrad; | |||||
| }; | |||||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template <typename Element, typename Layout> | template <typename Element, typename Layout> | ||||
| @@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) | |||||
| key.layout_dst = desc.dst.layout; | key.layout_dst = desc.dst.layout; | ||||
| key.element_bias = desc.bias.element; | key.element_bias = desc.bias.element; | ||||
| key.layout_bias = desc.bias.layout; | key.layout_bias = desc.bias.layout; | ||||
| key.element_accumulator = | |||||
| desc.tile_description.math_instruction.element_accumulator; | |||||
| key.convolution_type = desc.convolution_type; | key.convolution_type = desc.convolution_type; | ||||
| @@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) | |||||
| key.stages = desc.tile_description.threadblock_stages; | key.stages = desc.tile_description.threadblock_stages; | ||||
| key.special_optimization = desc.special_optimization; | key.special_optimization = desc.special_optimization; | ||||
| key.alignment_src = desc.src.alignment; | |||||
| key.alignment_filter = desc.filter.alignment; | |||||
| key.without_shared_load = desc.without_shared_load; | key.without_shared_load = desc.without_shared_load; | ||||
| return key; | return key; | ||||
| @@ -188,6 +188,7 @@ struct ConvolutionKey { | |||||
| library::LayoutTypeID layout_dst; | library::LayoutTypeID layout_dst; | ||||
| library::NumericTypeID element_bias; | library::NumericTypeID element_bias; | ||||
| library::LayoutTypeID layout_bias; | library::LayoutTypeID layout_bias; | ||||
| NumericTypeID element_accumulator; | |||||
| conv::ConvType convolution_type; | conv::ConvType convolution_type; | ||||
| @@ -206,6 +207,10 @@ struct ConvolutionKey { | |||||
| epilogue::EpilogueType epilogue_type; | epilogue::EpilogueType epilogue_type; | ||||
| int stages; | int stages; | ||||
| conv::SpecialOptimizeDesc special_optimization; | conv::SpecialOptimizeDesc special_optimization; | ||||
| int alignment_src; | |||||
| int alignment_filter; | |||||
| bool without_shared_load; | bool without_shared_load; | ||||
| inline bool operator==(ConvolutionKey const& rhs) const { | inline bool operator==(ConvolutionKey const& rhs) const { | ||||
| @@ -215,6 +220,7 @@ struct ConvolutionKey { | |||||
| (layout_filter == rhs.layout_filter) && | (layout_filter == rhs.layout_filter) && | ||||
| (element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) && | (element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) && | ||||
| (element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) && | (element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) && | ||||
| (element_accumulator == rhs.element_accumulator) && | |||||
| (convolution_type == rhs.convolution_type) && | (convolution_type == rhs.convolution_type) && | ||||
| (threadblock_shape_m == rhs.threadblock_shape_m) && | (threadblock_shape_m == rhs.threadblock_shape_m) && | ||||
| (threadblock_shape_n == rhs.threadblock_shape_n) && | (threadblock_shape_n == rhs.threadblock_shape_n) && | ||||
| @@ -227,6 +233,8 @@ struct ConvolutionKey { | |||||
| (instruction_shape_k == rhs.instruction_shape_k) && | (instruction_shape_k == rhs.instruction_shape_k) && | ||||
| (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) && | ||||
| (special_optimization == rhs.special_optimization) && | (special_optimization == rhs.special_optimization) && | ||||
| (alignment_src == rhs.alignment_src) && | |||||
| (alignment_filter == rhs.alignment_filter) && | |||||
| (without_shared_load == rhs.without_shared_load); | (without_shared_load == rhs.without_shared_load); | ||||
| } | } | ||||
| @@ -254,6 +262,7 @@ struct ConvolutionKey { | |||||
| "\n layout_dst: " + to_string(layout_dst) + | "\n layout_dst: " + to_string(layout_dst) + | ||||
| "\n element_bias: " + to_string(element_bias) + | "\n element_bias: " + to_string(element_bias) + | ||||
| "\n layout_bias: " + to_string(layout_bias) + | "\n layout_bias: " + to_string(layout_bias) + | ||||
| "\n element_accumulator: " + to_string(element_accumulator) + | |||||
| "\n convolution_type: " + to_string(convolution_type) + | "\n convolution_type: " + to_string(convolution_type) + | ||||
| "\n threadblock_shape: " + threadblock_shape_str + | "\n threadblock_shape: " + threadblock_shape_str + | ||||
| "\n warp_shape: " + warp_shape_str + | "\n warp_shape: " + warp_shape_str + | ||||
| @@ -261,6 +270,8 @@ struct ConvolutionKey { | |||||
| "\n epilogue_type: " + to_string(epilogue_type) + | "\n epilogue_type: " + to_string(epilogue_type) + | ||||
| "\n stages: " + std::to_string(stages) + | "\n stages: " + std::to_string(stages) + | ||||
| "\n special_optimization: " + to_string(special_optimization) + | "\n special_optimization: " + to_string(special_optimization) + | ||||
| "\n alignment_src: " + std::to_string(alignment_src) + | |||||
| "\n alignment_filter: " + std::to_string(alignment_filter) + | |||||
| "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; | "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; | ||||
| } | } | ||||
| }; | }; | ||||
| @@ -268,7 +279,6 @@ struct ConvolutionKey { | |||||
| struct ConvolutionKeyHasher { | struct ConvolutionKeyHasher { | ||||
| inline size_t operator()(ConvolutionKey const& key) const { | inline size_t operator()(ConvolutionKey const& key) const { | ||||
| return Hash() | return Hash() | ||||
| .update(&key.conv_op, sizeof(key.conv_op)) | |||||
| .update(&key.conv_op, sizeof(key.conv_op)) | .update(&key.conv_op, sizeof(key.conv_op)) | ||||
| .update(&key.element_src, sizeof(key.element_src)) | .update(&key.element_src, sizeof(key.element_src)) | ||||
| .update(&key.layout_src, sizeof(key.layout_src)) | .update(&key.layout_src, sizeof(key.layout_src)) | ||||
| @@ -278,6 +288,7 @@ struct ConvolutionKeyHasher { | |||||
| .update(&key.layout_dst, sizeof(key.layout_dst)) | .update(&key.layout_dst, sizeof(key.layout_dst)) | ||||
| .update(&key.element_bias, sizeof(key.element_bias)) | .update(&key.element_bias, sizeof(key.element_bias)) | ||||
| .update(&key.layout_bias, sizeof(key.layout_bias)) | .update(&key.layout_bias, sizeof(key.layout_bias)) | ||||
| .update(&key.element_accumulator, sizeof(key.element_accumulator)) | |||||
| .update(&key.convolution_type, sizeof(key.convolution_type)) | .update(&key.convolution_type, sizeof(key.convolution_type)) | ||||
| .update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m)) | .update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m)) | ||||
| .update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n)) | .update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n)) | ||||
| @@ -291,6 +302,8 @@ struct ConvolutionKeyHasher { | |||||
| .update(&key.epilogue_type, sizeof(key.epilogue_type)) | .update(&key.epilogue_type, sizeof(key.epilogue_type)) | ||||
| .update(&key.stages, sizeof(key.stages)) | .update(&key.stages, sizeof(key.stages)) | ||||
| .update(&key.special_optimization, sizeof(key.special_optimization)) | .update(&key.special_optimization, sizeof(key.special_optimization)) | ||||
| .update(&key.alignment_src, sizeof(key.alignment_src)) | |||||
| .update(&key.alignment_filter, sizeof(key.alignment_filter)) | |||||
| .update(&key.without_shared_load, sizeof(key.without_shared_load)) | .update(&key.without_shared_load, sizeof(key.without_shared_load)) | ||||
| .digest(); | .digest(); | ||||
| } | } | ||||
| @@ -1322,6 +1322,8 @@ static struct { | |||||
| {"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution}, | {"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution}, | ||||
| {"local", "Local", conv::ConvType::kLocal}, | {"local", "Local", conv::ConvType::kLocal}, | ||||
| {"local_share", "LocalShare", conv::ConvType::kLocalShare}, | {"local_share", "LocalShare", conv::ConvType::kLocalShare}, | ||||
| {"depthwise_convolution", "DepthwiseConvolution", | |||||
| conv::ConvType::kDepthwiseConvolution}, | |||||
| }; | }; | ||||
| /// Converts a ConvType enumerant to a string | /// Converts a ConvType enumerant to a string | ||||
| @@ -45,6 +45,21 @@ fma2(const __half2 a, const __half2 b, const __half2 c) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| __device__ __forceinline__ __half2 hadd2(const __half2 a, const __half2 b) { | |||||
| #if __CUDA_ARCH__ >= 530 | |||||
| return __hadd2(a, b); | |||||
| #else | |||||
| return {__float2half(__half2float(a.x) + __half2float(b.x)), | |||||
| __float2half(__half2float(a.y) + __half2float(b.y))}; | |||||
| #endif | |||||
| } | |||||
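| // mixed-precision overload: __half2 operands accumulated into float2, keeping the | |||||
| // partial sum in fp32 | |||||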
| __device__ __forceinline__ float2 | |||||
| fma2(const __half2 a, const __half2 b, const float2 c) { | |||||
| return {__half2float(a.x) * __half2float(b.x) + c.x, | |||||
| __half2float(a.y) * __half2float(b.y) + c.y}; | |||||
| } | |||||
| #endif // CUDA_VERSION >= 9000 | #endif // CUDA_VERSION >= 9000 | ||||
| } // namespace cuda | } // namespace cuda | ||||
| @@ -44,7 +44,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() { | |||||
| for (auto&& algo : simt_float32_gemv_batched_strided) { | for (auto&& algo : simt_float32_gemv_batched_strided) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| for (auto&& algo : tensorop_float16) { | for (auto&& algo : tensorop_float16) { | ||||
| all_algos.push_back(&algo); | all_algos.push_back(&algo); | ||||
| } | } | ||||
| @@ -113,21 +113,26 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() { | |||||
| simt_float32_gemv_batched_strided.emplace_back(128); | simt_float32_gemv_batched_strided.emplace_back(128); | ||||
| simt_float32_gemv_batched_strided.emplace_back(64); | simt_float32_gemv_batched_strided.emplace_back(64); | ||||
| simt_float32_gemv_batched_strided.emplace_back(32); | simt_float32_gemv_batched_strided.emplace_back(32); | ||||
| #define FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| #define FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 8, 8, 4); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 8, 8, 4); | |||||
| #define FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) \ | |||||
| cb(256, 128, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 256, 32, 64, 64, 32, 16, 8, 8); \ | |||||
| cb(128, 128, 32, 64, 64, 32, 16, 8, 8); | cb(128, 128, 32, 64, 64, 32, 16, 8, 8); | ||||
| #define cb(...) \ | #define cb(...) \ | ||||
| tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ | tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \ | ||||
| tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); | tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__}); | ||||
| #if CUDA_VERSION >= 10010 | |||||
| FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) | |||||
| #endif | |||||
| #if CUDA_VERSION >= 10020 | #if CUDA_VERSION >= 10020 | ||||
| FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) | |||||
| FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) | |||||
| #endif | #endif | ||||
| #undef cb | #undef cb | ||||
| #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES | |||||
| #undef FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES | |||||
| #undef FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -350,7 +350,7 @@ private: | |||||
| std::string m_name; | std::string m_name; | ||||
| }; | }; | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| class MatrixMulForwardImpl::AlgoFloat16TensorOp final | class MatrixMulForwardImpl::AlgoFloat16TensorOp final | ||||
| : public AlgoCutlassMatrixMulBase { | : public AlgoCutlassMatrixMulBase { | ||||
| public: | public: | ||||
| @@ -418,7 +418,7 @@ public: | |||||
| std::vector<AlgoFloat32SIMT> simt_float32; | std::vector<AlgoFloat32SIMT> simt_float32; | ||||
| std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; | std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k; | ||||
| std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided; | std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided; | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| std::vector<AlgoFloat16TensorOp> tensorop_float16; | std::vector<AlgoFloat16TensorOp> tensorop_float16; | ||||
| std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k; | std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k; | ||||
| #endif | #endif | ||||
| @@ -15,7 +15,7 @@ | |||||
| #include "src/cuda/matrix_mul/algos.h" | #include "src/cuda/matrix_mul/algos.h" | ||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -15,7 +15,7 @@ | |||||
| #include "src/cuda/matrix_mul/algos.h" | #include "src/cuda/matrix_mul/algos.h" | ||||
| #include "src/cuda/utils.h" | #include "src/cuda/utils.h" | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| using namespace megdnn; | using namespace megdnn; | ||||
| using namespace cuda; | using namespace cuda; | ||||
| @@ -46,8 +46,10 @@ public: | |||||
| class AlgoFloat32SIMT; | class AlgoFloat32SIMT; | ||||
| class AlgoFloat32SIMTSplitK; | class AlgoFloat32SIMTSplitK; | ||||
| class AlgoFloat32SIMTGemvBatchedStrided; | class AlgoFloat32SIMTGemvBatchedStrided; | ||||
| #if CUDA_VERSION >= 10010 | |||||
| class AlgoFloat16TensorOp; | class AlgoFloat16TensorOp; | ||||
| class AlgoFloat16TensorOpSplitK; | class AlgoFloat16TensorOpSplitK; | ||||
| #endif | |||||
| #endif | #endif | ||||
| class AlgoPack; | class AlgoPack; | ||||
| @@ -35,6 +35,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { | |||||
| param().padding_val, stream); \ | param().padding_val, stream); \ | ||||
| } | } | ||||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | ||||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||||
| #undef cb | #undef cb | ||||
| } | } | ||||
| @@ -60,7 +60,8 @@ __global__ void paddingConst_kernel( | |||||
| params.src_stride[dim].divisor(); | params.src_stride[dim].divisor(); | ||||
| */ | */ | ||||
| } | } | ||||
| dst[out_index] = in_src_valid_area ? src[in_index] : padding_val; | |||||
| dst[out_index] = | |||||
| in_src_valid_area ? src[in_index] : static_cast<T>(padding_val); | |||||
| } | } | ||||
| } | } | ||||
| @@ -256,6 +257,7 @@ void padding_backward_proxy( | |||||
| const float_t padding_val, cudaStream_t stream); | const float_t padding_val, cudaStream_t stream); | ||||
| #define cb(DType) INST(typename DTypeTrait<DType>::ctype) | #define cb(DType) INST(typename DTypeTrait<DType>::ctype) | ||||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | ||||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||||
| #undef cb | #undef cb | ||||
| #undef INST | #undef INST | ||||
| @@ -171,7 +171,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { | |||||
| switch (param().padding_mode) { | switch (param().padding_mode) { | ||||
| case param::Padding::PaddingMode::CONSTANT: | case param::Padding::PaddingMode::CONSTANT: | ||||
| #define cb(DType) \ | #define cb(DType) \ | ||||
| if (src.layout.dtype == DType()) { \ | |||||
| if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \ | |||||
| using T = typename DTypeTrait<DType>::ctype; \ | using T = typename DTypeTrait<DType>::ctype; \ | ||||
| MEGDNN_DISPATCH_CPU_KERN_OPR(exec_const_internal<T>( \ | MEGDNN_DISPATCH_CPU_KERN_OPR(exec_const_internal<T>( \ | ||||
| src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params, \ | src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params, \ | ||||
| @@ -179,28 +179,31 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { | |||||
| return; \ | return; \ | ||||
| } | } | ||||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | ||||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||||
| #undef cb | #undef cb | ||||
| break; | break; | ||||
| case param::Padding::PaddingMode::REPLICATE: | case param::Padding::PaddingMode::REPLICATE: | ||||
| #define cb(DType) \ | #define cb(DType) \ | ||||
| if (src.layout.dtype == DType()) { \ | |||||
| if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \ | |||||
| using T = typename DTypeTrait<DType>::ctype; \ | using T = typename DTypeTrait<DType>::ctype; \ | ||||
| MEGDNN_DISPATCH_CPU_KERN_OPR(exec_replicate_internal<T>( \ | MEGDNN_DISPATCH_CPU_KERN_OPR(exec_replicate_internal<T>( \ | ||||
| src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \ | src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \ | ||||
| return; \ | return; \ | ||||
| } | } | ||||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | ||||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||||
| #undef cb | #undef cb | ||||
| break; | break; | ||||
| case param::Padding::PaddingMode::REFLECT: | case param::Padding::PaddingMode::REFLECT: | ||||
| #define cb(DType) \ | #define cb(DType) \ | ||||
| if (src.layout.dtype == DType()) { \ | |||||
| if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) { \ | |||||
| using T = typename DTypeTrait<DType>::ctype; \ | using T = typename DTypeTrait<DType>::ctype; \ | ||||
| MEGDNN_DISPATCH_CPU_KERN_OPR(exec_reflect_internal<T>( \ | MEGDNN_DISPATCH_CPU_KERN_OPR(exec_reflect_internal<T>( \ | ||||
| src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \ | src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \ | ||||
| return; \ | return; \ | ||||
| } | } | ||||
| MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | MEGDNN_FOREACH_COMPUTING_DTYPE(cb) | ||||
| MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) | |||||
| #undef cb | #undef cb | ||||
| break; | break; | ||||
| default: | default: | ||||
| @@ -569,6 +569,7 @@ public: | |||||
| }); | }); | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| megdnn_assert(false, "Expected algo not found: %s\n", policy_name.name.c_str()); | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| @@ -20,6 +20,7 @@ | |||||
| #include "test/common/workspace_wrapper.h" | #include "test/common/workspace_wrapper.h" | ||||
| #include "test/cuda/benchmark.h" | #include "test/cuda/benchmark.h" | ||||
| #include "test/cuda/fixture.h" | #include "test/cuda/fixture.h" | ||||
| #include "test/cuda/utils.h" | |||||
| #include <cuda_profiler_api.h> | #include <cuda_profiler_api.h> | ||||
| #include <cuda_runtime_api.h> | #include <cuda_runtime_api.h> | ||||
| @@ -38,8 +39,10 @@ bool check_need_full_bench() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| Convolution::Param gconv_param(Convolution::Param p) { | |||||
| Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) { | |||||
| p.sparse = Convolution::Param::Sparse::GROUP; | p.sparse = Convolution::Param::Sparse::GROUP; | ||||
| if (io16xc32) | |||||
| p.compute_mode = Convolution::Param::ComputeMode::FLOAT32; | |||||
| return p; | return p; | ||||
| } | } | ||||
| @@ -421,6 +424,209 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) { | |||||
| } | } | ||||
| } | } | ||||
| namespace { | |||||
| template <typename Op> | |||||
| struct AlgoCheckerMaker { | |||||
| static auto make(const char* name, bool* require_algo) { | |||||
| return AlgoChecker<Op>(name, require_algo); | |||||
| } | |||||
| }; | |||||
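| // ConvolutionForward is dispatched through the DEFAULT algorithm, which forwards to | |||||
| // the ConvBias algo of the same name, so the checker wraps the name accordingly | |||||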
| template <> | |||||
| struct AlgoCheckerMaker<ConvolutionForward> { | |||||
| static auto make(const char* name, bool* require_algo) { | |||||
| return AlgoChecker<ConvolutionForward>( | |||||
| ExecutionPolicyAlgoName{ | |||||
| "DEFAULT", | |||||
| {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||||
| name, {}) | |||||
| .c_str(), | |||||
| {}}}}, | |||||
| require_algo); | |||||
| } | |||||
| }; | |||||
| template <typename Op> | |||||
| void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) { | |||||
| Checker<Op> checker(handle); | |||||
| bool require_algo = false; | |||||
| checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo)); | |||||
| checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type); | |||||
| bool io16xc32 = false; | |||||
| if (io_type == dtype::Float16()) { | |||||
| if (comp_type == dtype::Float16()) { | |||||
| checker.set_epsilon(1e-1); | |||||
| } else { | |||||
| io16xc32 = true; | |||||
| } | |||||
| } | |||||
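| // pure fp16 compute needs a looser epsilon; fp16 I/O with fp32 compute (io16xc32) | |||||
| // keeps the default tolerance and is requested later via gconv_param's compute_mode | |||||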
| // dispatch testcase by operation | |||||
| if (std::is_same<Op, ConvolutionForward>::value) { | |||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}}); | |||||
| } else if (std::is_same<Op, ConvolutionBackwardData>::value) { | |||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}}); | |||||
| } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) { | |||||
| // align 8 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {8, 2, 16, 16}, {2, 1, 1, 15, 15}}); | |||||
| // align 1 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 15, 15}, {8, 2, 15, 15}, {2, 1, 1, 15, 15}}); | |||||
| // align 2 | |||||
| checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {8, 2, 14, 14}, {2, 1, 1, 15, 15}}); | |||||
| // custom padding | |||||
| checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32)) | |||||
| .execs({{8, 2, 16, 16}, {8, 2, 8, 8}, {2, 1, 1, 15, 15}}); | |||||
| // custom stride | |||||
| checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32)) | |||||
| .execs({{8, 2, 14, 14}, {8, 2, 7, 7}, {2, 1, 1, 15, 15}}); | |||||
| } | |||||
| } | |||||
| } // namespace | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \ | |||||
| cb(1, 128, 128, 8, 32, 64, 8); \ | |||||
| cb(2, 128, 64, 8, 64, 32, 8); \ | |||||
| cb(3, 128, 32, 8, 64, 32, 8); \ | |||||
| cb(4, 64, 128, 8, 32, 64, 8); \ | |||||
| cb(5, 32, 128, 8, 32, 64, 8); \ | |||||
| cb(6, 64, 64, 8, 32, 64, 8); \ | |||||
| cb(7, 32, 64, 8, 32, 64, 8); \ | |||||
| cb(8, 32, 32, 8, 32, 32, 8); \ | |||||
| cb(9, 64, 32, 8, 64, 32, 8); | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) { \ | |||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) { \ | |||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionBackwardData>( \ | |||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_FMA_##tag) { \ | |||||
| require_compute_capability(6, 1); \ | |||||
| check_chanwise<ConvolutionBackwardFilter>( \ | |||||
| dtype::Float32(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL | |||||
| #if CUDA_VERSION >= 10010 | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \ | |||||
| cb(1, 128, 128, 32, 32, 32, 32); \ | |||||
| cb(2, 128, 256, 32, 64, 64, 32); \ | |||||
| cb(3, 128, 64, 32, 32, 32, 32); \ | |||||
| cb(4, 64, 128, 32, 32, 32, 32); \ | |||||
| cb(5, 64, 64, 32, 32, 32, 32); | |||||
| #else | |||||
| // hmma instructions need CUDA version >= 10.1; disable the hmma testcases in this path | |||||
| #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #endif | |||||
| // check both pure fp16 compute and fp16 I/O with fp32 compute (io16xc32) | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) { \ | |||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float16(), dtype::Float16(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| check_chanwise<ConvolutionForward>( \ | |||||
| dtype::Float16(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \ | |||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionBackwardData>( \ | |||||
| dtype::Float16(), dtype::Float16(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \ | |||||
| check_chanwise<ConvolutionBackwardData>( \ | |||||
| dtype::Float16(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #define cb(tag, tbm, tbn, tbk, wm, wn, wk) \ | |||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_HMMA_##tag) { \ | |||||
| require_compute_capability(7, 0); \ | |||||
| check_chanwise<ConvolutionBackwardFilter>( \ | |||||
| dtype::Float16(), dtype::Float32(), handle_cuda(), \ | |||||
| "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \ | |||||
| "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage"); \ | |||||
| } | |||||
| MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) | |||||
| #undef cb | |||||
| #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL | |||||
| #if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
| TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) { | ||||
| auto handle = handle_cuda(); | auto handle = handle_cuda(); | ||||
| @@ -1123,6 +1329,228 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) { | |||||
| // clang-format on | // clang-format on | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) { | |||||
| CUBenchmarker<ConvolutionForward> bencher(handle_cuda()); | |||||
| size_t RUNS = 100; | |||||
| bencher.set_display(false).set_times(RUNS); | |||||
| std::unique_ptr<OprProxy<ConvolutionForward>> proxy{ | |||||
| new OprProxy<ConvolutionForward>{true}}; | |||||
| bencher.set_proxy(proxy); | |||||
| Convolution::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| param.sparse = Convolution::Param::Sparse::GROUP; | |||||
| NormalRNG rng; | |||||
| auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) { | |||||
| param.pad_h = f / 2; | |||||
| param.pad_w = f / 2; | |||||
| param.stride_h = s; | |||||
| param.stride_w = s; | |||||
| param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}; | |||||
| TensorLayout dst_layout; | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout( | |||||
| {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout); | |||||
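| // total element count in Gi, pre-scaled by 1e3 so that multiplying by bytes per | |||||
| // element and dividing by the time in ms below yields GB/s | |||||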
| float bandwith = static_cast<float>( | |||||
| src.total_nr_elems() + filter.total_nr_elems() + | |||||
| dst_layout.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| bencher.proxy()->target_execution_policy.algo.reset(); | |||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| bencher.set_param(param); | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | |||||
| "float16: %.2fms %.2fGB/s " | |||||
| "pseudo float16: %.2fms %.2fGB/s " | |||||
| "speedup: " | |||||
| "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n", | |||||
| s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32, | |||||
| bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16, | |||||
| bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16, | |||||
| bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16, | |||||
| time_in_ms_pseudo_fp16 / time_in_ms_fp16); | |||||
| }; | |||||
| // clang-format off | |||||
| for (size_t b : {32, 64}) | |||||
| for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) { | |||||
| run(b, 384, 32, 32, f, 1); | |||||
| run(b, 384, 64, 64, f, 1); | |||||
| } | |||||
| // clang-format on | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) { | |||||
| CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda()); | |||||
| size_t RUNS = 100; | |||||
| bencher.set_display(false).set_times(RUNS); | |||||
| std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{ | |||||
| new OprProxy<ConvolutionBackwardData>{true}}; | |||||
| bencher.set_proxy(proxy); | |||||
| Convolution::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| param.sparse = Convolution::Param::Sparse::GROUP; | |||||
| NormalRNG rng; | |||||
| auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) { | |||||
| param.pad_h = f / 2; | |||||
| param.pad_w = f / 2; | |||||
| param.stride_h = s; | |||||
| param.stride_w = s; | |||||
| param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}; | |||||
| TensorLayout dst_layout; | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout( | |||||
| {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout); | |||||
| float bandwith = static_cast<float>( | |||||
| src.total_nr_elems() + filter.total_nr_elems() + | |||||
| dst_layout.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS; | |||||
| bencher.proxy()->target_execution_policy.algo.reset(); | |||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| bencher.set_param(param); | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | |||||
| "float16: %.2fms %.2fGB/s " | |||||
| "pseudo float16: %.2fms %.2fGB/s " | |||||
| "speedup: " | |||||
| "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n", | |||||
| s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32, | |||||
| bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16, | |||||
| bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16, | |||||
| bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16, | |||||
| time_in_ms_pseudo_fp16 / time_in_ms_fp16); | |||||
| }; | |||||
| // clang-format off | |||||
| for (size_t b : {32, 64}) | |||||
| for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) { | |||||
| run(b, 384, 32, 32, f, 1); | |||||
| run(b, 384, 64, 64, f, 1); | |||||
| } | |||||
| // clang-format on | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_FILTER_LARGE_KERNEL) { | |||||
| CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda()); | |||||
| size_t RUNS = 100; | |||||
| bencher.set_display(false).set_times(RUNS); | |||||
| std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | |||||
| new OprProxy<ConvolutionBackwardFilter>{true}}; | |||||
| bencher.set_proxy(proxy); | |||||
| Convolution::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| param.sparse = Convolution::Param::Sparse::GROUP; | |||||
| NormalRNG rng; | |||||
| auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) { | |||||
| param.pad_h = f / 2; | |||||
| param.pad_w = f / 2; | |||||
| param.stride_h = s; | |||||
| param.stride_w = s; | |||||
| param.compute_mode = param::Convolution::ComputeMode::DEFAULT; | |||||
| TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f}; | |||||
| TensorLayout dst_layout; | |||||
| auto opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout( | |||||
| {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout); | |||||
| float bandwith = static_cast<float>( | |||||
| src.total_nr_elems() + filter.total_nr_elems() + | |||||
| dst_layout.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| auto time_in_ms_fp32 = bencher.execs({src, src, filter}) / RUNS; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_rng(0, &rng) | |||||
| .set_rng(1, &rng); | |||||
| bencher.proxy()->target_execution_policy = {}; | |||||
| param.compute_mode = param::Convolution::ComputeMode::FLOAT32; | |||||
| bencher.set_param(param); | |||||
| auto time_in_ms_pseudo_fp16 = bencher.execs({src, src, filter}) / RUNS; | |||||
| printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s " | |||||
| "pseudo float16: %.2fms %.2fGB/s " | |||||
| "speedup: " | |||||
| "%0.2f (fp16/fp32) \n", | |||||
| s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32, | |||||
| bandwith * 4 / time_in_ms_fp32, time_in_ms_pseudo_fp16, | |||||
| bandwith * 2 / time_in_ms_pseudo_fp16, | |||||
| time_in_ms_fp32 / time_in_ms_pseudo_fp16); | |||||
| }; | |||||
| // clang-format off | |||||
| for (size_t b : {32, 64}) | |||||
| for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) { | |||||
| run(b, 384, 32, 32, f, 1); | |||||
| run(b, 384, 64, 64, f, 1); | |||||
| } | |||||
| // clang-format on | |||||
| } | |||||
| #endif | #endif | ||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -695,6 +695,78 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_SMALL) { | |||||
| } | } | ||||
| } | } | ||||
| TEST_F(CUDA, CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER) { | |||||
| Checker<ConvBiasForward> checker(handle_cuda()); | |||||
| checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>( | |||||
| ConvBiasForward::algo_name<ConvBias::DirectParam>( | |||||
| "DEPTHWISE_LARGE_FILTER", {}) | |||||
| .c_str())); | |||||
| for (auto dtype : std::vector<DType> { | |||||
| dtype::Float32(), | |||||
| #if CUDA_VERSION >= 9000 | |||||
| dtype::Float16() | |||||
| #endif | |||||
| }) { | |||||
| auto run = [&checker, &dtype]( | |||||
| size_t n, size_t g, size_t h, size_t fh, size_t padding, | |||||
| size_t stride) { | |||||
| param::ConvBias cur_param; | |||||
| cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION; | |||||
| cur_param.sparse = ConvBias::Param::Sparse::GROUP; | |||||
| checker.set_dtype(0, dtype) | |||||
| .set_dtype(1, dtype) | |||||
| .set_dtype(2, dtype) | |||||
| .set_dtype(3, dtype) | |||||
| .set_dtype(4, dtype); | |||||
| float scale = 64.f / sqrt(fh * fh); | |||||
| UniformFloatRNG rng(scale, 2 * scale); | |||||
| checker.set_rng(0, &rng) | |||||
| .set_rng(1, &rng) | |||||
| .set_rng(2, &rng) | |||||
| .set_rng(3, &rng) | |||||
| .set_rng(4, &rng); | |||||
| if (dtype.enumv() == DTypeEnum::Float16) { | |||||
| checker.set_epsilon(1e-1); | |||||
| } | |||||
| cur_param.pad_h = cur_param.pad_w = padding; | |||||
| cur_param.stride_h = cur_param.stride_w = stride; | |||||
| checker.set_param(cur_param).execs( | |||||
| {{n, g, h, h}, {g, 1, 1, fh, fh}, {}, {}, {}}); | |||||
| }; | |||||
| run(4, 8, 32, 5, 5 / 2, 1); | |||||
| run(4, 8, 32, 7, 7 / 2, 1); | |||||
| run(4, 8, 32, 9, 9 / 2, 1); | |||||
| run(4, 8, 32, 11, 11 / 2, 1); | |||||
| run(4, 8, 32, 13, 13 / 2, 1); | |||||
| run(4, 8, 32, 15, 15 / 2, 1); | |||||
| run(4, 8, 32, 17, 17 / 2, 1); | |||||
| run(4, 8, 32, 19, 19 / 2, 1); | |||||
| run(4, 8, 32, 21, 21 / 2, 1); | |||||
| run(4, 8, 32, 23, 23 / 2, 1); | |||||
| run(4, 8, 32, 25, 25 / 2, 1); | |||||
| run(4, 8, 32, 27, 27 / 2, 1); | |||||
| run(4, 8, 32, 29, 29 / 2, 1); | |||||
| run(4, 8, 32, 31, 31 / 2, 1); | |||||
| run(4, 8, 64, 5, 5 / 3, 2); | |||||
| run(4, 8, 64, 7, 7 / 3, 2); | |||||
| run(4, 8, 64, 9, 9 / 3, 2); | |||||
| run(4, 8, 64, 11, 11 / 3, 2); | |||||
| run(4, 8, 64, 13, 13 / 3, 2); | |||||
| run(4, 8, 64, 15, 15 / 3, 2); | |||||
| run(4, 8, 64, 17, 17 / 3, 2); | |||||
| run(4, 8, 64, 19, 19 / 3, 2); | |||||
| run(4, 8, 64, 21, 21 / 3, 2); | |||||
| run(4, 8, 64, 23, 23 / 3, 2); | |||||
| run(4, 8, 64, 25, 25 / 3, 2); | |||||
| run(4, 8, 64, 27, 27 / 3, 2); | |||||
| run(4, 8, 64, 29, 29 / 3, 2); | |||||
| run(4, 8, 64, 31, 31 / 3, 2); | |||||
| run(1, 2, 128, 31, 10, 2); | |||||
| run(1, 2, 256, 31, 10, 2); | |||||
| } | |||||
| } | |||||
| TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_8x8x32) { | TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_8x8x32) { | ||||
| require_compute_capability(6, 1); | require_compute_capability(6, 1); | ||||
| Checker<ConvBiasForward> checker(handle_cuda()); | Checker<ConvBiasForward> checker(handle_cuda()); | ||||
| @@ -1033,7 +1105,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { | |||||
| ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | ||||
| "CUDA:GROUP_CONV", {}) | "CUDA:GROUP_CONV", {}) | ||||
| .c_str(), | .c_str(), | ||||
| {{"CUDNN", {}}}})); | |||||
| {{"DEFAULT:CUDNN", {}}}})); | |||||
| ConvBias::Param param; | ConvBias::Param param; | ||||
| param.sparse = ConvBias::Param::Sparse::GROUP; | param.sparse = ConvBias::Param::Sparse::GROUP; | ||||
| param.nonlineMode = mode; | param.nonlineMode = mode; | ||||
| @@ -1093,8 +1165,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) { | |||||
| run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode); | ||||
| // strided case | // strided case | ||||
| run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode); | ||||
| // dilate conv is supported in CUDNN since version 7.5.0 | |||||
| #if CUDNN_VERSION >= 7500 | |||||
| // dilated case | // dilated case | ||||
| run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode); | run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode); | ||||
| #endif | |||||
| } | } | ||||
| } | } | ||||
| @@ -1471,6 +1546,132 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_TENSORCORE_INT8) { | |||||
| run_bench(256, 512, 7, 7, 512, 3, 3, 1, 1, 1000); | run_bench(256, 512, 7, 7, 512, 3, 3, 1, 1, 1000); | ||||
| run_bench(256, 512, 7, 7, 2048, 1, 1, 1, 1, 1000); | run_bench(256, 512, 7, 7, 2048, 1, 1, 1, 1, 1000); | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP16) { | |||||
| require_compute_capability(7, 5); | |||||
| Benchmarker<ConvBiasForward> bencher(handle_cuda()); | |||||
| bencher.set_display(false); | |||||
| bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>( | |||||
| ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||||
| "DEPTHWISE_LARGE_FILTER", {}) | |||||
| .c_str())); | |||||
| ConvBias::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| using NonlineMode = ConvBias::Param::NonlineMode; | |||||
| param.nonlineMode = NonlineMode::IDENTITY; | |||||
| param.sparse = ConvBias::Param::Sparse::GROUP; | |||||
| auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, | |||||
| size_t fw, size_t sh, size_t sw, size_t nr_times) { | |||||
| param.pad_h = fh / 2; | |||||
| param.pad_w = fw / 2; | |||||
| param.stride_h = sh; | |||||
| param.stride_w = sw; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()) | |||||
| .set_dtype(4, dtype::Float16()); | |||||
| bencher.set_times(nr_times); | |||||
| size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); | |||||
| size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); | |||||
| TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo}; | |||||
| float bandwith = static_cast<float>( | |||||
| inp.total_nr_elems() + kern.total_nr_elems() + | |||||
| out.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times; | |||||
| auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12; | |||||
| printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: " | |||||
| "%.2fms, " | |||||
| "perf: %.2f Tops bandwidth: %.2fGB/s.\n", | |||||
| inp.to_string().c_str(), kern.to_string().c_str(), | |||||
| out.to_string().c_str(), time_in_ms, ops, bandwith * 2 / time_in_ms); | |||||
| }; | |||||
| run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP32) { | |||||
| require_compute_capability(7, 5); | |||||
| Benchmarker<ConvBiasForward> bencher(handle_cuda()); | |||||
| bencher.set_display(false); | |||||
| bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>( | |||||
| ConvBiasForward::algo_name<ConvBiasForward::DirectParam>( | |||||
| "DEPTHWISE_LARGE_FILTER", {}) | |||||
| .c_str())); | |||||
| ConvBias::Param param; | |||||
| param.format = ConvBias::Param::Format::NCHW; | |||||
| using NonlineMode = ConvBias::Param::NonlineMode; | |||||
| param.nonlineMode = NonlineMode::IDENTITY; | |||||
| param.sparse = ConvBias::Param::Sparse::GROUP; | |||||
| auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, | |||||
| size_t fw, size_t sh, size_t sw, size_t nr_times) { | |||||
| param.pad_h = fh / 2; | |||||
| param.pad_w = fw / 2; | |||||
| param.stride_h = sh; | |||||
| param.stride_w = sw; | |||||
| bencher.set_param(param) | |||||
| .set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()) | |||||
| .set_dtype(4, dtype::Float32()); | |||||
| bencher.set_times(nr_times); | |||||
| size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); | |||||
| size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); | |||||
| TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo}; | |||||
| float bandwith = static_cast<float>( | |||||
| inp.total_nr_elems() + kern.total_nr_elems() + | |||||
| out.total_nr_elems()) / | |||||
| (1024 * 1024 * 1024) * 1e3; | |||||
| auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times; | |||||
| auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12; | |||||
| printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: " | |||||
| "%.2fms, " | |||||
| "perf: %.2f Tops bandwidth: %.2fGB/s.\n", | |||||
| inp.to_string().c_str(), kern.to_string().c_str(), | |||||
| out.to_string().c_str(), time_in_ms, ops, bandwith * 4 / time_in_ms); | |||||
| }; | |||||
| run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); | |||||
| run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); | |||||
| } | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
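
Note: the perf figure printed by the depthwise large-filter benchmarks above is derived from 2 * batch * g * ho * wo * fh * fw FLOPs per run. For the largest case (batch 64, g 384, 32x32 output, 31x31 filter) that is roughly 4.8e10 FLOPs, so a 10 ms run would report about 4.8 TFLOPS; the bandwidth column scales the element count by the dtype size (4 bytes for fp32, 2 bytes for fp16).
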
| @@ -724,6 +724,70 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) { | |||||
| TensorLayoutArray{filter, dst, src}); | TensorLayoutArray{filter, dst, src}); | ||||
| } | } | ||||
| TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) { | |||||
| Checker<ConvolutionBackwardData> checker(handle_cuda()); | |||||
| checker.set_before_exec_callback( | |||||
| AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER")); | |||||
| for (auto dtype : std::vector<DType> { | |||||
| dtype::Float32(), | |||||
| #if CUDA_VERSION >= 9000 | |||||
| dtype::Float16() | |||||
| #endif | |||||
| }) { | |||||
| auto run = [&checker, &dtype]( | |||||
| size_t n, size_t g, size_t h, size_t fh, size_t padding, | |||||
| size_t stride) { | |||||
| param::Convolution param; | |||||
| param.stride_h = param.stride_w = stride; | |||||
| param.pad_h = param.pad_w = padding; | |||||
| param.mode = Convolution::Mode::CROSS_CORRELATION; | |||||
| param.sparse = param::Convolution::Sparse::GROUP; | |||||
| checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype); | |||||
| float scale = 64.f / sqrt(fh * fh); | |||||
| UniformFloatRNG rng(scale, scale * 2); | |||||
| checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng); | |||||
| if (dtype.enumv() == DTypeEnum::Float16) | |||||
| checker.set_epsilon(1e-1); | |||||
| checker.set_param(param).execs( | |||||
| {{g, 1, 1, fh, fh}, | |||||
| {n, g, (h + 2 * padding - fh + 1) / stride, | |||||
| (h + 2 * padding - fh + 1) / stride}, | |||||
| {n, g, h, h}}); | |||||
| }; | |||||
| run(4, 8, 32, 5, 5 / 2, 1); | |||||
| run(4, 8, 32, 7, 7 / 2, 1); | |||||
| run(4, 8, 32, 9, 9 / 2, 1); | |||||
| run(4, 8, 32, 11, 11 / 2, 1); | |||||
| run(4, 8, 32, 13, 13 / 2, 1); | |||||
| run(4, 8, 32, 15, 15 / 2, 1); | |||||
| run(4, 8, 32, 17, 17 / 2, 1); | |||||
| run(4, 8, 32, 19, 19 / 2, 1); | |||||
| run(4, 8, 32, 21, 21 / 2, 1); | |||||
| run(4, 8, 32, 23, 23 / 2, 1); | |||||
| run(4, 8, 32, 25, 25 / 2, 1); | |||||
| run(4, 8, 32, 27, 27 / 2, 1); | |||||
| run(4, 8, 32, 29, 29 / 2, 1); | |||||
| run(4, 8, 32, 31, 31 / 2, 1); | |||||
| run(4, 8, 64, 5, 5 / 2, 2); | |||||
| run(4, 8, 64, 7, 7 / 3, 2); | |||||
| run(4, 8, 64, 9, 9 / 3, 2); | |||||
| run(4, 8, 64, 11, 11 / 3, 2); | |||||
| run(4, 8, 64, 13, 13 / 3, 2); | |||||
| run(4, 8, 64, 15, 15 / 3, 2); | |||||
| run(4, 8, 64, 17, 17 / 3, 2); | |||||
| run(4, 8, 64, 19, 19 / 3, 2); | |||||
| run(4, 8, 64, 21, 21 / 3, 2); | |||||
| run(4, 8, 64, 23, 23 / 3, 2); | |||||
| run(4, 8, 64, 25, 25 / 3, 2); | |||||
| run(4, 8, 64, 27, 27 / 3, 2); | |||||
| run(4, 8, 64, 29, 29 / 3, 2); | |||||
| run(4, 8, 64, 31, 31 / 3, 2); | |||||
| run(1, 2, 128, 31, 31 / 3, 2); | |||||
| run(1, 2, 256, 31, 31 / 3, 2); | |||||
| } | |||||
| } | |||||
| #if MEGDNN_WITH_BENCHMARK | #if MEGDNN_WITH_BENCHMARK | ||||
| TEST_F(CUDA, CONV_FWD_BENCHMARK) { | TEST_F(CUDA, CONV_FWD_BENCHMARK) { | ||||
| auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1, | auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1, | ||||
| @@ -901,6 +965,104 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { | |||||
| run(32, 64, 64, 56, 56, 1, 1, 0); | run(32, 64, 64, 56, 56, 1, 1, 0); | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) { | |||||
| CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()}; | |||||
| bencher.set_display(false); | |||||
| bencher.set_before_exec_callback( | |||||
| AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER")); | |||||
| auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH, | |||||
| size_t SH, size_t nr_times) { | |||||
| bencher.set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()); | |||||
| param::Convolution param; | |||||
| param.stride_h = param.stride_w = SH; | |||||
| param.pad_h = param.pad_w = FH / 2; | |||||
| param.sparse = param::Convolution::Sparse::GROUP; | |||||
| bencher.set_param(param); | |||||
| bencher.set_times(nr_times); | |||||
| TensorLayout src{{N, g, IH, IW}, dtype::Float32()}, | |||||
| filter{{g, 1, 1, FH, FH}, dtype::Float32()}; | |||||
| TensorLayout dst; | |||||
| { | |||||
| auto&& opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout(src, filter, dst); | |||||
| } | |||||
| auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times; | |||||
| float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH; | |||||
| printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), | |||||
| filter.to_string().c_str(), dst.to_string().c_str()); | |||||
| printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32, | |||||
| (flo / (time_ms_fp32 * 1e9))); | |||||
| }; | |||||
| run(64, 384, 384, 32, 32, 3, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 5, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 7, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 9, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 11, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 13, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 15, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 17, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 19, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 21, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 23, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 25, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 27, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 29, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 31, 1, 10); | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) { | |||||
| CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()}; | |||||
| bencher.set_display(false); | |||||
| bencher.set_before_exec_callback( | |||||
| AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER")); | |||||
| auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH, | |||||
| size_t SH, size_t nr_times) { | |||||
| bencher.set_dtype(0, dtype::Float16()) | |||||
| .set_dtype(1, dtype::Float16()) | |||||
| .set_dtype(2, dtype::Float16()); | |||||
| param::Convolution param; | |||||
| param.stride_h = param.stride_w = SH; | |||||
| param.pad_h = param.pad_w = FH / 2; | |||||
| param.sparse = param::Convolution::Sparse::GROUP; | |||||
| bencher.set_param(param); | |||||
| bencher.set_times(nr_times); | |||||
| TensorLayout src{{N, g, IH, IW}, dtype::Float16()}, | |||||
| filter{{g, 1, 1, FH, FH}, dtype::Float16()}; | |||||
| TensorLayout dst; | |||||
| { | |||||
| auto&& opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout(src, filter, dst); | |||||
| } | |||||
| auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times; | |||||
| float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH; | |||||
| printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), | |||||
| filter.to_string().c_str(), dst.to_string().c_str()); | |||||
| printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16, | |||||
| (flo / (time_ms_fp16 * 1e9))); | |||||
| }; | |||||
| run(64, 384, 384, 32, 32, 3, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 5, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 7, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 9, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 11, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 13, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 15, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 17, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 19, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 21, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 23, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 25, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 27, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 29, 1, 10); | |||||
| run(64, 384, 384, 32, 32, 31, 1, 10); | |||||
| } | |||||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { | TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { | ||||
| CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()}; | CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()}; | ||||
| std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{ | std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{ | ||||
| @@ -1065,6 +1227,46 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { | |||||
| run(32, 512, 1024, 14, 14, 1, 2, 0); | run(32, 512, 1024, 14, 14, 1, 2, 0); | ||||
| run(32, 64, 64, 56, 56, 1, 1, 0); | run(32, 64, 64, 56, 56, 1, 1, 0); | ||||
| } | } | ||||
| TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) { | |||||
| CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()}; | |||||
| std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{ | |||||
| new OprProxy<ConvolutionBackwardFilter>{true}}; | |||||
| size_t RUNS = 10; | |||||
| bench.set_proxy(proxy).set_times(RUNS); | |||||
| bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>( | |||||
| "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3")); | |||||
| auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH, | |||||
| size_t SH, size_t PH) { | |||||
| bench.set_dtype(0, dtype::Float32()) | |||||
| .set_dtype(1, dtype::Float32()) | |||||
| .set_dtype(2, dtype::Float32()); | |||||
| param::Convolution param; | |||||
| param.stride_h = param.stride_w = SH; | |||||
| param.pad_h = param.pad_w = FH / 2; | |||||
| param.sparse = param::Convolution::Sparse::GROUP; | |||||
| bench.set_param(param); | |||||
| bench.proxy()->target_execution_policy.algo.reset(); | |||||
| TensorLayout src{{N, g, IH, IW}, dtype::Float32()}, | |||||
| filter{{g, 1, 1, FH, FH}, dtype::Float32()}; | |||||
| TensorLayout dst; | |||||
| { | |||||
| auto&& opr = handle_cuda()->create_operator<Convolution>(); | |||||
| opr->param() = param; | |||||
| opr->deduce_layout(src, filter, dst); | |||||
| } | |||||
| auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; | |||||
| float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH; | |||||
| printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), | |||||
| filter.to_string().c_str(), dst.to_string().c_str()); | |||||
| printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32, | |||||
| (flo / (time_ms_fp32 * 1e9))); | |||||
| }; | |||||
| run(64, 384, 384, 32, 32, 31, 1, 15); | |||||
| } | |||||
| #endif | #endif | ||||
| #undef CUDNN_VERSION_STRING | #undef CUDNN_VERSION_STRING | ||||
| @@ -213,7 +213,7 @@ std::vector<BenchArgs> get_feat_model_args() { | |||||
| return args; | return args; | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| std::vector<BenchArgs> get_f16_feat_model_args() { | std::vector<BenchArgs> get_f16_feat_model_args() { | ||||
| std::vector<BenchArgs> args; | std::vector<BenchArgs> args; | ||||
| args.emplace_back(BenchArgs{128, 9216, 9216}); | args.emplace_back(BenchArgs{128, 9216, 9216}); | ||||
| @@ -367,7 +367,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #undef MEGDNN_FOREACH_CUTLASS_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_KERNEL | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | ||||
| cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ | cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \ | ||||
| cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ | cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \ | ||||
| @@ -403,7 +403,9 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb) | |||||
| #undef cb | #undef cb | ||||
| #undef MEGDNN_FOREACH_CUTLASS_KERNEL | #undef MEGDNN_FOREACH_CUTLASS_KERNEL | ||||
| #endif | |||||
| #if CUDA_VERSION >= 10020 | |||||
| #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb) \ | ||||
| cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \ | cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \ | ||||
| cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \ | cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \ | ||||
| @@ -454,7 +456,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) { | |||||
| dtype::Float32(), "CUTLASS_FLOAT32_SIMT"); | dtype::Float32(), "CUTLASS_FLOAT32_SIMT"); | ||||
| } | } | ||||
| #if CUDA_VERSION >= 10020 | |||||
| #if CUDA_VERSION >= 10010 | |||||
| TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { | TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) { | ||||
| benchmark_matrix_mul( | benchmark_matrix_mul( | ||||
| handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), | handle_cuda(), get_f16_feat_model_args(), dtype::Float16(), | ||||
| @@ -101,6 +101,36 @@ TEST_F(CUDA, PADDING_REFLECT2) { | |||||
| 4, 1, 6, 3, 6, 1, 6, 3})}); | 4, 1, 6, 3, 6, 1, 6, 3})}); | ||||
| } | } | ||||
| TEST_F(CUDA, PADDING_REFLECT2_QUANTIZED) { | |||||
| Checker<Padding> checker(handle_cuda(), false); | |||||
| param::Padding param; | |||||
| param.padding_mode = param::Padding::PaddingMode::REFLECT; | |||||
| param.front_offset_dim0 = 2; | |||||
| param.front_offset_dim1 = 1; | |||||
| param.front_offset_dim2 = 0; | |||||
| param.front_offset_dim3 = 0; | |||||
| param.front_offset_dim4 = 0; | |||||
| param.front_offset_dim5 = 0; | |||||
| param.front_offset_dim6 = 0; | |||||
| param.back_offset_dim0 = 0; | |||||
| param.back_offset_dim1 = 2; | |||||
| param.back_offset_dim2 = 0; | |||||
| param.back_offset_dim3 = 0; | |||||
| param.back_offset_dim4 = 0; | |||||
| param.back_offset_dim5 = 0; | |||||
| param.back_offset_dim6 = 0; | |||||
| checker.set_param(param).exect( | |||||
| Testcase{ | |||||
| TensorValue( | |||||
| {3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}), | |||||
| {}}, | |||||
| Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5, | |||||
| 4, 5, 6, 5, 4, 2, 1, | |||||
| 2, 3, 2, 1, 5, 4, 5, | |||||
| 6, 5, 4, 8, 7, 8, 9, | |||||
| 8, 7})}); | |||||
| } | |||||
| TEST_F(CUDA, PADDING_REPLICATE) { | TEST_F(CUDA, PADDING_REPLICATE) { | ||||
| Checker<Padding> checker(handle_cuda(), false); | Checker<Padding> checker(handle_cuda(), false); | ||||
| param::Padding param; | param::Padding param; | ||||
| @@ -83,6 +83,36 @@ TEST_F(NAIVE, PADDING_REFLECT) { | |||||
| {10}, dtype::Float32(), {3, 2, 1, 2, 3, 4, 5, 4, 3, 2})}); | {10}, dtype::Float32(), {3, 2, 1, 2, 3, 4, 5, 4, 3, 2})}); | ||||
| } | } | ||||
| TEST_F(NAIVE, PADDING_REFLECT2) { | |||||
| Checker<Padding> checker(handle(), false); | |||||
| param::Padding param; | |||||
| param.padding_mode = param::Padding::PaddingMode::REFLECT; | |||||
| param.front_offset_dim0 = 2; | |||||
| param.front_offset_dim1 = 1; | |||||
| param.front_offset_dim2 = 0; | |||||
| param.front_offset_dim3 = 0; | |||||
| param.front_offset_dim4 = 0; | |||||
| param.front_offset_dim5 = 0; | |||||
| param.front_offset_dim6 = 0; | |||||
| param.back_offset_dim0 = 0; | |||||
| param.back_offset_dim1 = 2; | |||||
| param.back_offset_dim2 = 0; | |||||
| param.back_offset_dim3 = 0; | |||||
| param.back_offset_dim4 = 0; | |||||
| param.back_offset_dim5 = 0; | |||||
| param.back_offset_dim6 = 0; | |||||
| checker.set_param(param).exect( | |||||
| Testcase{ | |||||
| TensorValue( | |||||
| {3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}), | |||||
| {}}, | |||||
| Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5, | |||||
| 4, 5, 6, 5, 4, 2, 1, | |||||
| 2, 3, 2, 1, 5, 4, 5, | |||||
| 6, 5, 4, 8, 7, 8, 9, | |||||
| 8, 7})}); | |||||
| } | |||||
| TEST_F(NAIVE, PADDING_REPLICATE) { | TEST_F(NAIVE, PADDING_REPLICATE) { | ||||
| Checker<Padding> checker(handle(), false); | Checker<Padding> checker(handle(), false); | ||||
| param::Padding param; | param::Padding param; | ||||
| @@ -119,12 +119,16 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True): | |||||
| else 1 | else 1 | ||||
| ) | ) | ||||
| else: | else: | ||||
| if ndim_indexed > inp.ndim: | |||||
| raise IndexError( | |||||
| "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format( | |||||
| inp.ndim, len(tuple_val) | |||||
| try: | |||||
| if ndim_indexed > inp.ndim: | |||||
| raise IndexError( | |||||
| "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format( | |||||
| inp.ndim, len(tuple_val) | |||||
| ) | |||||
| ) | ) | ||||
| ) | |||||
| except ValueError: | |||||
| # ndim may be unknown here (e.g. symbolic shape during trace); skip the check | |||||
| pass | |||||
| tuple_val = remove_ellipsis(inp, tuple_val) | tuple_val = remove_ellipsis(inp, tuple_val) | ||||
| use_subtensor = True | use_subtensor = True | ||||
| @@ -18,6 +18,7 @@ from ..functional import ( | |||||
| conv_transpose3d, | conv_transpose3d, | ||||
| deformable_conv2d, | deformable_conv2d, | ||||
| local_conv2d, | local_conv2d, | ||||
| pad, | |||||
| relu, | relu, | ||||
| ) | ) | ||||
| from ..tensor import Parameter | from ..tensor import Parameter | ||||
| @@ -126,7 +127,7 @@ class Conv1d(_ConvNd): | |||||
| kernel_size: size of weight on spatial dimensions. | kernel_size: size of weight on spatial dimensions. | ||||
| stride: stride of the 1D convolution operation. | stride: stride of the 1D convolution operation. | ||||
| padding: size of the paddings added to the input on both sides of its | padding: size of the paddings added to the input on both sides of its | ||||
| spatial dimensions. Only zero-padding is supported. Default: 0 | |||||
| spatial dimensions. Default: 0 | |||||
| dilation: dilation of the 1D convolution operation. Default: 1 | dilation: dilation of the 1D convolution operation. Default: 1 | ||||
| groups: number of groups to divide input and output channels into, | groups: number of groups to divide input and output channels into, | ||||
| so as to perform a "grouped convolution". When ``groups`` is not 1, | so as to perform a "grouped convolution". When ``groups`` is not 1, | ||||
| @@ -139,6 +140,8 @@ class Conv1d(_ConvNd): | |||||
| placed on the precision of intermediate results. When set to "float32", | placed on the precision of intermediate results. When set to "float32", | ||||
| "float32" would be used for accumulator and intermediate result, but only | "float32" would be used for accumulator and intermediate result, but only | ||||
| effective when input and output are of float16 dtype. | effective when input and output are of float16 dtype. | ||||
| padding_mode: "zeros", "reflect" or "replicate". Default: "zeros". | |||||
| Refer to :class:`~.module.padding.Pad` for more information. | |||||
| Note: | Note: | ||||
| * ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` , | * ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` , | ||||
| @@ -177,6 +180,7 @@ class Conv1d(_ConvNd): | |||||
| bias: bool = True, | bias: bool = True, | ||||
| conv_mode: str = "cross_correlation", | conv_mode: str = "cross_correlation", | ||||
| compute_mode: str = "default", | compute_mode: str = "default", | ||||
| padding_mode: str = "zeros", | |||||
| **kwargs | **kwargs | ||||
| ): | ): | ||||
| kernel_size = kernel_size | kernel_size = kernel_size | ||||
| @@ -185,6 +189,7 @@ class Conv1d(_ConvNd): | |||||
| dilation = dilation | dilation = dilation | ||||
| self.conv_mode = conv_mode | self.conv_mode = conv_mode | ||||
| self.compute_mode = compute_mode | self.compute_mode = compute_mode | ||||
| self.padding_mode = padding_mode | |||||
| super().__init__( | super().__init__( | ||||
| in_channels, | in_channels, | ||||
| out_channels, | out_channels, | ||||
| @@ -223,7 +228,27 @@ class Conv1d(_ConvNd): | |||||
| # Assume format is NCH(W=1) | # Assume format is NCH(W=1) | ||||
| return (1, self.out_channels, 1) | return (1, self.out_channels, 1) | ||||
| def get_pad_witdth(self): | |||||
| return ((0, 0), (0, 0), (self.padding, self.padding)) | |||||
| def calc_conv(self, inp, weight, bias): | def calc_conv(self, inp, weight, bias): | ||||
| assert self.padding_mode in [ | |||||
| "zeros", | |||||
| "reflect", | |||||
| "replicate", | |||||
| ] | |||||
| if self.padding_mode != "zeros": | |||||
| return conv1d( | |||||
| pad(inp, self.get_pad_witdth(), self.padding_mode), | |||||
| weight, | |||||
| bias, | |||||
| self.stride, | |||||
| 0, | |||||
| self.dilation, | |||||
| self.groups, | |||||
| self.conv_mode, | |||||
| self.compute_mode, | |||||
| ) | |||||
| return conv1d( | return conv1d( | ||||
| inp, | inp, | ||||
| weight, | weight, | ||||
| @@ -287,7 +312,7 @@ class Conv2d(_ConvNd): | |||||
| ``(kernel_size, kernel_size)``. | ``(kernel_size, kernel_size)``. | ||||
| stride: stride of the 2D convolution operation. Default: 1 | stride: stride of the 2D convolution operation. Default: 1 | ||||
| padding: size of the paddings added to the input on both sides of its | padding: size of the paddings added to the input on both sides of its | ||||
| spatial dimensions. Only zero-padding is supported. Default: 0 | |||||
| spatial dimensions. Default: 0 | |||||
| dilation: dilation of the 2D convolution operation. Default: 1 | dilation: dilation of the 2D convolution operation. Default: 1 | ||||
| groups: number of groups into which the input and output channels are divided, | groups: number of groups into which the input and output channels are divided, | ||||
| so as to perform a ``grouped convolution``. When ``groups`` is not 1, | so as to perform a ``grouped convolution``. When ``groups`` is not 1, | ||||
| @@ -300,6 +325,8 @@ class Conv2d(_ConvNd): | |||||
| placed on the precision of intermediate results. When set to "float32", | placed on the precision of intermediate results. When set to "float32", | ||||
| "float32" would be used for accumulator and intermediate result, but only | "float32" would be used for accumulator and intermediate result, but only | ||||
| effective when input and output are of float16 dtype. | effective when input and output are of float16 dtype. | ||||
| padding_mode: "zeros", "reflect" or "replicate". Default: "zeros". | |||||
| Refer to :class:`~.module.padding.Pad` for more information. | |||||
| Note: | Note: | ||||
| * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , | * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` , | ||||
| @@ -338,6 +365,7 @@ class Conv2d(_ConvNd): | |||||
| bias: bool = True, | bias: bool = True, | ||||
| conv_mode: str = "cross_correlation", | conv_mode: str = "cross_correlation", | ||||
| compute_mode: str = "default", | compute_mode: str = "default", | ||||
| padding_mode: str = "zeros", | |||||
| **kwargs | **kwargs | ||||
| ): | ): | ||||
| kernel_size = _pair_nonzero(kernel_size) | kernel_size = _pair_nonzero(kernel_size) | ||||
| @@ -346,6 +374,7 @@ class Conv2d(_ConvNd): | |||||
| dilation = _pair_nonzero(dilation) | dilation = _pair_nonzero(dilation) | ||||
| self.conv_mode = conv_mode | self.conv_mode = conv_mode | ||||
| self.compute_mode = compute_mode | self.compute_mode = compute_mode | ||||
| self.padding_mode = padding_mode | |||||
| super().__init__( | super().__init__( | ||||
| in_channels, | in_channels, | ||||
| out_channels, | out_channels, | ||||
| @@ -384,7 +413,32 @@ class Conv2d(_ConvNd): | |||||
| # Assume format is NCHW | # Assume format is NCHW | ||||
| return (1, self.out_channels, 1, 1) | return (1, self.out_channels, 1, 1) | ||||
| def get_pad_witdth(self): | |||||
| return ( | |||||
| (0, 0), | |||||
| (0, 0), | |||||
| (self.padding[0], self.padding[0]), | |||||
| (self.padding[1], self.padding[1]), | |||||
| ) | |||||
| def calc_conv(self, inp, weight, bias): | def calc_conv(self, inp, weight, bias): | ||||
| assert self.padding_mode in [ | |||||
| "zeros", | |||||
| "reflect", | |||||
| "replicate", | |||||
| ] | |||||
| if self.padding_mode != "zeros": | |||||
| return conv2d( | |||||
| pad(inp, self.get_pad_witdth(), self.padding_mode), | |||||
| weight, | |||||
| bias, | |||||
| self.stride, | |||||
| 0, | |||||
| self.dilation, | |||||
| self.groups, | |||||
| self.conv_mode, | |||||
| self.compute_mode, | |||||
| ) | |||||
| return conv2d( | return conv2d( | ||||
| inp, | inp, | ||||
| weight, | weight, | ||||
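
With the change above, a non-zero padding_mode is handled by reflect/replicate-padding the input via the functional pad and then convolving with internal padding 0. A minimal usage sketch of the new parameter (shapes and values below are illustrative, assuming this diff is applied):

import numpy as np
import megengine as mge
import megengine.module as M

# reflect-pad by 1 pixel on each spatial side, then run the 3x3 convolution with padding 0
conv = M.Conv2d(3, 8, kernel_size=3, padding=1, padding_mode="reflect")
x = mge.tensor(np.random.randn(2, 3, 32, 32).astype("float32"))
y = conv(x)
print(y.shape)  # (2, 8, 32, 32): padding=1 keeps the spatial size for a 3x3 kernel
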
| @@ -30,6 +30,7 @@ class _ConvBnActivation2d(Module): | |||||
| momentum=0.9, | momentum=0.9, | ||||
| affine=True, | affine=True, | ||||
| track_running_stats=True, | track_running_stats=True, | ||||
| padding_mode: str = "zeros", | |||||
| **kwargs | **kwargs | ||||
| ): | ): | ||||
| super().__init__(**kwargs) | super().__init__(**kwargs) | ||||
| @@ -44,6 +45,7 @@ class _ConvBnActivation2d(Module): | |||||
| bias, | bias, | ||||
| conv_mode, | conv_mode, | ||||
| compute_mode, | compute_mode, | ||||
| padding_mode, | |||||
| **kwargs, | **kwargs, | ||||
| ) | ) | ||||
| self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats) | self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats) | ||||
| @@ -138,11 +138,7 @@ class Module(metaclass=ABCMeta): | |||||
| return HookHandler(self._forward_hooks, hook) | return HookHandler(self._forward_hooks, hook) | ||||
| def __call__(self, *inputs, **kwargs): | def __call__(self, *inputs, **kwargs): | ||||
| AutoNaming.push_scope( | |||||
| self.name | |||||
| if self.name is not None | |||||
| else (self._short_name if hasattr(self, "_short_name") else self._name) | |||||
| ) | |||||
| AutoNaming.push_scope(self.name if self.name is not None else self._short_name) | |||||
| for hook in self._forward_pre_hooks.values(): | for hook in self._forward_pre_hooks.values(): | ||||
| modified_inputs = hook(self, inputs) | modified_inputs = hook(self, inputs) | ||||
| if modified_inputs is not None: | if modified_inputs is not None: | ||||
| @@ -685,6 +681,12 @@ class Module(metaclass=ABCMeta): | |||||
| set_name(self, prefix, k, v) | set_name(self, prefix, k, v) | ||||
| super().__setattr__(name, value) | super().__setattr__(name, value) | ||||
| def __setstate__(self, state): | |||||
| if "_short_name" not in state: | |||||
| state["_short_name"] = state["_name"] | |||||
| state["_name"] = None | |||||
| self.__dict__.update(state) | |||||
| def __delattr__(self, name: str): | def __delattr__(self, name: str): | ||||
| if name in self.__dict__ and _is_module(self.__dict__[name]): | if name in self.__dict__ and _is_module(self.__dict__[name]): | ||||
| modules = self.__dict__.get("_modules") | modules = self.__dict__.get("_modules") | ||||
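
The __setstate__ hook above lets Module pickles produced before _short_name existed load correctly: the old _name value is moved into _short_name and _name is reset to None. A rough illustration (the old-format state dict below is hypothetical):

import megengine.module as M

class Empty(M.Module):
    def forward(self):
        pass

# hypothetical state dict as an older megengine would have pickled it (no _short_name key)
old_state = {"_name": "conv0", "_modules": [], "training": True}
m = Empty.__new__(Empty)  # bypass __init__, as _ModuleState.to_module() does
m.__setstate__(old_state)
assert m._short_name == "conv0" and m._name is None
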
| @@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QATModule): | |||||
| float_module.bias is not None, | float_module.bias is not None, | ||||
| float_module.conv_mode, | float_module.conv_mode, | ||||
| float_module.compute_mode, | float_module.compute_mode, | ||||
| float_module.padding_mode, | |||||
| name=float_module.name, | name=float_module.name, | ||||
| ) | ) | ||||
| qat_module.weight = float_module.weight | qat_module.weight = float_module.weight | ||||
| @@ -147,6 +147,7 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule): | |||||
| float_module.conv.bias is not None, | float_module.conv.bias is not None, | ||||
| float_module.conv.conv_mode, | float_module.conv.conv_mode, | ||||
| float_module.conv.compute_mode, | float_module.conv.compute_mode, | ||||
| padding_mode=float_module.conv.padding_mode, | |||||
| name=float_module.name, | name=float_module.name, | ||||
| ) | ) | ||||
| qat_module.conv.weight = float_module.conv.weight | qat_module.conv.weight = float_module.conv.weight | ||||
| @@ -11,7 +11,7 @@ import numpy as np | |||||
| from ... import module as Float | from ... import module as Float | ||||
| from ...core.tensor import dtype | from ...core.tensor import dtype | ||||
| from ...functional.nn import conv_bias_activation | |||||
| from ...functional.nn import conv_bias_activation, pad | |||||
| from ...functional.quantized import conv_transpose2d | from ...functional.quantized import conv_transpose2d | ||||
| from ...tensor import Parameter | from ...tensor import Parameter | ||||
| from ..qat import conv as QAT | from ..qat import conv as QAT | ||||
| @@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QuantizedModule): | |||||
| conv_mode: str = "cross_correlation", | conv_mode: str = "cross_correlation", | ||||
| compute_mode: str = "default", | compute_mode: str = "default", | ||||
| dtype=None, | dtype=None, | ||||
| padding_mode: str = "zeros", | |||||
| **kwargs | **kwargs | ||||
| ): | ): | ||||
| super().__init__( | super().__init__( | ||||
| @@ -51,13 +52,33 @@ class Conv2d(Float.Conv2d, QuantizedModule): | |||||
| True, | True, | ||||
| conv_mode, | conv_mode, | ||||
| compute_mode, | compute_mode, | ||||
| padding_mode, | |||||
| ) | ) | ||||
| self.output_dtype = dtype | self.output_dtype = dtype | ||||
| def calc_conv_quantized(self, inp, nonlinear_mode="identity"): | def calc_conv_quantized(self, inp, nonlinear_mode="identity"): | ||||
| assert self.padding_mode in [ | |||||
| "zeros", | |||||
| "reflect", | |||||
| "replicate", | |||||
| ] | |||||
| inp_scale = dtype.get_scale(inp.dtype) | inp_scale = dtype.get_scale(inp.dtype) | ||||
| w_scale = dtype.get_scale(self.weight.dtype) | w_scale = dtype.get_scale(self.weight.dtype) | ||||
| bias_scale = inp_scale * w_scale | bias_scale = inp_scale * w_scale | ||||
| if self.padding_mode != "zeros": | |||||
| return conv_bias_activation( | |||||
| pad(inp, self.get_pad_witdth(), self.padding_mode), | |||||
| self.weight, | |||||
| self.bias.astype(dtype.qint32(bias_scale)), | |||||
| self.output_dtype, | |||||
| self.stride, | |||||
| 0, | |||||
| self.dilation, | |||||
| self.groups, | |||||
| conv_mode=self.conv_mode, | |||||
| compute_mode=self.compute_mode, | |||||
| nonlinear_mode=nonlinear_mode, | |||||
| ) | |||||
| return conv_bias_activation( | return conv_bias_activation( | ||||
| inp, | inp, | ||||
| self.weight, | self.weight, | ||||
| @@ -88,6 +109,7 @@ class Conv2d(Float.Conv2d, QuantizedModule): | |||||
| qat_module.dilation, | qat_module.dilation, | ||||
| qat_module.groups, | qat_module.groups, | ||||
| dtype=output_dtype, | dtype=output_dtype, | ||||
| padding_mode=qat_module.padding_mode, | |||||
| name=qat_module.name, | name=qat_module.name, | ||||
| ) | ) | ||||
| weight = qat_module.weight.astype(qat_module.get_weight_dtype()) | weight = qat_module.weight.astype(qat_module.get_weight_dtype()) | ||||
| @@ -31,6 +31,7 @@ class _ConvBnActivation2d(Conv2d): | |||||
| qat_module.conv.groups, | qat_module.conv.groups, | ||||
| dtype=output_dtype, | dtype=output_dtype, | ||||
| name=qat_module.name, | name=qat_module.name, | ||||
| padding_mode=qat_module.conv.padding_mode, | |||||
| ) | ) | ||||
| w_fold, b_fold = qat_module.fold_weight_bias( | w_fold, b_fold = qat_module.fold_weight_bias( | ||||
| qat_module.bn.running_mean, qat_module.bn.running_var | qat_module.bn.running_mean, qat_module.bn.running_var | ||||
| @@ -126,6 +126,9 @@ def convbn2d_module_loader(expr): | |||||
| module = expr.inputs[0].owner | module = expr.inputs[0].owner | ||||
| if not hasattr(module.bn, "param_dim"): | if not hasattr(module.bn, "param_dim"): | ||||
| module.bn.param_dim = "dim_1c11" | module.bn.param_dim = "dim_1c11" | ||||
| module = expr.inputs[0].owner | |||||
| if not hasattr(module.conv, "padding_mode"): | |||||
| module.conv.padding_mode = "zeros" | |||||
| @register_opdef_loader(BatchNorm) | @register_opdef_loader(BatchNorm) | ||||
| @@ -162,3 +165,36 @@ def tensor_gen_func_loader(expr): | |||||
| else: | else: | ||||
| device = None | device = None | ||||
| expr.set_args_kwargs(shape, dtype=dtype, device=device) | expr.set_args_kwargs(shape, dtype=dtype, device=device) | ||||
| @register_functional_loader(("megengine.functional.nn", "pad")) | |||||
| def pad_func_loader(expr): | |||||
| if "pad_witdth" in expr.kwargs: | |||||
| kwargs = expr.kwargs | |||||
| kwargs["pad_width"] = kwargs.pop("pad_witdth") | |||||
| expr.set_args_kwargs(*expr.args, **kwargs) | |||||
| @register_module_loader( | |||||
| ("megengine.module.conv", "Conv1d"), | |||||
| ("megengine.module.conv", "Conv2d"), | |||||
| ("megengine.module.conv", "ConvRelu2d"), | |||||
| ("megengine.module.qat.conv", "Conv2d"), | |||||
| ("megengine.module.qat.conv", "ConvRelu2d"), | |||||
| ("megengine.module.quantized.conv", "Conv2d"), | |||||
| ("megengine.module.quantized.conv", "ConvRelu2d"), | |||||
| ) | |||||
| def conv2d_module_loader(expr): | |||||
| module = expr.inputs[0].owner | |||||
| if not hasattr(module, "padding_mode"): | |||||
| module.padding_mode = "zeros" | |||||
| @register_module_loader( | |||||
| ("megengine.module.quantized.conv_bn", "ConvBn2d"), | |||||
| ("megengine.module.quantized.conv_bn", "ConvBnRelu2d"), | |||||
| ) | |||||
| def quantized_convbn2d_module_loader(expr): | |||||
| module = expr.inputs[0].owner | |||||
| if not hasattr(module, "padding_mode"): | |||||
| module.padding_mode = "zeros" | |||||
| @@ -50,7 +50,7 @@ class _ModuleState: | |||||
| if self.obj is None: | if self.obj is None: | ||||
| typem = getattr(import_module(self.module[0]), self.module[1]) | typem = getattr(import_module(self.module[0]), self.module[1]) | ||||
| m_obj = typem.__new__(typem) | m_obj = typem.__new__(typem) | ||||
| m_obj.__dict__.update(self.state) | |||||
| m_obj.__setstate__(self.state) | |||||
| self.obj = m_obj | self.obj = m_obj | ||||
| return self.obj | return self.obj | ||||
| @@ -1681,11 +1681,13 @@ class TracedModuleBuilder(NodeMixin): | |||||
| if isinstance(wrapped, TracedModuleBuilder): | if isinstance(wrapped, TracedModuleBuilder): | ||||
| if not isinstance(mod_attr, (List, Dict, QATModule)): | if not isinstance(mod_attr, (List, Dict, QATModule)): | ||||
| assert mod_attr is wrapped._mod | |||||
| else: | |||||
| assert ( | |||||
| mod_attr is wrapped._mod | |||||
| ), "TracedModule do not support modify module attributes, please check your code." | |||||
| if isinstance(wrapped, RawTensor): | |||||
| assert ( | assert ( | ||||
| mod_attr is wrapped | mod_attr is wrapped | ||||
| ), "TracedModule do not support modify attributes, please check your code." | |||||
| ), "TracedModule do not support modify tensor attributes, please check your code." | |||||
| if isinstance(wrapped, (NodeMixin, RawTensor)): | if isinstance(wrapped, (NodeMixin, RawTensor)): | ||||
| NodeMixin.wrap( | NodeMixin.wrap( | ||||
| @@ -2296,7 +2298,7 @@ class TracedModule(Module): | |||||
| for k, v in state.items(): | for k, v in state.items(): | ||||
| if isinstance(v, _ModuleState): | if isinstance(v, _ModuleState): | ||||
| state[k] = v.to_module() | state[k] = v.to_module() | ||||
| self.__dict__.update(state) | |||||
| super().__setstate__(state) | |||||
| self._update_ref() | self._update_ref() | ||||
| for _, graph in self.argdef_graph_map.items(): | for _, graph in self.argdef_graph_map.items(): | ||||
| @@ -272,16 +272,12 @@ PyObject* TensorWrapper::device() { | |||||
| PyObject* TensorWrapper::numpy() { | PyObject* TensorWrapper::numpy() { | ||||
| auto hv = m_tensor->numpy(); | auto hv = m_tensor->numpy(); | ||||
| // if (!hv) { | |||||
| // PyErr_SetString(PyExc_ValueError, "tensor invalid"); | |||||
| // return nullptr; | |||||
| // } | |||||
| auto arr = py::reinterpret_steal<py::array>( | |||||
| npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE)); | |||||
| if (!arr) { | |||||
| if (!hv) { | |||||
| PyErr_SetString(PyExc_ValueError, "tensor invalid"); | PyErr_SetString(PyExc_ValueError, "tensor invalid"); | ||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| auto arr = py::reinterpret_steal<py::array>( | |||||
| npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE)); | |||||
| if (hv->shape().is_scalar()) { | if (hv->shape().is_scalar()) { | ||||
| mgb_assert(PyArray_Check(arr.ptr())); | mgb_assert(PyArray_Check(arr.ptr())); | ||||
| return PyArray_Squeeze(reinterpret_cast<PyArrayObject*>(arr.ptr())); | return PyArray_Squeeze(reinterpret_cast<PyArrayObject*>(arr.ptr())); | ||||
| @@ -7,6 +7,8 @@ | |||||
| # software distributed under the License is distributed on an | # software distributed under the License is distributed on an | ||||
| # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| import collections | import collections | ||||
| import platform | |||||
| from tempfile import NamedTemporaryFile | |||||
| import numpy as np | import numpy as np | ||||
| import pytest | import pytest | ||||
| @@ -16,6 +18,8 @@ import megengine | |||||
| import megengine.core.tensor.megbrain_graph as G | import megengine.core.tensor.megbrain_graph as G | ||||
| import megengine.functional as F | import megengine.functional as F | ||||
| import megengine.jit as jit | import megengine.jit as jit | ||||
| import megengine.random as rand | |||||
| import megengine.utils.comp_graph_tools as cgtools | |||||
| from megengine.core._imperative_rt.core2 import apply | from megengine.core._imperative_rt.core2 import apply | ||||
| from megengine.core._trace_option import use_symbolic_shape | from megengine.core._trace_option import use_symbolic_shape | ||||
| from megengine.core.ops import builtin | from megengine.core.ops import builtin | ||||
| @@ -724,3 +728,26 @@ def test_nd_int_indexing(symbolic): | |||||
| np.testing.assert_equal(out.numpy(), npy_out) | np.testing.assert_equal(out.numpy(), npy_out) | ||||
| run_test([inp, idx], lambda inp, idx: inp[idx]) | run_test([inp, idx], lambda inp, idx: inp[idx]) | ||||
| @pytest.mark.skipif( | |||||
| platform.system() == "Windows", reason="windows temp file issue, fixme later" | |||||
| ) | |||||
| def test_subtensor_when_shape_invalid(): | |||||
| @jit.trace(symbolic=True, capture_as_const=True) | |||||
| def fun(inp): | |||||
| shape = inp.shape | |||||
| H = shape[-1] | |||||
| NH = H * 8 + 4 | |||||
| arr = F.arange(4, NH, 8) | |||||
| arr_shape = arr.shape | |||||
| return arr_shape[0] | |||||
| inp = rand.uniform(size=[1, 3, 224, 224]) | |||||
| fun(inp) | |||||
| with NamedTemporaryFile() as f: | |||||
| fun.dump(f.name, arg_names=["data"], optimize_for_inference=True) | |||||
| inp = rand.uniform(size=[1, 3, 512, 512]) | |||||
| net = cgtools.GraphInference(f.name) | |||||
| net.run(inp_dict={"data": inp}) | |||||
| @@ -87,3 +87,17 @@ def test_compatibility(): | |||||
| test_old_tensor("tensor_v1_1.mge") | test_old_tensor("tensor_v1_1.mge") | ||||
| test_old_tensor("tensor_v1_2.mge") | test_old_tensor("tensor_v1_2.mge") | ||||
| t = mge.tensor([1]) | |||||
| getattr(t, "qparams") | |||||
| new_args = t.__getnewargs__() | |||||
| assert ( | |||||
| len(new_args) == 3 | |||||
| and isinstance(new_args[0], np.ndarray) | |||||
| and new_args[1] == np.int32 | |||||
| and isinstance(new_args[2], str) | |||||
| ), "Modify Tensor __getnewargs__ may break pickle serialization compatible" | |||||
| state = t.__getstate__() | |||||
| assert set(state.keys()) == set( | |||||
| ["qparams"] | |||||
| ), "Modify Tensor __getstate__ may break pickle serialization compatible" | |||||
| @@ -681,3 +681,27 @@ def test_repr_module_reset_attr(): | |||||
| m1 = ResetAttrModule(False) | m1 = ResetAttrModule(False) | ||||
| output = [m0.__repr__(), m1.__repr__()] | output = [m0.__repr__(), m1.__repr__()] | ||||
| assert output == ground_truth | assert output == ground_truth | ||||
| def test_module_compatible(): | |||||
| class Empty(Module): | |||||
| def forward(self): | |||||
| pass | |||||
| empty_module = Empty() | |||||
| old_attributes = set( | |||||
| [ | |||||
| "_modules", | |||||
| "name", | |||||
| "training", | |||||
| "quantize_disabled", | |||||
| "_forward_pre_hooks", | |||||
| "_forward_hooks", | |||||
| "_name", | |||||
| "_short_name", | |||||
| ] | |||||
| ) | |||||
| current_attributes = set(empty_module.__dict__.keys()) | |||||
| assert ( | |||||
| old_attributes == current_attributes | |||||
| ), "Add or delete attributes in Module class may break compatibility of pickle serialization" | |||||
| @@ -60,7 +60,18 @@ def test_qat_convbn2d(): | |||||
| ) | ) | ||||
| def test_qat_conv(): | |||||
| @pytest.mark.parametrize( | |||||
| "padding, padding_mode", | |||||
| [ | |||||
| (0, "zeros"), | |||||
| ((1, 2), "zeros"), | |||||
| (3, "reflect"), | |||||
| ((1, 2), "reflect"), | |||||
| (4, "replicate"), | |||||
| ((1, 2), "replicate"), | |||||
| ], | |||||
| ) | |||||
| def test_qat_conv(padding, padding_mode): | |||||
| in_channels = 32 | in_channels = 32 | ||||
| out_channels = 64 | out_channels = 64 | ||||
| @@ -72,7 +83,13 @@ def test_qat_conv(): | |||||
| self.quant = QuantStub() | self.quant = QuantStub() | ||||
| self.dequant = DequantStub() | self.dequant = DequantStub() | ||||
| self.conv = Conv2d( | self.conv = Conv2d( | ||||
| in_channels, out_channels, kernel_size, groups=groups, bias=bias | |||||
| in_channels, | |||||
| out_channels, | |||||
| kernel_size, | |||||
| groups=groups, | |||||
| bias=bias, | |||||
| padding=padding, | |||||
| padding_mode=padding_mode, | |||||
| ) | ) | ||||
| self.conv_relu = ConvRelu2d( | self.conv_relu = ConvRelu2d( | ||||
| out_channels, in_channels, kernel_size, groups=groups, bias=bias | out_channels, in_channels, kernel_size, groups=groups, bias=bias | ||||
| @@ -236,11 +236,16 @@ def test_linear(): | |||||
| @pytest.mark.parametrize("module", ["Conv2d", "ConvBn2d", "ConvBnRelu2d"]) | @pytest.mark.parametrize("module", ["Conv2d", "ConvBn2d", "ConvBnRelu2d"]) | ||||
| def test_conv(module): | |||||
| normal_net = getattr(Float, module)(3, 3, 3, 1, 1, 1, bias=True) | |||||
| @pytest.mark.parametrize("padding_mode", ["zeros", "reflect", "replicate"]) | |||||
| def test_conv(module, padding_mode): | |||||
| normal_net = getattr(Float, module)( | |||||
| 3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode | |||||
| ) | |||||
| normal_net.eval() | normal_net.eval() | ||||
| qat_net = getattr(QAT, module)(3, 3, 3, 1, 1, 1, bias=True) | |||||
| qat_net = getattr(QAT, module)( | |||||
| 3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode | |||||
| ) | |||||
| qat_net.eval() | qat_net.eval() | ||||
| disable_observer(qat_net) | disable_observer(qat_net) | ||||
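For the Float-vs-QAT comparison, both nets are built with the same `padding_mode`. A rough sketch of the surrounding flow; `quantize_qat` is assumed here to be the usual converter from float modules to their QAT counterparts:

```python
import megengine.module as Float
from megengine.quantization import quantize_qat, disable_observer

# Sketch only: the QAT net is derived from the float net, so the padding_mode
# chosen above has to survive the conversion for the output comparison to hold.
float_net = Float.Conv2d(3, 3, 3, 1, 1, 1, bias=True, padding_mode="replicate")
float_net.eval()
qat_net = quantize_qat(float_net)  # assumed converter with its default qconfig
qat_net.eval()
disable_observer(qat_net)
```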
| @@ -51,6 +51,7 @@ bool is_scalar_shape(ValueRef shape) { | |||||
| if (shape.is<ScalarValue>()) { | if (shape.is<ScalarValue>()) { | ||||
| return false; | return false; | ||||
| } | } | ||||
| // may have a performance issue | |||||
| auto shape_of_shape = shape.shape(); | auto shape_of_shape = shape.shape(); | ||||
| if (!shape_of_shape) { | if (!shape_of_shape) { | ||||
| // assume not scalar | // assume not scalar | ||||
| @@ -211,14 +212,21 @@ std::vector<ValueRef> subtensor_rule( | |||||
| const Subtensor& subtensor, Span<ValueRef> inputs) { | const Subtensor& subtensor, Span<ValueRef> inputs) { | ||||
| mgb_assert(inputs.size() >= 1); | mgb_assert(inputs.size() >= 1); | ||||
| auto input = inputs[0]; | auto input = inputs[0]; | ||||
| size_t ndim = input.is<ScalarValue>() ? 0 : input.shape()->ndim; | |||||
| for (auto&& [axis, begin, end, step, idx] : subtensor.items) { | |||||
| if (idx) { | |||||
| ndim--; | |||||
| bool is_scalar; | |||||
| mgb_assert(!input.is<ScalarValue>(), "subtensor shouldn't have scalar input"); | |||||
| if (auto shape = input.shape()) { | |||||
| size_t ndim = shape->ndim; | |||||
| for (auto&& [axis, begin, end, step, idx] : subtensor.items) { | |||||
| if (idx) { | |||||
| ndim--; | |||||
| } | |||||
| } | } | ||||
| is_scalar = ndim == 0; | |||||
| } else { | |||||
| is_scalar = false; | |||||
| } | } | ||||
| auto output = imperative::apply(subtensor, unwrap_inputs(inputs))[0]; | auto output = imperative::apply(subtensor, unwrap_inputs(inputs))[0]; | ||||
| if (!ndim) { | |||||
| if (is_scalar) { | |||||
| return {ScalarValue::make(output)}; | return {ScalarValue::make(output)}; | ||||
| } else { | } else { | ||||
| return {output}; | return {output}; | ||||
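At the Python level the rewritten rule corresponds to plain integer indexing: every pure-index item removes one dimension, and only a fully indexed input with a known shape collapses to a scalar. A hedged illustration (the rule itself runs in the dispatch layer, not in user code):

```python
import numpy as np
import megengine as mge

x = mge.tensor(np.arange(6, dtype="float32").reshape(2, 3))
row = x[0]      # one index item: ndim 2 -> 1, still a 1-d tensor
item = x[0, 1]  # two index items: ndim 2 -> 0, wrapped as a scalar
print(row.shape, item.shape)  # expected: (3,) and ()
```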
| @@ -261,8 +269,7 @@ std::vector<ValueRef> fastpath_copy_rule( | |||||
| std::vector<ValueRef> reshape_rule(const Reshape& reshape, Span<ValueRef> inputs) { | std::vector<ValueRef> reshape_rule(const Reshape& reshape, Span<ValueRef> inputs) { | ||||
| mgb_assert(inputs.size() == 2); | mgb_assert(inputs.size() == 2); | ||||
| bool is_scalar = | |||||
| (!inputs[1].is<ScalarValue>()) && *inputs[1].shape() == ValueShape{0}; | |||||
| bool is_scalar = is_scalar_shape(inputs[1]); | |||||
| auto unwrapped_input = inputs[0].is<ScalarValue>() | auto unwrapped_input = inputs[0].is<ScalarValue>() | ||||
| ? inputs[0].cast<ScalarValue>().value() | ? inputs[0].cast<ScalarValue>().value() | ||||
| : inputs[0]; | : inputs[0]; | ||||
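`reshape_rule` now reuses `is_scalar_shape` instead of re-deriving the scalar check from the shape tensor. The user-visible case it covers is reshaping to an empty target shape; a hedged example:

```python
import numpy as np
import megengine as mge

x = mge.tensor(np.array([7.0], dtype="float32"))
s = x.reshape(())   # empty target shape -> scalar result
print(s.shape)      # expected: ()
```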
| @@ -15,7 +15,7 @@ | |||||
| #define MGE_MAJOR 1 | #define MGE_MAJOR 1 | ||||
| #define MGE_MINOR 8 | #define MGE_MINOR 8 | ||||
| #define MGE_PATCH 0 | |||||
| #define MGE_PATCH 2 | |||||
| // for rc version, could be like "rc1", "rc2", etc | // for rc version, could be like "rc1", "rc2", etc | ||||
| #define MGE_EXTRA_NAME "" | #define MGE_EXTRA_NAME "" | ||||
| @@ -57,6 +57,7 @@ | |||||
| #cmakedefine01 MEGDNN_64_BIT | #cmakedefine01 MEGDNN_64_BIT | ||||
| #cmakedefine01 MEGDNN_THREADS_512 | #cmakedefine01 MEGDNN_THREADS_512 | ||||
| #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS | #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS | ||||
| #cmakedefine01 MEGDNN_WITH_BENCHMARK | |||||
| // whether atlas is available | // whether atlas is available | ||||
| #ifndef MGB_ATLAS | #ifndef MGB_ATLAS | ||||
| @@ -1 +1 @@ | |||||
| Subproject commit 31798848e40c2752d4b3db193491a63b77455029 | |||||
| Subproject commit 5a639171ed8489d877a56bfd28cfd9e6df8c5403 | |||||