feat(mge/third_party): update cutlass version

chore(release): bump version
feat(cuda): add fp16 compute 16 kernel
--- a/dnn/scripts/cutlass_generator/BUILD
+++ b/dnn/scripts/cutlass_generator/BUILD
@@ -13,6 +13,12 @@ genrule(
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type simt $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type simt $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_fprop --type tensorop884 $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type simt $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D)
          CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D)
          """,
    tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
    visibility = ["//visibility:public"],
--- a/dnn/scripts/cutlass_generator/conv2d_operation.py
+++ b/dnn/scripts/cutlass_generator/conv2d_operation.py
--- a/dnn/scripts/cutlass_generator/gemm_operation.py
+++ b/dnn/scripts/cutlass_generator/gemm_operation.py
--- a/dnn/scripts/cutlass_generator/gen_list.py
+++ b/dnn/scripts/cutlass_generator/gen_list.py
@@ -3,6 +3,9 @@ from generator import (
    GenerateGemvOperations,
    GenerateConv2dOperations,
    GenerateDeconvOperations,
    GenerateDwconv2dFpropOperations,
    GenerateDwconv2dDgradOperations,
    GenerateDwconv2dWgradOperations,
 )


@@ -21,6 +24,12 @@ def write_op_list(f, gen_op, gen_type):
        operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
    elif gen_op == "deconv":
        operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
    elif gen_op == "dwconv2d_fprop":
        operations = GenerateDwconv2dFpropOperations(GenArg(gen_op, gen_type))
    elif gen_op == "dwconv2d_dgrad":
        operations = GenerateDwconv2dDgradOperations(GenArg(gen_op, gen_type))
    elif gen_op == "dwconv2d_wgrad":
        operations = GenerateDwconv2dWgradOperations(GenArg(gen_op, gen_type))
    for op in operations:
        f.write('    "%s.cu",\n' % op.procedural_name())
    if gen_op != "gemv":
@@ -40,4 +49,10 @@ if __name__ == "__main__":
        write_op_list(f, "conv2d", "simt")
        write_op_list(f, "conv2d", "tensorop8816")
        write_op_list(f, "conv2d", "tensorop8832")
        write_op_list(f, "dwconv2d_fprop", "simt")
        write_op_list(f, "dwconv2d_fprop", "tensorop884")
        write_op_list(f, "dwconv2d_dgrad", "simt")
        write_op_list(f, "dwconv2d_dgrad", "tensorop884")
        write_op_list(f, "dwconv2d_wgrad", "simt")
        write_op_list(f, "dwconv2d_wgrad", "tensorop884")
        f.write("]")
--- a/dnn/scripts/cutlass_generator/generator.py
+++ b/dnn/scripts/cutlass_generator/generator.py
--- a/dnn/scripts/cutlass_generator/library.py
+++ b/dnn/scripts/cutlass_generator/library.py
--- a/dnn/scripts/cutlass_generator/list.bzl
+++ b/dnn/scripts/cutlass_generator/list.bzl
--- a/dnn/scripts/cutlass_generator/manifest.py
+++ b/dnn/scripts/cutlass_generator/manifest.py
@@ -10,24 +10,25 @@ import shutil

 from library import *
 from gemm_operation import *
 from conv2d_operation import *  
 from conv2d_operation import *

 ###################################################################################################


 class EmitOperationKindLibrary:
  def __init__(self, generated_path, kind, args):
    self.generated_path = generated_path
    self.kind = kind
    self.args = args
    def __init__(self, generated_path, kind, args):
        self.generated_path = generated_path
        self.kind = kind
        self.args = args

    self.emitters = {
      OperationKind.Gemm: EmitGemmConfigurationLibrary
      , OperationKind.Conv2d: EmitConv2dConfigurationLibrary
    }
        self.emitters = {
            OperationKind.Gemm: EmitGemmConfigurationLibrary,
            OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
        }

    self.configurations = [];
        self.configurations = []

    self.header_template ="""
        self.header_template = """
 /*
 Generated by manifest.py - Do not edit.
 */
@@ -42,17 +43,19 @@ namespace library {
 ///////////////////////////////////////////////////////////////////////////////////////////////////

 """
    self.entry_template = """
        self.entry_template = """

 //
 // Entry point to construct operations
 //
 void initialize_all_${operation_name}_operations(Manifest &manifest) {
 """
    self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
    self.configuration_template ="  initialize_${configuration_name}(manifest);\n"
        self.configuration_prototype_template = (
            "void initialize_${configuration_name}(Manifest &manifest);\n"
        )
        self.configuration_template = "  initialize_${configuration_name}(manifest);\n"

    self.epilogue_template ="""
        self.epilogue_template = """

 }

@@ -63,91 +66,118 @@ void initialize_all_${operation_name}_operations(Manifest &manifest) {

 """

  #
  def __enter__(self):
    self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
    os.mkdir(self.operation_path)
    #
    def __enter__(self):
        self.operation_path = os.path.join(
            self.generated_path, OperationKindNames[self.kind]
        )
        os.mkdir(self.operation_path)

        self.top_level_path = os.path.join(
            self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]
        )

        self.top_level_file = open(self.top_level_path, "w")
        self.top_level_file.write(self.header_template)

    self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind])
        self.source_files = [self.top_level_path]

    self.top_level_file = open(self.top_level_path, "w")
    self.top_level_file.write(self.header_template)
        return self

    self.source_files = [self.top_level_path,]
    #
    def emit(self, configuration_name, operations):

    return self
        with self.emitters[self.kind](
            self.operation_path, configuration_name
        ) as configuration_emitter:
            for operation in operations:
                configuration_emitter.emit(operation)

  #
  def emit(self, configuration_name, operations):
            self.source_files.append(configuration_emitter.configuration_path)

    with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter:
      for operation in operations:
        configuration_emitter.emit(operation)
      
      self.source_files.append(configuration_emitter.configuration_path)
        self.configurations.append(configuration_name)
        self.top_level_file.write(
            SubstituteTemplate(
                self.configuration_prototype_template,
                {"configuration_name": configuration_name},
            )
        )

    self.configurations.append(configuration_name)
    self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} ))
    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.top_level_file.write(
            SubstituteTemplate(
                self.entry_template, {"operation_name": OperationKindNames[self.kind]}
            )
        )

  #
  def __exit__(self, exception_type, exception_value, traceback):
    self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))
        for configuration_name in self.configurations:
            self.top_level_file.write(
                SubstituteTemplate(
                    self.configuration_template,
                    {"configuration_name": configuration_name},
                )
            )

    for configuration_name in self.configurations:
      self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))
        self.top_level_file.write(self.epilogue_template)
        self.top_level_file.close()

    self.top_level_file.write(self.epilogue_template)
    self.top_level_file.close()

 ###################################################################################################
 ###################################################################################################


 class Options:
  def __init__(self):
    pass
    def __init__(self):
        pass


 ###################################################################################################

 #
 class Manifest:

  #
  def __init__(self, args):
    self.operations = {}
    self.args = args
    #
    def __init__(self, args):
        self.operations = {}
        self.args = args

        architectures = (
            args.architectures.split(";") if len(args.architectures) else ["50"]
        )
        self.compute_capabilities = [int(x) for x in architectures]

    architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
    self.compute_capabilities = [int(x) for x in architectures]
    
    self.selected_kernels = []
    
    if args.operations == 'all':
      self.operations_enabled = []
    else:
        self.selected_kernels = []

      operations_list = [
        OperationKind.Gemm
        , OperationKind.Conv2d     
      ] 
        if args.operations == "all":
            self.operations_enabled = []
        else:

      self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
            operations_list = [OperationKind.Gemm, OperationKind.Conv2d]

    if args.kernels == 'all':
      self.kernel_names = []
    else:
      self.kernel_names = [x for x in args.kernels.split(',') if x != '']
            self.operations_enabled = [
                x
                for x in operations_list
                if OperationKindNames[x] in args.operations.split(",")
            ]

    self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
        if args.kernels == "all":
            self.kernel_names = []
        else:
            self.kernel_names = [x for x in args.kernels.split(",") if x != ""]

    if args.kernel_filter_file is None:
        self.kernel_filter_list = []
    else:
        self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
        self.ignore_kernel_names = [
            x for x in args.ignore_kernels.split(",") if x != ""
        ]

        if args.kernel_filter_file is None:
            self.kernel_filter_list = []
        else:
            self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)

    self.operation_count = 0
    self.operations_by_name = {}
    self.top_level_prologue = '''
        self.operation_count = 0
        self.operations_by_name = {}
        self.top_level_prologue = """

 #include "cutlass/library/library.h"
 #include "cutlass/library/manifest.h"
@@ -159,208 +189,241 @@ ${prototypes}

 void initialize_all(Manifest &manifest) {

 '''
    self.top_level_reserve = '  manifest.reserve(${operation_count});\n\n'
    self.top_level_epilogue = '''
 """
        self.top_level_reserve = "  manifest.reserve(${operation_count});\n\n"
        self.top_level_epilogue = """
 }

 } // namespace library
 } // namespace cutlass

 '''


  def get_kernel_filters (self, kernelListFile):
    if os.path.isfile(kernelListFile):
        with open(kernelListFile, 'r') as fileReader:
            lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
        
        lines = [re.compile(line) for line in lines if line]
        return lines
    else:
        return []
 """

    def get_kernel_filters(self, kernelListFile):
        if os.path.isfile(kernelListFile):
            with open(kernelListFile, "r") as fileReader:
                lines = [
                    line.rstrip() for line in fileReader if not line.startswith("#")
                ]

            lines = [re.compile(line) for line in lines if line]
            return lines
        else:
            return []

  def filter_out_kernels(self, kernel_name, kernel_filter_list):
    def filter_out_kernels(self, kernel_name, kernel_filter_list):

    for kernel_filter_re in kernel_filter_list:
        if kernel_filter_re.search(kernel_name) is not None:
            return True
        
    return False
        for kernel_filter_re in kernel_filter_list:
            if kernel_filter_re.search(kernel_name) is not None:
                return True

    
  #
  def _filter_string_matches(self, filter_string, haystack):
    ''' Returns true if all substrings appear in the haystack in order'''
    substrings = filter_string.split('*')
    for sub in substrings:
      idx = haystack.find(sub)
      if idx < 0:
        return False
      haystack = haystack[idx + len(sub):]
    return True

  #
  def filter(self, operation):
    ''' Filtering operations based on various criteria'''

    # filter based on compute capability
    enabled = False
    for cc in self.compute_capabilities:
      if cc >= operation.tile_description.minimum_compute_capability and \
        cc <= operation.tile_description.maximum_compute_capability:

        enabled = True
        break

    if not enabled:
      return False

    if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled:
      return False

    # eliminate duplicates
    if operation.procedural_name() in self.operations_by_name.keys():
      return False

    # Filter based on list of valid substrings
    if len(self.kernel_names):
      name = operation.procedural_name()
      enabled = False

      # compare against the include list
      for name_substr in self.kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = True
          break

      # compare against the exclude list
      for name_substr in self.ignore_kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = False
          break
          
    if len(self.kernel_filter_list) > 0:

    #
    def _filter_string_matches(self, filter_string, haystack):
        """ Returns true if all substrings appear in the haystack in order"""
        substrings = filter_string.split("*")
        for sub in substrings:
            idx = haystack.find(sub)
            if idx < 0:
                return False
            haystack = haystack[idx + len(sub) :]
        return True

    #
    def filter(self, operation):
        """ Filtering operations based on various criteria"""

        # filter based on compute capability
        enabled = False
        if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
            enabled = True
        for cc in self.compute_capabilities:
            if (
                cc >= operation.tile_description.minimum_compute_capability
                and cc <= operation.tile_description.maximum_compute_capability
            ):

                enabled = True
                break

        if not enabled:
            return False

        if (
            len(self.operations_enabled)
            and not operation.operation_kind in self.operations_enabled
        ):
            return False

        # eliminate duplicates
        if operation.procedural_name() in self.operations_by_name.keys():
            return False

        # Filter based on list of valid substrings
        if len(self.kernel_names):
            name = operation.procedural_name()
            enabled = False

            # compare against the include list
            for name_substr in self.kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = True
                    break

            # compare against the exclude list
            for name_substr in self.ignore_kernel_names:
                if self._filter_string_matches(name_substr, name):
                    enabled = False
                    break

        if len(self.kernel_filter_list) > 0:
            enabled = False
            if self.filter_out_kernels(
                operation.procedural_name(), self.kernel_filter_list
            ):
                enabled = True

        # todo: filter based on compute data type
        return enabled

    #

    #
    def append(self, operation):
        """ 
      Inserts the operation.

      operation_kind -> configuration_name -> []
    """

    # todo: filter based on compute data type
    return enabled
  #
        if self.filter(operation):

  #
  def append(self, operation):
    ''' 
      Inserts the operation.
            self.selected_kernels.append(operation.procedural_name())

      operation_kind -> configuration_name -> []
    '''
            self.operations_by_name[operation.procedural_name()] = operation

    if self.filter(operation):
    
      self.selected_kernels.append(operation.procedural_name())
            # add the configuration
            configuration_name = operation.configuration_name()

      self.operations_by_name[operation.procedural_name()] = operation
            if operation.operation_kind not in self.operations.keys():
                self.operations[operation.operation_kind] = {}

      # add the configuration
      configuration_name = operation.configuration_name()
            if (
                configuration_name
                not in self.operations[operation.operation_kind].keys()
            ):
                self.operations[operation.operation_kind][configuration_name] = []

      if operation.operation_kind not in self.operations.keys():
        self.operations[operation.operation_kind] = {}
            self.operations[operation.operation_kind][configuration_name].append(
                operation
            )
            self.operation_count += 1

      if configuration_name not in self.operations[operation.operation_kind].keys():
        self.operations[operation.operation_kind][configuration_name] = []
    #

      self.operations[operation.operation_kind][configuration_name].append(operation)
      self.operation_count += 1
  #
    #
    def emit(self, target=GeneratorTarget.Library):

  #
  def emit(self, target = GeneratorTarget.Library):
        operation_emitters = {GeneratorTarget.Library: EmitOperationKindLibrary}

    operation_emitters = {
      GeneratorTarget.Library: EmitOperationKindLibrary 
    }
        generated_path = os.path.join(self.args.curr_build_dir, "generated")

    generated_path = os.path.join(self.args.curr_build_dir, 'generated')
        # create generated/
        if os.path.exists(generated_path):
            shutil.rmtree(generated_path)

    # create generated/
    if os.path.exists(generated_path):
      shutil.rmtree(generated_path)
        os.mkdir(generated_path)

    os.mkdir(generated_path)
        source_files = []

    source_files = []
        top_level_path = os.path.join(generated_path, "initialize_all.cpp")
        with open(top_level_path, "w") as top_level_file:

    top_level_path = os.path.join(generated_path, 'initialize_all.cpp')
    with open(top_level_path, 'w') as top_level_file:
            if target == GeneratorTarget.Library:
                source_files.append(top_level_path)

      if target == GeneratorTarget.Library:
        source_files.append(top_level_path)
            prototypes = []
            for operation_kind, configurations in self.operations.items():
                prototypes.append(
                    SubstituteTemplate(
                        "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

      prototypes = []
      for operation_kind, configurations in self.operations.items():
        prototypes.append(SubstituteTemplate(
          "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
          {'operation_kind': OperationKindNames[operation_kind]}))
            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_prologue, {"prototypes": "\n".join(prototypes)}
                )
            )

      top_level_file.write(SubstituteTemplate(self.top_level_prologue,
        {'prototypes': "\n".join(prototypes)}))
            top_level_file.write(
                SubstituteTemplate(
                    self.top_level_reserve,
                    {"operation_count": str(self.operation_count)},
                )
            )

      top_level_file.write(SubstituteTemplate(
        self.top_level_reserve, {'operation_count': str(self.operation_count)}))
            # for each operation kind, emit initializer for all configurations
            for operation_kind, configurations in self.operations.items():

      # for each operation kind, emit initializer for all configurations
      for operation_kind, configurations in self.operations.items():
        
        with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
          for configuration_name, operations in configurations.items():
            operation_kind_emitter.emit(configuration_name, operations)
                with operation_emitters[target](
                    generated_path, operation_kind, self.args
                ) as operation_kind_emitter:
                    for configuration_name, operations in configurations.items():
                        operation_kind_emitter.emit(configuration_name, operations)

          source_files += operation_kind_emitter.source_files
                    source_files += operation_kind_emitter.source_files

        top_level_file.write(SubstituteTemplate(
          "  initialize_all_${operation_kind}_operations(manifest);\n",
          {'operation_kind': OperationKindNames[operation_kind]}))
                top_level_file.write(
                    SubstituteTemplate(
                        "  initialize_all_${operation_kind}_operations(manifest);\n",
                        {"operation_kind": OperationKindNames[operation_kind]},
                    )
                )

      top_level_file.write(self.top_level_epilogue)
            top_level_file.write(self.top_level_epilogue)

    # write the manifest.cmake file containing paths from all targets
    manifest_path = os.path.join(generated_path, "manifest.cmake")
    with open(manifest_path, "w") as manifest_file:
        # write the manifest.cmake file containing paths from all targets
        manifest_path = os.path.join(generated_path, "manifest.cmake")
        with open(manifest_path, "w") as manifest_file:

      target_name = 'cutlass_library_objs'
            target_name = "cutlass_library_objs"

      target_text = SubstituteTemplate("""cutlass_target_sources(
            target_text = SubstituteTemplate(
                """cutlass_target_sources(
  ${target_name}
  BATCH_SOURCES ON
  PRIVATE
 """, { 'target_name': target_name})
 """,
                {"target_name": target_name},
            )

      manifest_file.write(target_text)
            manifest_file.write(target_text)

            for source_file in source_files:
                manifest_file.write("    %s\n" % str(source_file.replace("\\", "/")))
            manifest_file.write(")")

    #

      for source_file in source_files:
        manifest_file.write("    %s\n" % str(source_file.replace('\\', '/')))
      manifest_file.write(")")
  #

 ###################################################################################################


 def GenerateManifest(args, operations, output_dir):
  assert isinstance(operations, list)
  if len(operations) == 0:
    return
  op = operations[0]
  required_cuda_ver_major = op.required_cuda_ver_major
  required_cuda_ver_minor = op.required_cuda_ver_minor

  manifest_path = os.path.join(output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type))
  f = open(manifest_path, "w")
  f.write("""
    assert isinstance(operations, list)
    if len(operations) == 0:
        return
    op = operations[0]
    required_cuda_ver_major = op.required_cuda_ver_major
    required_cuda_ver_minor = op.required_cuda_ver_minor

    manifest_path = os.path.join(
        output_dir, "all_%s_%s_operations.cu" % (args.operations, args.type)
    )
    f = open(manifest_path, "w")
    f.write(
        """
 /*
 Generated by generator.py - Do not edit.
 */
@@ -374,24 +437,35 @@ def GenerateManifest(args, operations, output_dir):
 namespace cutlass {
 namespace library {

 """ % (str(required_cuda_ver_major), str(required_cuda_ver_major), str(required_cuda_ver_minor)))

  for op in operations:
      f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

  f.write("""
 """
        % (
            str(required_cuda_ver_major),
            str(required_cuda_ver_major),
            str(required_cuda_ver_minor),
        )
    )

    for op in operations:
        f.write("void initialize_%s(Manifest &manifest);\n" % op.procedural_name())

    f.write(
        """
 void initialize_all_%s_%s_operations(Manifest &manifest) {
 """ % (args.operations, args.type))
 """
        % (args.operations, args.type)
    )

  for op in operations:
    f.write("    initialize_%s(manifest);\n" % op.procedural_name())
    for op in operations:
        f.write("    initialize_%s(manifest);\n" % op.procedural_name())

  f.write("""
    f.write(
        """
 }

 }  // namespace library
 }  // namespace cutlass

 #endif
 """)
  f.close()
 """
    )
    f.close()
--- a/dnn/src/CMakeLists.txt
+++ b/dnn/src/CMakeLists.txt
@@ -181,6 +181,12 @@ if(MGE_WITH_CUDA)
  gen_cutlass_kimpl(conv2d simt CUTLASS_SOURCES)
  gen_cutlass_kimpl(conv2d tensorop8816 CUTLASS_SOURCES)
  gen_cutlass_kimpl(conv2d tensorop8832 CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_fprop simt CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_fprop tensorop884 CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_dgrad simt CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES)
  gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES)
  list(APPEND SOURCES ${CUTLASS_SOURCES})
  list(APPEND SOURCES ${CUSOURCES})
 endif()
--- a/dnn/src/cuda/conv_bias/algo.cpp
+++ b/dnn/src/cuda/conv_bias/algo.cpp
@@ -19,6 +19,7 @@ using namespace cuda;
 ConvBiasForwardImpl::AlgoPack::AlgoPack() {
    non_cudnn_algos.push_back(&chanwise);
    non_cudnn_algos.push_back(&chanwise_small);
    non_cudnn_algos.push_back(&depthwise_large_filter);

    non_cudnn_algos.push_back(&inplace_matmul);
    non_cudnn_algos.push_back(&matmul);
@@ -34,6 +35,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
    std::vector<AlgoBase*> conv_algos;
    conv_algos.push_back(&chanwise);
    conv_algos.push_back(&chanwise_small);
    conv_algos.push_back(&depthwise_large_filter);
    conv_algos.push_back(&chanwise8x8x32);
    for (auto&& algo : cudnn_convs) {
        conv_algos.push_back(&algo);
@@ -92,6 +94,7 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
    for (auto&& algo : int8_nchw4_dotprod) {
        all_algos.push_back(&algo);
    }
    fill_dwconv_algos();
    all_algos.push_back(&int8_chwn4_dotprod);
    all_algos.push_back(&fallback_nchw_qs8);
    for (size_t i = all_algo_size; i < all_algos.size(); ++i) {
@@ -301,6 +304,34 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
 }
 #endif

 void ConvBiasForwardImpl::AlgoPack::fill_dwconv_algos() {
    using AlgoParam = AlgoCutlassConvolutionBase::AlgoParam;
    /// preferred algo
    f32_implicit_bmm.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 1, 1, 1, 2});
    f32_implicit_bmm.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 1, 1, 1, 2});
    for (auto&& algo : f32_implicit_bmm) {
        all_algos.push_back(&algo);
    }
 #if CUDA_VERSION >= 10010
    /// preferred algo
    f16_implicit_bmm.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
    f16_implicit_bmm.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
    f16_implicit_bmm.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
    f16_implicit_bmm.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
    f16_implicit_bmm.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
    for (auto&& algo : f16_implicit_bmm) {
        all_algos.push_back(&algo);
    }
 #endif
 }

 void ConvBiasForwardImpl::AlgoPack::fill_dp4a_algos() {
    using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam;
    int8_nchw4_dotprod.emplace_back(AlgoParam{128, 128, 32, 64, 32, 32, 1, 1, 4, 2});
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -22,7 +22,6 @@
 #include "src/cuda/conv_bias/opr_impl.h"
 #include "src/cuda/convolution_helper/parameter.cuh"
 #include "src/cuda/cudnn_wrapper.h"
 #include "src/cuda/handle.h"

 #include <cuda.h>
 #include <memory>
@@ -57,6 +56,7 @@ public:
        CUDA_CUDNN_CONVBIAS,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_DEPTHWISE_LARGE_FILTER,
        CUDA_CHANWISE_INT8X8X32,
        CUDA_CUDNN_CONV,
        CUDA_INPLACE_MATMUL,
@@ -84,7 +84,9 @@ public:
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
        CUDA_FALLBACK_NCHW_INT4
        CUDA_FALLBACK_NCHW_INT4,
        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -255,6 +257,26 @@ private:
    mutable std::string m_name;
 };

 class ConvBiasForwardImpl::AlgoDepthwiseLargeFilter final : public AlgoBase {
 public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {});
        }
        return m_name.c_str();
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER)
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

 private:
    mutable std::string m_name;
 };

 class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
 public:
    bool is_available(const SizeArgs& args) const override;
@@ -503,6 +525,8 @@ public:
 * +----+--- AlgoInt4Int4NHWCIMMAImplicitGemm
 * +----+--- AlgoUInt4Int4NHWCIMMAImplicitGemm
 * +
 * +--- AlgoFloat32NCHWImplicitBatchedGemm
 * +--- AlgoFloat16NCHWHMMAImplicitBatchedGemm
 */

 /*
@@ -516,7 +540,13 @@ public:

    // corresponds to cutlass::conv::ConvType. we hope that algo.h does not
    // depend on cutlass headers
    enum class ConvType { kConvolution, kBatchConvolution, kLocal, kLocalShare };
    enum class ConvType {
        kConvolution,
        kBatchConvolution,
        kLocal,
        kLocalShare,
        kDepthwiseConvolution,
    };

    // common parameters for operation selection
    struct AlgoParam {
@@ -558,7 +588,8 @@ public:
            size_t wo, size_t ph, size_t pw, size_t sh, size_t sw, size_t dh, size_t dw,
            const void* alpha, const void* beta, const void* gamma, const void* delta,
            const void* theta, const void* threshold, const void* dst_scale,
            cudaStream_t stream, const void* extra_param = nullptr) const;
            cudaStream_t stream, const void* extra_param = nullptr,
            size_t groups = 1) const;

 protected:
    AlgoParam m_algo_param;
@@ -992,6 +1023,54 @@ private:
 };
 #endif

 class ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
        : public AlgoCutlassConvolutionBase {
 public:
    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
            : AlgoCutlassConvolutionBase(algo_param) {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf(
                        "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
                        m_algo_param.to_string().c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
        return 0;
    }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); };
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32);

 private:
    std::string m_name;
 };

 class ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
        : public AlgoCutlassConvolutionBase {
 public:
    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
            : AlgoCutlassConvolutionBase(algo_param) {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf(
                        "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
                        m_algo_param.to_string().c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& /* args */) const override {
        return 0;
    }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); };
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16);

 private:
    std::string m_name;
 };

 class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
 public:
    bool is_available(const SizeArgs& args) const override;
@@ -1025,6 +1104,7 @@ public:
    AlgoFallbackNCHWQS8 fallback_nchw_qs8;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoDepthwiseLargeFilter depthwise_large_filter;
    AlgoChanwise8x8x32 chanwise8x8x32;
    AlgoInplaceMatmul inplace_matmul;
    AlgoMatmul matmul;
@@ -1048,6 +1128,8 @@ public:
    std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
    std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma;
 #endif
    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> f32_implicit_bmm;
    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> f16_implicit_bmm;
    AlgoGroupConvGeneral group;
    AlgoBFloat16 bfloat16;

@@ -1063,6 +1145,7 @@ private:
 #endif
    void fill_cudnn_algos();
    void fill_dp4a_algos();
    void fill_dwconv_algos();
 };

 }  // namespace cuda
--- a/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
+++ b/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
@@ -0,0 +1,179 @@
 /**
 * \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #pragma once
 namespace {
 #define DIVUP(x, y) (((x) + (y)-1) / (y))
 enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD };

 template <typename ThreadConfig_, int oh_, int ow_>
 struct OutTileConfig {
    using ThreadConfig = ThreadConfig_;
    static int const unroll_h = oh_;
    static int const unroll_w = ThreadConfig::thread_x * ow_;
    static int const unroll_size = unroll_h * unroll_w;
    static int const block_h = unroll_h * ThreadConfig::thread_y;
    static int const block_w = unroll_w;
 };

 template <int fh_, int fw_>
 struct FilterTileConfig {
    static int const unroll_h = fh_;
    static int const unroll_w = fw_;
    static int const unroll_size = unroll_h * unroll_w;
 };

 template <int x_, int y_>
 struct ThreadConfig {
    static int const thread_x = x_;
    static_assert((thread_x & (thread_x - 1)) == 0, "thread_x must be pow of 2!");
    static int const thread_y = y_;
    static int const nr_threads = x_ * y_;
 };

 template <
        typename ldg_dtype, typename ThreadConfig_, typename OutTileConfig_,
        typename FilterTileConfig_, int stride_w, int stride_h>
 struct ConvTraitInner {
    using ThreadConfig = ThreadConfig_;
    using OutTileConfig = OutTileConfig_;
    using FilterTileConfig = FilterTileConfig_;
    using CompType = ldg_dtype;

    struct SrcTileConfig {
        static int const unroll_h =
                OutTileConfig::unroll_h + FilterTileConfig::unroll_h - 1;
        static int const unroll_w =
                (OutTileConfig::unroll_w - 1) * stride_w + FilterTileConfig::unroll_w;
        static int const unroll_size = unroll_h * unroll_w;
    };

    struct SrcTileCount {
        static int const smem_src_h =
                (OutTileConfig::block_h - 1) * stride_h + FilterTileConfig::unroll_h;
        static int const smem_buff_h = FilterTileConfig::unroll_h;
        static int const smem_load_h = smem_src_h + smem_buff_h;
        static int const smem_h = smem_load_h + smem_buff_h;
        static int const smem_w =
                DIVUP((OutTileConfig::block_w - 1) * stride_w +
                              FilterTileConfig::unroll_w * ThreadConfig::thread_x,
                      2) *
                2;
        static int const smem_size = smem_h * smem_w;
        static int const load_w =
                smem_w > ThreadConfig::nr_threads ? ThreadConfig::nr_threads : smem_w;
        static int const load_h = 1;
        static int const reg_h = 1;
        static int const reg_w = DIVUP(smem_w, load_w);
        static bool constexpr check_bounds_h = smem_h % load_h != 0;
        static bool constexpr check_bounds_w = smem_w % load_w != 0;
    };

    struct FilterTileCount {
        static int const smem_flt_h = FilterTileConfig::unroll_h;
        static int const smem_buff_h = FilterTileConfig::unroll_h;
        static int const smem_load_h = smem_flt_h + smem_buff_h;
        static int const smem_h = smem_load_h + smem_buff_h;
        static int const smem_w = FilterTileConfig::unroll_w * ThreadConfig::thread_x;
        static int const smem_size = smem_h * smem_w;
        static int const load_w = smem_w > 32 ? 32 : smem_w;
        static int const load_h = ThreadConfig::nr_threads / load_w;
        static int const reg_h = 1;
        static int const reg_w = DIVUP(smem_w, load_w);
        static bool constexpr check_bounds_h = smem_h % load_h != 0;
        static bool constexpr check_bounds_w = smem_w % load_w != 0;
    };
 };

 #define CHECK_AB_FWD(a, b)                                                             \
    if (param.out_w > b * 4) {                                                         \
        if (param.stride_h == 1 && param.stride_w == 1) {                              \
            using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>;              \
            using ThreadConfig_ = ThreadConfig<4, 32>;                                 \
            using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>;     \
            using IConvTrait = ConvTraitInner<                                         \
                    float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>;    \
            using SrcTileConfig = typename IConvTrait::SrcTileConfig;                  \
            using SrcTileCount = typename IConvTrait::SrcTileCount;                    \
            using FilterTileCount = typename IConvTrait::FilterTileCount;              \
                                                                                       \
            if (device_prop.regsPerBlock <                                             \
                        4 * 32 *                                                       \
                                (FilterTileConfig_::unroll_h *                         \
                                         FilterTileConfig_::unroll_w * 2 +             \
                                 SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \
                device_prop.sharedMemPerBlock <                                        \
                        static_cast<size_t>(                                           \
                                (SrcTileCount::smem_size +                             \
                                 FilterTileCount::smem_size))) {                       \
                return false;                                                          \
            }                                                                          \
            return true;                                                               \
        } else if (param.stride_h == 2 && param.stride_w == 2) {                       \
            using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>;              \
            using ThreadConfig_ = ThreadConfig<4, 32>;                                 \
            using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>;     \
            using IConvTrait = ConvTraitInner<                                         \
                    float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 2, 2>;    \
            using SrcTileConfig = typename IConvTrait::SrcTileConfig;                  \
            using SrcTileCount = typename IConvTrait::SrcTileCount;                    \
            using FilterTileCount = typename IConvTrait::FilterTileCount;              \
                                                                                       \
            if (device_prop.regsPerBlock <                                             \
                        4 * 32 *                                                       \
                                (FilterTileConfig_::unroll_h *                         \
                                         FilterTileConfig_::unroll_w * 2 +             \
                                 SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) || \
                device_prop.sharedMemPerBlock <                                        \
                        static_cast<size_t>(                                           \
                                (SrcTileCount::smem_size +                             \
                                 FilterTileCount::smem_size))) {                       \
                return false;                                                          \
            }                                                                          \
            return true;                                                               \
        }                                                                              \
    }

 #define CHECK_AB_BWD(a, b)                                                             \
    if (param.out_w > b * 4) {                                                         \
        using FilterTileConfig_ = FilterTileConfig<unroll_fh, a + 2>;                  \
        using ThreadConfig_ = ThreadConfig<4, 32>;                                     \
        using OutTileConfig_ = OutTileConfig<ThreadConfig_, unroll_oh, b + 1>;         \
        using IConvTrait = ConvTraitInner<                                             \
                float, ThreadConfig_, OutTileConfig_, FilterTileConfig_, 1, 1>;        \
        using SrcTileConfig = typename IConvTrait::SrcTileConfig;                      \
        using SrcTileCount = typename IConvTrait::SrcTileCount;                        \
        using FilterTileCount = typename IConvTrait::FilterTileCount;                  \
                                                                                       \
        if (device_prop.regsPerBlock <                                                 \
                    4 * 32 *                                                           \
                            (FilterTileConfig_::unroll_h *                             \
                                     FilterTileConfig_::unroll_w * 2 +                 \
                             SrcTileConfig::unroll_h * SrcTileConfig::unroll_w) ||     \
            device_prop.sharedMemPerBlock <                                            \
                    static_cast<size_t>(                                               \
                            (SrcTileCount::smem_size + FilterTileCount::smem_size))) { \
            return false;                                                              \
        }                                                                              \
        return true;                                                                   \
    }

 #define CHECK_A(a, cb)                                                                                                                                                                                                                                                                                                                                                                                          \
    if (param.flt_w > a * 4) {                                                                                                                                                                                                                                                                                                                                                                                  \
        CHECK_AB_##cb(                                                                                                                                                                                                                                                                                                                                                                                          \
                a,                                                                                                                                                                                                                                                                                                                                                                                              \
                15) else CHECK_AB_##cb(a, 14) else CHECK_AB_##cb(a, 13) else CHECK_AB_##cb(a, 12) else CHECK_AB_##cb(a, 11) else CHECK_AB_##cb(a, 10) else CHECK_AB_##cb(a, 9) else CHECK_AB_##cb(a, 8) else CHECK_AB_##cb(a, 7) else CHECK_AB_##cb(a, 6) else CHECK_AB_##cb(a, 5) else CHECK_AB_##cb(a, 4) else CHECK_AB_##cb(a, 3) else CHECK_AB_##cb(a, 2) else CHECK_AB_##cb(a, 1) else CHECK_AB_##cb(a, 0) \
    }

 #define CHECK(cb)  \
    CHECK_A(6, cb) \
    else CHECK_A(4, cb) else CHECK_A(2, cb) else CHECK_A(0, cb)

 }  // namespace
--- a/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
+++ b/dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
@@ -0,0 +1,914 @@
 /**
 * \file dnn/src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
 #pragma once
 #include "depthwise_large_filter.cuh"
 #include "src/cuda/cuda_shfl_compat.cuh"

 namespace {

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 struct Global2SharedMem {
    using TileCount = TileCount_;
    using ThreadConfig = ThreadConfig_;
    T reg[TileCount::reg_w];
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;
    const int tid = tidy * ThreadConfig::thread_x + tidx;
    const int gl_load_y = tid / TileCount::load_w;
    const int gl_load_x = tid - gl_load_y * TileCount::load_w;
    const bool is_fwd = (kDirection == DIRECTION_FORWARD);
    int w_offset;

    T* smem;
    int stride;
    int start_h, start_w, bound_h, bound_w, ring_smem_h, ring_src_h;
    // just used in backward src data
    int stride_h, stride_w;
    const T* g_ptr;

    __device__ __forceinline__ Global2SharedMem(
            T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_,
            int stride_w_);

    __device__ __forceinline__ void first_copy();
    __device__ __forceinline__ void copy();
    __device__ __forceinline__ void commit();
    __device__ __forceinline__ void iter_forward();
    __device__ __forceinline__ T* sh_ptr(int y, int x) {
        return &smem[y * TileCount::smem_w + x];
    }

    __device__ __forceinline__ T* sh_ptr_as_copy_t(int y, int x) {
        return reinterpret_cast<T*>(sh_ptr(y, x));
    }
 };

 template <
        typename ldg_dtype, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename OutTileConfig_, typename FilterTileConfig_, int stride_w, int stride_h>
 struct ConvTrait {
    using ThreadConfig = ThreadConfig_;
    using OutTileConfig = OutTileConfig_;
    using FilterTileConfig = FilterTileConfig_;
    using CompType = ldg_dtype;

    using CI = ConvTraitInner<
            ldg_dtype, ThreadConfig_, OutTileConfig_, FilterTileConfig_, stride_w,
            stride_h>;
    using SrcTileConfig = typename CI::SrcTileConfig;
    using SrcTileCount = typename CI::SrcTileCount;
    using FilterTileCount = typename CI::FilterTileCount;

    using SrcGlobal2ShareVisitor = Global2SharedMem<
            CompType, DepthwiseConv2dDirection::DIRECTION_FORWARD, ThreadConfig,
            SrcTileCount>;
    using FilterGlobal2ShareVisitor =
            Global2SharedMem<CompType, kDirection, ThreadConfig, FilterTileCount>;
 };

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 __device__ __forceinline__
 Global2SharedMem<T, kDirection, ThreadConfig_, TileCount_>::Global2SharedMem(
        T* smem_, int stride_, int s_h, int s_w, int b_h, int b_w, int stride_h_,
        int stride_w_)
        : smem(smem_),
          stride(stride_),
          start_h(s_h),
          start_w(s_w),
          bound_h(b_h),
          bound_w(b_w),
          ring_smem_h(TileCount::smem_load_h),
          stride_h(stride_h_),
          stride_w(stride_w_) {
    if (is_fwd) {
        ring_src_h = s_h + TileCount::smem_load_h;
        w_offset = 0;
    } else {
        ring_src_h = s_h - 1;
        w_offset = TileCount::smem_w - b_w;
        // stride_h and stride_w just used in backward src data.
        stride_h = stride_w = 1;
    }
 }

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 __device__ __forceinline__ void Global2SharedMem<
        T, kDirection, ThreadConfig_, TileCount_>::first_copy() {
    static int const load_w = TileCount::smem_w > 32 ? 32 : TileCount::smem_w;
    static int const load_h = ThreadConfig::nr_threads / load_w;
    static int const h_per_thread = DIVUP(TileCount::smem_load_h, load_h);
    static int const w_per_thread = DIVUP(TileCount::smem_w, load_w);
    static bool constexpr check_bounds_h = TileCount::smem_load_h % load_h != 0;
    static bool constexpr check_bounds_w = TileCount::smem_w % load_w != 0;
    const int y_base_idx = tid / load_w;
    const int x_base_idx = tid - y_base_idx * load_w;
 #pragma unroll
    for (int i = 0; i < h_per_thread; ++i) {
        int smem_h_idx = y_base_idx + i * load_h;
        int src_h_idx;
        if (is_fwd) {
            src_h_idx = start_h + smem_h_idx;
        } else {
            src_h_idx = start_h + TileCount::smem_load_h - smem_h_idx - 1;
        }
        if (check_bounds_h && smem_h_idx >= TileCount::smem_load_h)
            continue;
 #pragma unroll
        for (int j = 0; j < w_per_thread; ++j) {
            int smem_w_idx = x_base_idx + j * load_w;
            int src_w_idx;
            if (is_fwd) {
                src_w_idx = start_w + smem_w_idx;
            } else {
                src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1;
            }
            if (check_bounds_w && smem_w_idx >= TileCount::smem_w)
                continue;
            T val = 0.0f;
            if (src_h_idx >= 0 && src_h_idx < bound_h && src_w_idx >= 0 &&
                src_w_idx < bound_w &&
                ((is_fwd && src_h_idx % stride_h == 0 && src_w_idx % stride_w == 0) ||
                 (!is_fwd && TileCount::smem_load_h - smem_h_idx - 1 >= 0 &&
                  TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) {
                val = g_ptr[src_h_idx / stride_h * stride + src_w_idx / stride_w];
            }
            *(sh_ptr_as_copy_t(smem_h_idx, smem_w_idx)) = val;
        }
    }
 }

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 __device__ __forceinline__ void Global2SharedMem<
        T, kDirection, ThreadConfig_, TileCount_>::copy() {
 #pragma unroll
    for (int j = 0; j < TileCount::reg_w; ++j) {
        int smem_w_idx = gl_load_x + j * TileCount::load_w;
        int src_w_idx;
        if (is_fwd) {
            src_w_idx = start_w + smem_w_idx;
        } else {
            src_w_idx = start_w + TileCount::smem_w - w_offset - smem_w_idx - 1;
        }
        if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w)
            continue;
        T val = 0.0f;
        if (ring_src_h >= 0 && ring_src_h < bound_h && src_w_idx >= 0 &&
            src_w_idx < bound_w &&
            ((is_fwd && ring_src_h % stride_h == 0 && src_w_idx % stride_w == 0) ||
             (!is_fwd && TileCount::smem_w - w_offset - smem_w_idx - 1 >= 0))) {
            val = g_ptr[ring_src_h / stride_h * stride + src_w_idx / stride_w];
        }
        reg[j] = val;
    }
 }

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 __device__ __forceinline__ void Global2SharedMem<
        T, kDirection, ThreadConfig_, TileCount_>::commit() {
 #pragma unroll
    for (int j = 0; j < TileCount::reg_w; ++j) {
        int smem_w_idx = gl_load_x + j * TileCount::load_w;

        if (TileCount::check_bounds_w && smem_w_idx >= TileCount::smem_w)
            continue;

        *(sh_ptr_as_copy_t(ring_smem_h, smem_w_idx)) = reg[j];
    }
 }

 template <
        typename T, DepthwiseConv2dDirection kDirection, typename ThreadConfig_,
        typename TileCount_>
 __device__ __forceinline__ void Global2SharedMem<
        T, kDirection, ThreadConfig_, TileCount_>::iter_forward() {
    if (is_fwd) {
        ring_src_h++;
    } else {
        ring_src_h--;
    }
    ring_smem_h = (ring_smem_h + 1) % TileCount::smem_h;
 }

 // CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
 // tailored for small images up to 32x32. Stride and depth multiplier must be 1.
 // Padding must be 'SAME', which allows to reuse the index computation. Only
 // use this kernel if CanLaunchDepthwiseConv2dGPU(args) returns true.
 // Tiles of the input and filter tensors are loaded into shared memory before
 // performing the convolution. Each thread handles two elements per iteration,
 // one each in the lower and upper half of a tile.
 // Backprop input direction is the same as forward direction with the filter
 // rotated by 180°.
 #if CUDA_VERSION >= 9000
 template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
 __global__ void DepthwiseConv2dGPUKernelNCHW(
        const Param param, const __half* input, const __half* filter, __half* output) {
    using T = __half;
    using T2 = __half2;
    using ThreadConfig = typename ConvTrait::ThreadConfig;
    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
    using OutTileConfig = typename ConvTrait::OutTileConfig;
    using SrcTileCount = typename ConvTrait::SrcTileCount;
    using FilterTileCount = typename ConvTrait::FilterTileCount;
    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
        off_oh = threadIdx.y, off_ow = threadIdx.x;

    constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
    constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
    constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;

    extern __shared__ __align__(8) unsigned char smem[];
    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
    T* smem_src = reinterpret_cast<T*>(smem);
    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
    int stride_h = is_fwd ? param.stride_h : 1;
    int stride_w = is_fwd ? param.stride_w : 1;

    int off_ichannel = off_ochannel / param.chl_mul,
        off_fchannel = off_ichannel % param.src_chl,
        out_start_h = off_obh * OutTileConfig::block_h,
        out_start_w = off_obw * OutTileConfig::block_w,
        src_start_h = out_start_h * stride_h - param.pad_h,
        src_start_w = out_start_w * stride_w - param.pad_w,
        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

    SrcGlobal2ShareVisitor gl2sh_src = {
            smem_src,
            param.src_w,
            is_fwd ? src_start_h
                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
                                    param.src_h * param.stride_h / 2),
            is_fwd ? src_start_w
                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
                                    param.src_w * param.stride_w / 2),
            is_fwd ? param.src_h : param.src_h * param.stride_h,
            is_fwd ? param.src_w : param.src_w * param.stride_w,
            is_fwd ? 1 : param.stride_h,
            is_fwd ? 1 : param.stride_w};

    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
                                           param.flt_w,
                                           is_fwd ? 0 : param.flt_h - 2,
                                           0,
                                           param.flt_h,
                                           param.flt_w,
                                           1,
                                           1};

    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

    gl2sh_src.first_copy();
    gl2sh_flt.first_copy();

    __syncthreads();

    T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w],
            reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w];

    T2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}};

    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
        gl2sh_src.copy();
        gl2sh_flt.copy();
 #pragma unroll
        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
 #pragma unroll
            for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) {
                int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
                                         SrcTileCount::smem_w +
                                 s_w * 2;
                reg_src[s_h * t2_src_unroll_w + s_w] =
                        *reinterpret_cast<T2*>(smem_src_ptr + src_offset);
            }
        }

 #pragma unroll
        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
 #pragma unroll
            for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) {
                int flt_offset =
                        (fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w +
                        f_w * 2;
                reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
                        *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
                if (f_w > 0) {
                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
                            T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
                               reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                } else {
                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
                            T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                }
            }
            reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
            reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
                    T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
        }

 #pragma unroll
        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
 #pragma unroll
            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
 #pragma unroll
                for (int fw = 0; fw < t2_flt_unroll_w; ++fw) {
 #pragma unroll
                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
                        sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2(
                                reg_flt[ow * stride_w % 2]
                                       [inner_fh * t2_flt_unroll_w + fw],
                                reg_src[(inner_fh + oh) * t2_src_unroll_w + fw +
                                        ow * stride_w / 2],
                                sum[oh * t2_out_unroll_w + ow]);
                    }
                }
            }
        }

        __syncthreads();
        gl2sh_src.commit();
        gl2sh_flt.commit();
        gl2sh_src.iter_forward();
        gl2sh_flt.iter_forward();
        __syncthreads();
    }

    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
            sum[o] = megdnn::cuda::hadd2(sum[o], __shfl_xor(sum[o], i, 32));
        }
    }

    if (threadIdx.x == 0) {
 #pragma unroll
        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
            int out_h_idx = out_base_h_idx + i;
            if (out_h_idx < param.out_h) {
 #pragma unroll
                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
                    int out_w_idx = out_start_w + j;
                    if (out_w_idx >= param.out_w)
                        return;
                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half(
                            __half2float(sum[i * OutTileConfig::unroll_w + j].x) +
                            __half2float(sum[i * OutTileConfig::unroll_w + j].y));
                }
            }
        }
    }
 }

 template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
 __global__ void DepthwiseConv2dGPUKernelNCHWC32(
        const Param param, const __half* input, const __half* filter, __half* output) {
    using T = __half;
    using T2 = __half2;
    using ThreadConfig = typename ConvTrait::ThreadConfig;
    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
    using OutTileConfig = typename ConvTrait::OutTileConfig;
    using SrcTileCount = typename ConvTrait::SrcTileCount;
    using FilterTileCount = typename ConvTrait::FilterTileCount;
    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
        off_oh = threadIdx.y, off_ow = threadIdx.x;

    constexpr int t2_src_unroll_w = (SrcTileConfig::unroll_w + 3) / 2;
    constexpr int t2_flt_unroll_w = (FilterTileConfig::unroll_w + 2) / 2;
    constexpr int t2_out_unroll_w = (OutTileConfig::unroll_w + 1) / 2;

    extern __shared__ __align__(8) unsigned char smem[];
    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
    T* smem_src = reinterpret_cast<T*>(smem);
    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
    int stride_h = is_fwd ? param.stride_h : 1;
    int stride_w = is_fwd ? param.stride_w : 1;

    int off_ichannel = off_ochannel / param.chl_mul,
        off_fchannel = off_ichannel % param.src_chl,
        out_start_h = off_obh * OutTileConfig::block_h,
        out_start_w = off_obw * OutTileConfig::block_w,
        src_start_h = out_start_h * stride_h - param.pad_h,
        src_start_w = out_start_w * stride_w - param.pad_w,
        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

    SrcGlobal2ShareVisitor gl2sh_src = {
            smem_src,
            param.src_w,
            is_fwd ? src_start_h
                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
                                    param.src_h * param.stride_h / 2),
            is_fwd ? src_start_w
                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
                                    param.src_w * param.stride_w / 2),
            is_fwd ? param.src_h : param.src_h * param.stride_h,
            is_fwd ? param.src_w : param.src_w * param.stride_w,
            is_fwd ? 1 : param.stride_h,
            is_fwd ? 1 : param.stride_w};

    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
                                           param.flt_w,
                                           is_fwd ? 0 : param.flt_h - 2,
                                           0,
                                           param.flt_h,
                                           param.flt_w,
                                           1,
                                           1};

    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

    gl2sh_src.first_copy();
    gl2sh_flt.first_copy();

    __syncthreads();

    T2 reg_src[SrcTileConfig::unroll_h * t2_src_unroll_w],
            reg_flt[2][FilterTileConfig::unroll_h * t2_flt_unroll_w];

    float2 sum[OutTileConfig::unroll_size] = {{0.0, 0.0}};

    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
        gl2sh_src.copy();
        gl2sh_flt.copy();
 #pragma unroll
        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
 #pragma unroll
            for (int s_w = 0; s_w < t2_src_unroll_w; ++s_w) {
                int src_offset = (off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
                                         SrcTileCount::smem_w +
                                 s_w * 2;
                reg_src[s_h * t2_src_unroll_w + s_w] =
                        *reinterpret_cast<T2*>(smem_src_ptr + src_offset);
            }
        }

 #pragma unroll
        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
 #pragma unroll
            for (int f_w = 0; f_w < t2_flt_unroll_w - 1; ++f_w) {
                int flt_offset =
                        (fh + f_h) % FilterTileCount::smem_h * FilterTileCount::smem_w +
                        f_w * 2;
                reg_flt[0][f_h * t2_flt_unroll_w + f_w] =
                        *reinterpret_cast<T2*>(smem_flt_ptr + flt_offset);
                if (f_w > 0) {
                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
                            T2{reg_flt[0][f_h * t2_flt_unroll_w + f_w - 1].y,
                               reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                } else {
                    reg_flt[1][f_h * t2_flt_unroll_w + f_w] =
                            T2{0.0, reg_flt[0][f_h * t2_flt_unroll_w + f_w].x};
                }
            }
            reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] = T2{0.0, 0.0};
            reg_flt[1][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 1] =
                    T2{reg_flt[0][f_h * t2_flt_unroll_w + t2_flt_unroll_w - 2].y, 0.0};
        }

 #pragma unroll
        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
 #pragma unroll
            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
 #pragma unroll
                for (int fw = 0; fw < t2_flt_unroll_w; ++fw) {
 #pragma unroll
                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
                        sum[oh * t2_out_unroll_w + ow] = megdnn::cuda::fma2(
                                reg_flt[ow * stride_w % 2]
                                       [inner_fh * t2_flt_unroll_w + fw],
                                reg_src[(inner_fh + oh) * t2_src_unroll_w + fw +
                                        ow * stride_w / 2],
                                sum[oh * t2_out_unroll_w + ow]);
                    }
                }
            }
        }

        __syncthreads();
        gl2sh_src.commit();
        gl2sh_flt.commit();
        gl2sh_src.iter_forward();
        gl2sh_flt.iter_forward();
        __syncthreads();
    }

    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
            sum[o].x += __shfl_xor(sum[o].x, i, 32);
            sum[o].y += __shfl_xor(sum[o].y, i, 32);
        }
    }

    if (threadIdx.x == 0) {
 #pragma unroll
        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
            int out_h_idx = out_base_h_idx + i;
            if (out_h_idx < param.out_h) {
 #pragma unroll
                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
                    int out_w_idx = out_start_w + j;
                    if (out_w_idx >= param.out_w)
                        return;
                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] = __float2half(
                            sum[i * OutTileConfig::unroll_w + j].x +
                            sum[i * OutTileConfig::unroll_w + j].y);
                }
            }
        }
    }
 }
 #endif

 template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
 __global__ void DepthwiseConv2dGPUKernelNCHW(
        const Param param, const float* input, const float* filter, float* output) {
    using T = float;
    using T2 = float2;
    using ThreadConfig = typename ConvTrait::ThreadConfig;
    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
    using OutTileConfig = typename ConvTrait::OutTileConfig;
    using SrcTileCount = typename ConvTrait::SrcTileCount;
    using FilterTileCount = typename ConvTrait::FilterTileCount;
    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
        off_oh = threadIdx.y, off_ow = threadIdx.x;

    extern __shared__ __align__(8) unsigned char smem[];
    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
    T* smem_src = reinterpret_cast<T*>(smem);
    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
    int stride_h = is_fwd ? param.stride_h : 1;
    int stride_w = is_fwd ? param.stride_w : 1;

    int off_ichannel = off_ochannel / param.chl_mul,
        off_fchannel = off_ichannel % param.src_chl,
        out_start_h = off_obh * OutTileConfig::block_h,
        out_start_w = off_obw * OutTileConfig::block_w,
        src_start_h = out_start_h * stride_h - param.pad_h,
        src_start_w = out_start_w * stride_w - param.pad_w,
        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

    SrcGlobal2ShareVisitor gl2sh_src = {
            smem_src,
            param.src_w,
            is_fwd ? src_start_h
                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
                                    param.src_h * param.stride_h / 2),
            is_fwd ? src_start_w
                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
                                    param.src_w * param.stride_w / 2),
            is_fwd ? param.src_h : param.src_h * param.stride_h,
            is_fwd ? param.src_w : param.src_w * param.stride_w,
            is_fwd ? 1 : param.stride_h,
            is_fwd ? 1 : param.stride_w};

    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
                                           param.flt_w,
                                           is_fwd ? 0 : param.flt_h - 2,
                                           0,
                                           param.flt_h,
                                           param.flt_w,
                                           1,
                                           1};

    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

    gl2sh_src.first_copy();
    gl2sh_flt.first_copy();

    __syncthreads();

    T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w],
            reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w];

    T sum[OutTileConfig::unroll_size] = {0.0};

    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
        gl2sh_src.copy();
        gl2sh_flt.copy();
 #pragma unroll
        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
 #pragma unroll
            for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) {
                reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr
                        [(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
                                 SrcTileCount::smem_w +
                         s_w];
            }
        }

 #pragma unroll
        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
 #pragma unroll
            for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) {
                reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr
                        [(fh + f_h) % FilterTileCount::smem_h *
                                 FilterTileCount::smem_w +
                         f_w];
            }
        }

 #pragma unroll
        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
 #pragma unroll
            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
 #pragma unroll
                for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) {
 #pragma unroll
                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
                        sum[oh * OutTileConfig::unroll_w + ow] +=
                                reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] *
                                reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw +
                                        ow * stride_w];
                    }
                }
            }
        }

        __syncthreads();
        gl2sh_src.commit();
        gl2sh_flt.commit();
        gl2sh_src.iter_forward();
        gl2sh_flt.iter_forward();
        __syncthreads();
    }

    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
            sum[o] += __shfl_xor(sum[o], i, 32);
        }
    }

    if (threadIdx.x == 0) {
 #pragma unroll
        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
            int out_h_idx = out_base_h_idx + i;
            if (out_h_idx < param.out_h) {
 #pragma unroll
                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
                    int out_w_idx = out_start_w + j;
                    if (out_w_idx >= param.out_w)
                        return;
                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] =
                            sum[i * OutTileConfig::unroll_w + j];
                }
            }
        }
    }
 }

 template <typename ConvTrait, DepthwiseConv2dDirection kDirection>
 __global__ void DepthwiseConv2dGPUKernelNCHWC32(
        const Param param, const float* input, const float* filter, float* output) {
    using T = float;
    using T2 = float2;
    using ThreadConfig = typename ConvTrait::ThreadConfig;
    using SrcTileConfig = typename ConvTrait::SrcTileConfig;
    using FilterTileConfig = typename ConvTrait::FilterTileConfig;
    using OutTileConfig = typename ConvTrait::OutTileConfig;
    using SrcTileCount = typename ConvTrait::SrcTileCount;
    using FilterTileCount = typename ConvTrait::FilterTileCount;
    using SrcGlobal2ShareVisitor = typename ConvTrait::SrcGlobal2ShareVisitor;
    using FilterGlobal2ShareVisitor = typename ConvTrait::FilterGlobal2ShareVisitor;
    const bool is_fwd = (kDirection == DepthwiseConv2dDirection::DIRECTION_FORWARD);

    int off_ochannel = blockIdx.x, off_obw = blockIdx.y, off_obh = blockIdx.z,
        off_oh = threadIdx.y, off_ow = threadIdx.x;

    extern __shared__ __align__(8) unsigned char smem[];
    static_assert(sizeof(T) <= 8, "Insufficient alignment detected");
    T* smem_src = reinterpret_cast<T*>(smem);
    T* smem_flt = reinterpret_cast<T*>(&smem_src[SrcTileCount::smem_size]);
    int stride_h = is_fwd ? param.stride_h : 1;
    int stride_w = is_fwd ? param.stride_w : 1;

    int off_ichannel = off_ochannel / param.chl_mul,
        off_fchannel = off_ichannel % param.src_chl,
        out_start_h = off_obh * OutTileConfig::block_h,
        out_start_w = off_obw * OutTileConfig::block_w,
        src_start_h = out_start_h * stride_h - param.pad_h,
        src_start_w = out_start_w * stride_w - param.pad_w,
        out_base_h_idx = out_start_h + off_oh * OutTileConfig::unroll_h;

    T* smem_src_ptr = smem_src + off_ow * FilterTileConfig::unroll_w;
    T* smem_flt_ptr = smem_flt + off_ow * FilterTileConfig::unroll_w;

    T* out_base_ptr = output + off_ochannel * param.out_h * param.out_w;

    SrcGlobal2ShareVisitor gl2sh_src = {
            smem_src,
            param.src_w,
            is_fwd ? src_start_h
                   : src_start_h - (param.out_h / 2 + param.flt_h / 2 - param.pad_h -
                                    param.src_h * param.stride_h / 2),
            is_fwd ? src_start_w
                   : src_start_w - (param.out_w / 2 + param.flt_w / 2 - param.pad_w -
                                    param.src_w * param.stride_w / 2),
            is_fwd ? param.src_h : param.src_h * param.stride_h,
            is_fwd ? param.src_w : param.src_w * param.stride_w,
            is_fwd ? 1 : param.stride_h,
            is_fwd ? 1 : param.stride_w};

    FilterGlobal2ShareVisitor gl2sh_flt = {smem_flt,
                                           param.flt_w,
                                           is_fwd ? 0 : param.flt_h - 2,
                                           0,
                                           param.flt_h,
                                           param.flt_w,
                                           1,
                                           1};

    gl2sh_src.g_ptr = input + off_ichannel * param.src_h * param.src_w;
    gl2sh_flt.g_ptr = filter + off_fchannel * param.flt_h * param.flt_w;

    gl2sh_src.first_copy();
    gl2sh_flt.first_copy();

    __syncthreads();

    T reg_src[SrcTileConfig::unroll_h * SrcTileConfig::unroll_w],
            reg_flt[FilterTileConfig::unroll_h * FilterTileConfig::unroll_w];

    T sum[OutTileConfig::unroll_size] = {0.0};

    for (int fh = 0; fh < param.flt_h; fh += FilterTileConfig::unroll_h) {
        gl2sh_src.copy();
        gl2sh_flt.copy();
 #pragma unroll
        for (int s_h = 0; s_h < SrcTileConfig::unroll_h; ++s_h) {
 #pragma unroll
            for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) {
                reg_src[s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr
                        [(off_oh * stride_h + fh + s_h) % SrcTileCount::smem_h *
                                 SrcTileCount::smem_w +
                         s_w];
            }
        }

 #pragma unroll
        for (int f_h = 0; f_h < FilterTileConfig::unroll_h; ++f_h) {
 #pragma unroll
            for (int f_w = 0; f_w < FilterTileConfig::unroll_w; ++f_w) {
                reg_flt[f_h * FilterTileConfig::unroll_w + f_w] = smem_flt_ptr
                        [(fh + f_h) % FilterTileCount::smem_h *
                                 FilterTileCount::smem_w +
                         f_w];
            }
        }

 #pragma unroll
        for (int inner_fh = 0; inner_fh < FilterTileConfig::unroll_h; ++inner_fh) {
 #pragma unroll
            for (int oh = 0; oh < OutTileConfig::unroll_h; ++oh) {
 #pragma unroll
                for (int fw = 0; fw < FilterTileConfig::unroll_w; ++fw) {
 #pragma unroll
                    for (int ow = 0; ow < OutTileConfig::unroll_w; ++ow) {
                        sum[oh * OutTileConfig::unroll_w + ow] +=
                                reg_flt[inner_fh * FilterTileConfig::unroll_w + fw] *
                                reg_src[(inner_fh + oh) * SrcTileConfig::unroll_w + fw +
                                        ow * stride_w];
                    }
                }
            }
        }

        __syncthreads();
        gl2sh_src.commit();
        gl2sh_flt.commit();
        gl2sh_src.iter_forward();
        gl2sh_flt.iter_forward();
        __syncthreads();
    }

    for (int o = 0; o < OutTileConfig::unroll_size; ++o) {
        for (int i = 1; i < ThreadConfig::thread_x; i = i << 1) {
            sum[o] += __shfl_xor(sum[o], i, 32);
        }
    }

    if (threadIdx.x == 0) {
 #pragma unroll
        for (int i = 0; i < OutTileConfig::unroll_h; ++i) {
            int out_h_idx = out_base_h_idx + i;
            if (out_h_idx < param.out_h) {
 #pragma unroll
                for (int j = 0; j < OutTileConfig::unroll_w; ++j) {
                    int out_w_idx = out_start_w + j;
                    if (out_w_idx >= param.out_w)
                        return;
                    out_base_ptr[out_h_idx * param.out_w + out_w_idx] =
                            sum[i * OutTileConfig::unroll_w + j];
                }
            }
        }
    }
 }

 template <
        typename T, typename T2, DepthwiseConv2dDirection kDirection, int unroll_fw,
        int unroll_ow, int stride>
 void LaunchDepthwiseConv2dGPU(
        const Param& param, const T* input, const T* filter, T* output,
        cudaStream_t stream) {
    static int const unroll_oh = 1, unroll_fh = 1;

    using FilterTileConfig = FilterTileConfig<unroll_fh, unroll_fw>;
    using ThreadConfig = ThreadConfig<4, 32>;
    using OutTileConfig = OutTileConfig<ThreadConfig, unroll_oh, unroll_ow>;
    using IConvTrait = ConvTrait<
            T, kDirection, ThreadConfig, OutTileConfig, FilterTileConfig, stride,
            stride>;
    using SrcTileCount = typename IConvTrait::SrcTileCount;
    using FilterTileCount = typename IConvTrait::FilterTileCount;

    dim3 block(ThreadConfig::thread_x, ThreadConfig::thread_y);
    dim3 grid;
    grid.x = param.batch * param.src_chl * param.chl_mul;
    grid.y = DIVUP(param.out_w, OutTileConfig::block_w);
    grid.z = DIVUP(param.out_h, OutTileConfig::block_h);
    const int shared_storage =
            (SrcTileCount::smem_size + FilterTileCount::smem_size) * sizeof(T);

    void (*kernel)(const Param, const T*, const T*, T*);

    if (param.is_compute_deafult) {
        kernel = DepthwiseConv2dGPUKernelNCHW<IConvTrait, kDirection>;
    } else {
        kernel = DepthwiseConv2dGPUKernelNCHWC32<IConvTrait, kDirection>;
    }
    kernel<<<grid, block, shared_storage, stream>>>(param, input, filter, output);
    after_kernel_launch();
 }

 #define INSTANCE_AB(type1, type2, a, b, direction)                              \
    if (param.out_w > b * 4) {                                                  \
        if (direction == DepthwiseConv2dDirection::DIRECTION_BACKWARD ||        \
            (param.stride_h == 1 && param.stride_w == 1)) {                     \
            LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 1>( \
                    param, src, flt, dst, stream);                              \
        } else if (param.stride_h == 2 && param.stride_w == 2) {                \
            LaunchDepthwiseConv2dGPU<type1, type2, direction, a + 2, b + 1, 2>( \
                    param, src, flt, dst, stream);                              \
        }                                                                       \
    }

 #define INSTANCE_A(type1, type2, a, direction)                                                                                                                                                                                                                                                                                                                                                                                                                                   \
    if (param.flt_w > a * 4) {                                                                                                                                                                                                                                                                                                                                                                                                                                                   \
        INSTANCE_AB(type1, type2, a, 15, direction)                                                                                                                                                                                                                                                                                                                                                                                                                              \
        else INSTANCE_AB(type1, type2, a, 14, direction) else INSTANCE_AB(type1, type2, a, 13, direction) else INSTANCE_AB(type1, type2, a, 12, direction) else INSTANCE_AB(type1, type2, a, 11, direction) else INSTANCE_AB(type1, type2, a, 10, direction) else INSTANCE_AB(                                                                                                                                                                                                   \
                type1, type2,                                                                                                                                                                                                                                                                                                                                                                                                                                                    \
                a, 9, direction) else INSTANCE_AB(type1, type2, a, 8, direction) else INSTANCE_AB(type1, type2, a, 7, direction) else INSTANCE_AB(type1, type2, a, 6, direction) else INSTANCE_AB(type1, type2, a, 5, direction) else INSTANCE_AB(type1, type2, a, 4, direction) else INSTANCE_AB(type1, type2, a, 3, direction) else INSTANCE_AB(type1, type2, a, 2, direction) else INSTANCE_AB(type1, type2, a, 1, direction) else INSTANCE_AB(type1, type2, a, 0, direction) \
    }

 #define INSTANCE(type1, type2, direction)                        \
    INSTANCE_A(type1, type2, 6, direction)                       \
    else INSTANCE_A(type1, type2, 4, direction) else INSTANCE_A( \
            type1, type2, 2, direction) else INSTANCE_A(type1, type2, 0, direction)
 }  // anonymous namespace
--- a/dnn/src/cuda/conv_bias/chanwise/fwd_large_filter.cu
+++ b/dnn/src/cuda/conv_bias/chanwise/fwd_large_filter.cu
@@ -0,0 +1,56 @@
 /**
 * \file dnn/src/cuda/conv_bias/chanwise/fwd_depthwise_large_filter.cu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "cuda.h"
 #include "cuda_fp16.h"
 #include "src/cuda/conv_bias/chanwise/kern.cuh"
 #include "src/cuda/conv_bias/chanwise/kern_helper.cuh"
 #include "src/cuda/conv_bias/chanwise/launch_config.cuh"
 #include "src/cuda/fp16_help.cuh"

 using namespace megdnn;
 using namespace cuda;
 using namespace conv_bias;
 using namespace chanwise;

 #include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh"

 namespace megdnn {
 namespace cuda {
 namespace conv_bias {
 namespace chanwise {

 // =====================================fwd=====================================

 #define check

 template <>
 void run_fwd_depthwise_large_filter(
        float* dst, const float* src, const float* flt, const Param& param,
        cudaStream_t stream) {
    INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_FORWARD)
 }

 #if CUDA_VERSION >= 9000
 template <>
 void run_fwd_depthwise_large_filter(
        __half* dst, const __half* src, const __half* flt, const Param& param,
        cudaStream_t stream) {
    INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_FORWARD)
 }
 #endif

 }  // namespace chanwise
 }  // namespace conv_bias
 }  // namespace cuda
 }  // namespace megdnn

 // vim: syntax=cuda.doxygen
--- a/dnn/src/cuda/conv_bias/chanwise/kern.cuh
+++ b/dnn/src/cuda/conv_bias/chanwise/kern.cuh
@@ -27,8 +27,10 @@ namespace chanwise {
 struct Param {
    uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h,
            pad_w, stride_h, stride_w, dilation_h, dilation_w;
    bool is_compute_deafult;
 #if MEGDNN_CC_HOST
    static Param from_fwd_args(const BiasForwardSizeArgs& args) {
    static Param from_fwd_args(
            const BiasForwardSizeArgs& args, bool is_compute_deafult_ = true) {
 #define U(v) static_cast<uint32_t>(v)
        auto&& src = args.src_layout->shape;
        auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
            hw_pos = 1;
        }
        return {
                U(src[0]),          U(src[c_pos]),     U(src[hw_pos]),
                U(src[hw_pos + 1]), U(fm.ocpg),        U(fm.spatial[0]),
                U(fm.spatial[1]),   U(dst[hw_pos]),    U(dst[hw_pos + 1]),
                U(fm.padding[0]),   U(fm.padding[1]),  U(fm.stride[0]),
                U(fm.stride[1]),    U(fm.dilation[0]), U(fm.dilation[1]),
                U(src[0]),           U(src[c_pos]),     U(src[hw_pos]),
                U(src[hw_pos + 1]),  U(fm.ocpg),        U(fm.spatial[0]),
                U(fm.spatial[1]),    U(dst[hw_pos]),    U(dst[hw_pos + 1]),
                U(fm.padding[0]),    U(fm.padding[1]),  U(fm.stride[0]),
                U(fm.stride[1]),     U(fm.dilation[0]), U(fm.dilation[1]),
                is_compute_deafult_,
        };
 #undef U
    }
@@ -61,6 +64,10 @@ template <typename T>
 void run_fwd_small(
        T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

 template <typename T>
 void run_fwd_depthwise_large_filter(
        T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

 // implemented in fwd_8x8x32.cu
 void run_fwd_8x8x32(
        int32_t* dst, const int8_t* src, const int8_t* flt, const Param& param,
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp
@@ -74,13 +74,18 @@ cutlass::conv::ConvType convert_conv_type(Base::ConvType conv_type) {
            return cutlass::conv::ConvType::kLocal;
        case Base::ConvType::kLocalShare:
            return cutlass::conv::ConvType::kLocalShare;
        case Base::ConvType::kDepthwiseConvolution:
            return cutlass::conv::ConvType::kDepthwiseConvolution;
        default:
            megdnn_assert(0, "invalid conv type");
    }
 }

 NumericTypeID convert_dtype(DTypeEnum dtype) {
    switch (dtype) {
 NumericTypeID convert_dtype(DType dtype) {
    // just make convolution with no bias happy
    if (!dtype.valid())
        return NumericTypeID::kF32;
    switch (dtype.enumv()) {
        case DTypeEnum::Float32:
            return NumericTypeID::kF32;
        case DTypeEnum::Float16:
@@ -100,6 +105,21 @@ NumericTypeID convert_dtype(DTypeEnum dtype) {
    }
 }

 NumericTypeID get_accumulator_dtype(
        DType dtype, const param::ConvBias::ComputeMode comp_mode) {
    if (dtype.category() == DTypeCategory::QUANTIZED) {
        return NumericTypeID::kS32;
    } else {
        megdnn_assert(dtype.category() == DTypeCategory::FLOAT);
        if (comp_mode == param::ConvBias::ComputeMode::DEFAULT) {
            return convert_dtype(dtype);
        } else {
            megdnn_assert(comp_mode == param::ConvBias::ComputeMode::FLOAT32);
            return NumericTypeID::kF32;
        }
    }
 }

 struct LayoutPack {
    LayoutTypeID src;
    LayoutTypeID filter;
@@ -149,6 +169,9 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format, int access_type
                default:
                    megdnn_assert(0, "invalid access_type");
            }
        case Format::NCHW:
            return {LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW,
                    LayoutTypeID::kTensorNCHW, LayoutTypeID::kTensorNCHW};
        default:
            megdnn_assert(0, "invalid format");
    }
@@ -177,6 +200,94 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode, bool cla
    megdnn_assert(0, "invalid nonlinear mode");
 }

 std::pair<int, int> get_tensor_alignment(
        const param::ConvBias::Format format, const TensorLayout& src,
        const TensorLayout& filter, const Base::AlgoParam& algo_param,
        bool is_chanwise) {
    int alignment_src = 0;
    int alignment_filter = 0;

    using Format = param::ConvBias::Format;

    // get tensor alignment for tensor op operations
    // for tensor op operations, the alignment is determined by the size of a vector
    auto get_tensor_alignment_tensor_op = [&]() {
        switch (format) {
            /// case int8
            case Format::NCHW32:
            case Format::NCHW32_NCHW4:
                alignment_src = 16;
                alignment_filter = 16;
                break;
            /// case int4 or uint4
            case Format::NCHW64:
                alignment_src = 32;
                alignment_filter = 32;
                break;
            case Format::NHWC:
                alignment_src = alignment_filter = algo_param.access_size;
                break;
            default:
                megdnn_throw("invalid format");
        };
    };

    // get tensor alignment for dot product operations
    // for integer dot product operations, alignment src is always 4
    // and the alignment filter is determined by the threadblock shape
    auto get_tensor_alignment_dp4a = [&]() {
        megdnn_assert(
                format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
                format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32);
        alignment_src = 4;
        // determine alignment filter
        constexpr int warp_size = 32;
        int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
                      algo_param.threadblock_k /
                      (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
        int threadblock_loads =
                filter.dtype.size(algo_param.threadblock_m * algo_param.threadblock_k);
        int load_per_thread = threadblock_loads / threads;
        if (load_per_thread >= 16)
            alignment_filter = 16;
        else if (load_per_thread >= 8)
            alignment_filter = 8;
        else {
            megdnn_assert(load_per_thread >= 4);
            alignment_filter = 4;
        }
    };

    // get tensor alignment for depthwise convolution
    auto get_tensor_alignment_dwconv2d_nchw = [&]() {
        alignment_filter = 1;
        size_t wi = src.dtype.size(src[3]);  // width extent in bytes
        for (size_t candidate : {16, 4, 2}) {
            if (wi % candidate == 0) {
                alignment_src = candidate;
                break;
            }
        }
        alignment_src /= src.dtype.size(1);
    };

    /// TODO: need a better way to check whether tensor core instruction is used
    if (format == Format::NCHW32 || format == Format::NCHW32_NCHW4 ||
        format == Format::NCHW64 || format == Format::NCHW64 ||
        format == Format::NHWC) {
        get_tensor_alignment_tensor_op();
    } else if (
            format == Format::NCHW4 || format == Format::NCHW4_NCHW ||
            format == Format::NCHW4_NHWC || format == Format::NCHW4_NCHW32) {
        get_tensor_alignment_dp4a();
    } else {
        /// the following is used for depthwise convolution
        megdnn_assert(format == Format::NCHW && is_chanwise);
        get_tensor_alignment_dwconv2d_nchw();
    }
    megdnn_assert(alignment_src >= 1 && alignment_filter >= 1);
    return {alignment_src, alignment_filter};
 }
 }  // namespace

 const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
@@ -185,23 +296,36 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
    auto&& param = args.opr->param();
    auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
    auto epilogue_type = get_epilogue_type(
            param.nonlineMode, args.dst_layout->dtype.enumv() != DTypeEnum::Float32);
            param.nonlineMode,
            args.dst_layout->dtype.category() != DTypeCategory::FLOAT);

    cutlass::conv::SpecialOptimizeDesc special_optimization =
            (use_conv_filter_unity_opt)
                    ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
                    : cutlass::conv::SpecialOptimizeDesc::NONE;

    int alignment_src, alignment_filter;
    auto&& fm = args.filter_meta;
    bool is_chanwise = param.sparse == param::ConvBias::Sparse::GROUP && fm.icpg == 1 &&
                       fm.ocpg == 1;
    std::tie(alignment_src, alignment_filter) = get_tensor_alignment(
            param.format, *args.src_layout, *args.filter_layout, m_algo_param,
            is_chanwise);

    auto accumulator_dtype =
            get_accumulator_dtype(args.src_layout->dtype, param.compute_mode);

    ConvolutionKey key{
            convert_conv_op(conv_op),
            convert_dtype(args.src_layout->dtype.enumv()),
            convert_dtype(args.src_layout->dtype),
            layouts.src,
            convert_dtype(args.filter_layout->dtype.enumv()),
            convert_dtype(args.filter_layout->dtype),
            layouts.filter,
            convert_dtype(args.dst_layout->dtype.enumv()),
            convert_dtype(args.dst_layout->dtype),
            layouts.dst,
            convert_dtype(args.bias_layout->dtype.enumv()),
            convert_dtype(args.bias_layout->dtype),
            layouts.bias,
            accumulator_dtype,
            convert_conv_type(conv_type),
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
@@ -215,6 +339,8 @@ const Operation* ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_co
            epilogue_type,
            m_algo_param.stage,
            special_optimization,
            alignment_src,
            alignment_filter,
            without_shared_load};

    return Singleton::get().operation_table.find_op(key);
@@ -227,13 +353,16 @@ void ConvBiasForwardImpl::AlgoCutlassConvolutionBase::execute_cutlass_conv_op(
        size_t pw, size_t sh, size_t sw, size_t dh, size_t dw, const void* alpha,
        const void* beta, const void* gamma, const void* delta, const void* theta,
        const void* threshold, const void* dst_scale, cudaStream_t stream,
        const void* extra_param) const {
        const void* extra_param, size_t groups) const {
    // gcc prints warnings when size_t values are implicitly narrowed to int
    cutlass::conv::Conv2dProblemSize problem_size{
            int(n),  int(hi), int(wi), int(ci),
            int(co), int(fh), int(fw), int(ho),
            int(wo), int(ph), int(pw), int(sh),
            int(sw), int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation};
            int(n),      int(hi), int(wi), int(ci),
            int(co),     int(fh), int(fw), int(ho),
            int(wo),     int(ph), int(pw), int(sh),
            int(sw),     int(dh), int(dw), cutlass::conv::Mode::kCrossCorrelation,
            1,            // split k slices, always 1
            int(groups),  // groups
    };

    ConvolutionArguments conv_args{
            problem_size, src,   filter, bias,  z,         dst,       alpha,
--- a/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
+++ b/dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
@@ -0,0 +1,111 @@
 /**
 * \file dnn/src/cuda/conv_bias/depthwise_large_filter.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh"
 #include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/algo.h"
 #include "src/cuda/conv_bias/chanwise/kern.cuh"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace conv_bias;

 namespace {
 inline bool is_available_depthwise_large_filter(const chanwise::Param& param) {
    if ((param.stride_h == 1 && param.stride_w == 1) ||
        (param.stride_h == 2 && param.stride_w == 2)) {
        auto&& device_prop = cuda::current_device_prop();
        static int const unroll_oh = 1, unroll_fh = 1;
        CHECK(FWD)
    }
    return false;
 }
 }  // anonymous namespace

 bool ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::is_available(
        const SizeArgs& args) const {
    if (!args.src_layout->is_contiguous() || !args.dst_layout->is_contiguous()) {
        return false;
    }
    if (args.src_layout->dtype != args.filter_layout->dtype &&
        (args.src_layout->dtype != dtype::Float32()
 #if CUDA_VERSION >= 9000
         || args.src_layout->dtype != dtype::Float16()
 #endif
                 )) {
        return false;
    }
    if (args.z_layout->ndim > 0)
        return false;

    auto param = chanwise::Param::from_fwd_args(
            args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
    auto&& fm = args.filter_meta;
    return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW &&
           args.src_layout->dtype.category() == DTypeCategory::FLOAT &&
           args.opr->param().compute_mode == Param::ComputeMode::DEFAULT &&
           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
           fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
           is_available_depthwise_large_filter(param);
 }

 size_t ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes(
        const SizeArgs& args) const {
    auto dst_layout = *args.dst_layout;
    if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
        dst_layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(
                args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
        return dst_layout.span().dist_byte();
    }
    return 0;
 }

 void ConvBiasForwardImpl::AlgoDepthwiseLargeFilter::exec(const ExecArgs& args) const {
    WorkspaceBundle bundle{args.workspace.raw_ptr, {get_workspace_in_bytes(args)}};
    TensorND conv_dst_tensor = *args.dst_tensor;
    if (args.dst_layout->dtype.enumv() != args.bias_layout->dtype.enumv()) {
        conv_dst_tensor = TensorND{bundle.get(0), conv_dst_tensor.layout};
        conv_dst_tensor.layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(
                args.src_layout->dtype, args.filter_layout->dtype,
                conv_dst_tensor.layout.dtype);
    }
    {
        auto kparam = chanwise::Param::from_fwd_args(
                args, args.opr->param().compute_mode == Param::ComputeMode::DEFAULT);
        auto stream = cuda_stream(args.handle);
        switch (args.src_layout->dtype.enumv()) {
            case DTypeEnum::Float32:
                chanwise::run_fwd_depthwise_large_filter(
                        conv_dst_tensor.ptr<float>(), args.src_tensor->ptr<float>(),
                        args.filter_tensor->ptr<float>(), kparam, stream);
                break;
 #if CUDA_VERSION >= 9000
            case DTypeEnum::Float16:
                chanwise::run_fwd_depthwise_large_filter(
                        static_cast<half*>(conv_dst_tensor.raw_ptr()),
                        static_cast<half*>(args.src_tensor->raw_ptr()),
                        static_cast<half*>(args.filter_tensor->raw_ptr()), kparam,
                        stream);
                break;
 #endif
            default:
                megdnn_assert_internal(0);
        }
    }
    handle_bias_and_nonlinear(
            args.handle, args.nonlinear_mode, &conv_dst_tensor, args.dst_tensor,
            args.bias_tensor);
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
@@ -0,0 +1,99 @@
 /**
 * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/algo.h"
 #include "src/cuda/convolution_helper/parameter.cuh"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;

 bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
        const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(7, 0));
    RETURN_IF_FALSE(
            args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
    using Param = param::ConvBias;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using NonlineMode = Param::NonlineMode;
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
    RETURN_IF_FALSE(
            args.bias_layout->ndim <= 0 ||
            (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
             check_bias_share_in_channel(*args.bias_layout, param.format)));
    RETURN_IF_FALSE(
            args.z_layout->ndim <= 0 ||
            args.z_layout->dtype.enumv() == DTypeEnum::Float16);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 void ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
           wi = args.src_layout->operator[](3);
    size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
    size_t co = fm.group;
    size_t ci = co;
    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
    void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
    float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
    void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

    // dummy parameters, used for quantization cases
    float theta = 0.f;
    float delta = 0.f;
    float threshold = 0.f;

    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

    UNPACK_CONV_PARAMETER(fm, param);
    MARK_USED_VAR
    execute_cutlass_conv_op(
            op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
            z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
            wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
            &threshold, nullptr, stream, nullptr, fm.group);

    after_kernel_launch();
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -0,0 +1,99 @@
 /**
 * \file dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/algo.h"
 #include "src/cuda/convolution_helper/parameter.cuh"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;

 bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
        const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
    RETURN_IF_FALSE(
            args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
    using Param = param::ConvBias;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using NonlineMode = Param::NonlineMode;
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
    RETURN_IF_FALSE(
            args.bias_layout->ndim <= 0 ||
            (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
             check_bias_share_in_channel(*args.bias_layout, param.format)));
    RETURN_IF_FALSE(
            args.z_layout->ndim <= 0 ||
            args.z_layout->dtype.enumv() == DTypeEnum::Float32);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 void ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    size_t n = args.src_layout->operator[](0), hi = args.src_layout->operator[](2),
           wi = args.src_layout->operator[](3);
    size_t ho = args.dst_layout->operator[](2), wo = args.dst_layout->operator[](3);
    size_t co = fm.group;
    size_t ci = co;
    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = args.bias_layout->ndim > 0 ? 1.f : 0.f;
    void* bias_ptr = args.bias_layout->ndim > 0 ? args.bias_tensor->raw_ptr() : nullptr;
    float gamma = args.z_layout->ndim > 0 ? 1.f : 0.f;
    void* z_ptr = args.z_layout->ndim > 0 ? args.z_tensor->raw_ptr() : nullptr;

    // dummy parameters, used for quantization cases
    float theta = 0.f;
    float delta = 0.f;
    float threshold = 0.f;

    const auto* op = get_cutlass_conv_op(
            args, ConvOperator::kFprop, ConvType::kDepthwiseConvolution, false, false);

    UNPACK_CONV_PARAMETER(fm, param);
    MARK_USED_VAR
    execute_cutlass_conv_op(
            op, args.src_tensor->raw_ptr(), args.filter_tensor->raw_ptr(), bias_ptr,
            z_ptr, args.dst_tensor->raw_ptr(), nullptr, n, hi, wi, ci, co, fh, fw, ho,
            wo, ph, pw, sh, sw, dh, dw, &alpha, &beta, &gamma, &delta, &theta,
            &threshold, nullptr, stream, nullptr, fm.group);

    after_kernel_launch();
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/conv_bias/opr_impl.cpp
+++ b/dnn/src/cuda/conv_bias/opr_impl.cpp
@@ -145,9 +145,22 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
    const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
                                     args.filter_meta.stride[0] != 1 ||
                                     args.filter_meta.stride[1] != 1 || hw_size < 512;
    //! choose for large kernel cases
    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
    size_t hi = src[2], wi = src[3];
    const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
    //! avoid bad case in cudnn, check dnn chanwise impl first
    if (is_chanwise) {
        if (prefer_dnn_chanwise) {
        if (prefer_dnn_lk_implbmm) {
 #if CUDA_VERSION >= 10020
            if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.f16_implicit_bmm[0];
 #endif
            if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.f32_implicit_bmm[0];
        } else if (prefer_dnn_chanwise) {
            if (sm_algo_pack.chanwise.is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise;
--- a/dnn/src/cuda/conv_bias/opr_impl.h
+++ b/dnn/src/cuda/conv_bias/opr_impl.h
@@ -45,6 +45,7 @@ public:
    class AlgoCUDNNConvBiasActivation;
    class AlgoChanwise;
    class AlgoChanwiseSmall;
    class AlgoDepthwiseLargeFilter;
    class AlgoChanwise8x8x32;
    class AlgoCUDNNConv;
    class AlgoFallbackNCHWQS8;
@@ -71,6 +72,9 @@ public:
    class AlgoInt4Int4NHWCIMMAImplicitGemm;
    class AlgoUInt4Int4NHWCIMMAImplicitGemm;
    class AlgoBFloat16;
    // The following algorithms are suitable for channel wise convolution
    class AlgoFloat32NCHWFMAImplicitBatchedGemm;
    class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

    class AlgoPack;

--- a/dnn/src/cuda/convolution/backward_data/algo.cpp
+++ b/dnn/src/cuda/convolution/backward_data/algo.cpp
@@ -19,10 +19,12 @@ using namespace cuda;
 ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
    non_cudnn_algos.push_back(&chanwise);
    non_cudnn_algos.push_back(&chanwise_small);
    non_cudnn_algos.push_back(&depthwise_large_filter);
    non_cudnn_algos.push_back(&matmul);

    all_algos.push_back(&chanwise);        // prefer chanwise
    all_algos.push_back(&chanwise_small);  // prefer small chanwise
    all_algos.push_back(&depthwise_large_filter);

    fill_cudnn_algos();
    for (auto&& i : cudnn) {
@@ -41,6 +43,7 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
        all_algos.push_back(&algo);
        int8_algos.push_back(&algo);
    }
    fill_dwconv_algos();

    int8_algos.push_back(&int8_nchw_dotprod);
    all_algos.push_back(&int8_nchw_dotprod);
@@ -54,6 +57,39 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
    }
 }

 void ConvolutionBackwardDataImpl::AlgoPack::fill_dwconv_algos() {
    {
        using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam;
        /// preferred algo
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2});
        for (auto&& algo : implbmm_nchw_fma) {
            all_algos.push_back(&algo);
        }
    }
 #if CUDA_VERSION >= 10010
    {
        using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
        /// preferred algo
        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
        for (auto&& algo : implbmm_nchw_hmma) {
            all_algos.push_back(&algo);
        }
    }
 #endif
 }

 MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl)

 ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack::
--- a/dnn/src/cuda/convolution/backward_data/algo.h
+++ b/dnn/src/cuda/convolution/backward_data/algo.h
@@ -37,11 +37,14 @@ public:
        CUDA_MATMUL,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_DEPTHWISE_LARGE_FILTER,
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8
        CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8,
        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -190,6 +193,20 @@ public:
    }
 };

 class ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter final : public AlgoBase {
 public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "DEPTHWISE_LARGE_FILTER"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER)
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

 private:
    mutable std::string m_name;
 };

 class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
 public:
    bool is_available(const SizeArgs& args) const override;
@@ -315,6 +332,82 @@ private:
    std::string m_name;
 };

 class ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
        : public AlgoBase {
 public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

 private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
 };

 class ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
        : public AlgoBase {
 public:
    /// add instruction shape as member of algo param, because f16 tensor core has 2
    /// different matrix shapes (i.e. mma.884 and mma.1688)
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int instruction_m;
        int instruction_n;
        int instruction_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
                    threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
                    instruction_n, instruction_k, stage);
        }
    };
    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

 private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
 };

 class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
    // defined in cudnn.cpp
    void fill_cudnn_algos();
@@ -322,6 +415,7 @@ class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
    void fill_int8_dp4a_algos();
    // defined in implicit_gemm_int8_nhwc_imma.cpp
    void fill_int8_imma_algos();
    void fill_dwconv_algos();

    AlgoBase::Mapper m_all_algos_map;

@@ -332,11 +426,14 @@ public:
    AlgoMatmul matmul;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoDepthwiseLargeFilter depthwise_large_filter;
    AlgoBFloat16 bfloat16;
    AlgoGroupConvGeneral group;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;

    std::vector<AlgoBase*>
            //! all algorithms
--- a/dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp
+++ b/dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp
@@ -0,0 +1,89 @@
 /**
 * \file dnn/src/cuda/convolution/backward_data/depthwise_large_filter.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "src/cuda/conv_bias/chanwise/depthwise_large_filter.cuh"
 #include "src/cuda/convolution/backward_data/algo.h"
 #include "src/cuda/convolution/chanwise/kern.cuh"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace convolution;

 namespace {
 inline bool is_available_depthwise_large_filter(const chanwise::Param& param) {
    if ((param.stride_h == 1 && param.stride_w == 1) ||
        (param.stride_h == 2 && param.stride_w == 2)) {
        auto&& device_prop = cuda::current_device_prop();
        static int const unroll_oh = 1, unroll_fh = 1;
        CHECK(BWD)
    }
    return false;
 }
 }  // anonymous namespace

 bool ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::is_available(
        const SizeArgs& args) const {
    if (!args.grad_layout->is_contiguous() || !args.diff_layout->is_contiguous()) {
        return false;
    }
    if (args.diff_layout->dtype != args.filter_layout->dtype &&
        (args.diff_layout->dtype != dtype::Float32()
 #if CUDA_VERSION >= 9000
         || args.diff_layout->dtype != dtype::Float16()
 #endif
                 )) {
        return false;
    }

    auto param = chanwise::Param::from_fwd_args(
            {args.handle, args.diff_layout, args.filter_layout, args.filter_meta,
             args.grad_layout});
    auto&& fm = args.filter_meta;
    return fm.group > 1 && args.filter_meta.format == Param::Format::NCHW &&
           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
           args.opr->param().compute_mode == Param::ComputeMode::DEFAULT &&
           fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 &&
           fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip &&
           is_available_depthwise_large_filter(param);
 }

 size_t ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::get_workspace_in_bytes(
        const SizeArgs& args) const {
    return 0;
 }

 void ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter::exec(
        const ExecArgs& args) const {
    auto kparam = chanwise::Param::from_fwd_args(
            {args.handle, args.diff_layout, args.filter_layout, args.filter_meta,
             args.grad_layout});
    auto stream = cuda_stream(args.handle);
    switch (args.diff_layout->dtype.enumv()) {
        case DTypeEnum::Float32:
            chanwise::run_bwd_depthwise_large_filter(
                    args.grad_tensor->ptr<float>(), args.diff_tensor->ptr<float>(),
                    args.filter_tensor->ptr<float>(), kparam, stream);
            break;
 #if CUDA_VERSION >= 9000
        case DTypeEnum::Float16:
            chanwise::run_bwd_depthwise_large_filter(
                    static_cast<half*>(args.grad_tensor->raw_ptr()),
                    static_cast<half*>(args.diff_tensor->raw_ptr()),
                    static_cast<half*>(args.filter_tensor->raw_ptr()), kparam, stream);
            break;
 #endif
        default:
            megdnn_assert_internal(0);
    }
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp
@@ -0,0 +1,149 @@
 /**
 * \file
 * dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float16_nchw_hmma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/cuda/convolution/backward_data/algo.h"
 #include "src/cuda/cutlass/singleton.h"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace cutlass::library;

 const void* ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
        get_available_op(const SizeArgs& args) const {
    int alignment_diff = 0;
    int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
    for (int candidate : {16, 4, 2}) {
        if (wo % candidate == 0) {
            alignment_diff = candidate;
            break;
        }
    }
    alignment_diff /= args.diff_layout->dtype.size(1);
    NumericTypeID accumulator_dtype =
            args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT
                    ? NumericTypeID::kF16
                    : NumericTypeID::kF32;
    ConvolutionKey key{
            cutlass::conv::Operator::kDgrad,
            NumericTypeID::kF16,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF16,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF16,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF16,
            LayoutTypeID::kTensorNCHW,
            accumulator_dtype,
            cutlass::conv::ConvType::kDepthwiseConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            m_algo_param.instruction_m,
            m_algo_param.instruction_n,
            m_algo_param.instruction_k,
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombination,
            m_algo_param.stage,
            cutlass::conv::SpecialOptimizeDesc::NONE,
            alignment_diff,
            1,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
 }

 bool ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
        const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(7, 0));
    RETURN_IF_FALSE(
            args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
    using Param = param::Convolution;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.diff_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.grad_layout->dtype.enumv() == DTypeEnum::Float16);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_available_op(args);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 void ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
        wo = args.diff_layout->operator[](3);
    int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3);
    int co = fm.group, ci = co, groups = co;
    int fh = fm.spatial[0], fw = fm.spatial[1];
    int sh = fm.stride[0], sw = fm.stride[1];
    int ph = fm.padding[0], pw = fm.padding[1];
    int dh = param.dilate_h, dw = param.dilate_w;

    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = 0.f;
    float gamma = 0.f;
    float delta = 0.f;

    const Operation* op = (const Operation*)get_available_op(args);

    cutlass::conv::Conv2dProblemSize problem_size{
            n,      hi, wi, ci, co, fh, fw, ho,
            wo,     ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
            1,       // split k slices, always 1
            groups,  // groups
    };

    cutlass::library::ConvolutionArguments conv_args{
            problem_size,
            args.diff_tensor->raw_ptr(),
            args.filter_tensor->raw_ptr(),
            nullptr,
            nullptr,
            args.grad_tensor->raw_ptr(),
            &alpha,
            &beta,
            &gamma,
            &delta,
            nullptr,
            nullptr,
            nullptr,
            nullptr};

    cutlass_check(op->run(&conv_args, nullptr, stream));

    after_kernel_launch();
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -0,0 +1,145 @@
 /**
 * \file
 * dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/cuda/convolution/backward_data/algo.h"
 #include "src/cuda/cutlass/singleton.h"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace cutlass::library;

 const void* ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
        get_available_op(const SizeArgs& args) const {
    int alignment_diff = 0;
    int wo = args.diff_layout->dtype.size(args.diff_layout->operator[](3));
    for (int candidate : {16, 4}) {
        if (wo % candidate == 0) {
            alignment_diff = candidate;
            break;
        }
    }
    alignment_diff /= args.diff_layout->dtype.size(1);
    ConvolutionKey key{
            cutlass::conv::Operator::kDgrad,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            LayoutTypeID::kTensorNCHW,
            NumericTypeID::kF32,
            cutlass::conv::ConvType::kDepthwiseConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            1,
            1,
            1,
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombination,
            m_algo_param.stage,
            cutlass::conv::SpecialOptimizeDesc::NONE,
            alignment_diff,
            1,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
 }

 bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
        const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
    RETURN_IF_FALSE(
            args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
    using Param = param::Convolution;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.diff_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.grad_layout->dtype.enumv() == DTypeEnum::Float32);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_available_op(args);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 void ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
        wo = args.diff_layout->operator[](3);
    int hi = args.grad_layout->operator[](2), wi = args.grad_layout->operator[](3);
    int co = fm.group, ci = co, groups = co;
    int fh = fm.spatial[0], fw = fm.spatial[1];
    int sh = fm.stride[0], sw = fm.stride[1];
    int ph = fm.padding[0], pw = fm.padding[1];
    int dh = param.dilate_h, dw = param.dilate_w;

    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = 0.f;
    float gamma = 0.f;
    float delta = 0.f;

    const Operation* op = (const Operation*)get_available_op(args);

    cutlass::conv::Conv2dProblemSize problem_size{
            n,      hi, wi, ci, co, fh, fw, ho,
            wo,     ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
            1,       // split k slices, always 1
            groups,  // groups
    };

    cutlass::library::ConvolutionArguments conv_args{
            problem_size,
            args.diff_tensor->raw_ptr(),
            args.filter_tensor->raw_ptr(),
            nullptr,
            nullptr,
            args.grad_tensor->raw_ptr(),
            &alpha,
            &beta,
            &gamma,
            &delta,
            nullptr,
            nullptr,
            nullptr,
            nullptr};

    cutlass_check(op->run(&conv_args, nullptr, stream));

    after_kernel_launch();
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
            (sh == 2 && sw == 2)
                    ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                    : cutlass::conv::SpecialOptimizeDesc::NONE;
    int alignment_filter = 4;
    constexpr int warp_size = 32;
    int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
                  m_algo_param.threadblock_k /
                  (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
    int threadblock_loads = args.filter_layout->dtype.size(
            m_algo_param.threadblock_m * m_algo_param.threadblock_k);
    int load_per_thread = threadblock_loads / threads;
    if (load_per_thread >= 16)
        alignment_filter = 16;
    else if (load_per_thread >= 8)
        alignment_filter = 8;
    megdnn_assert(load_per_thread >= 4);
    ConvolutionKey key{
            cutlass::conv::Operator::kDgrad,
            NumericTypeID::kS8,
@@ -39,6 +52,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            cutlass::conv::ConvType::kConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
@@ -52,6 +66,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
            m_algo_param.stage,
            special_optimization,
            4,
            alignment_filter,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
 }
--- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
@@ -39,6 +39,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            cutlass::conv::ConvType::kConvolution,
            16,
            64,
@@ -52,6 +53,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
            2,
            special_optimization,
            4,
            4,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
 }
--- a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp
+++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nhwc_imma.cpp
@@ -50,6 +50,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
            LayoutTypeID::kTensorNHWC,
            NumericTypeID::kS32,
            LayoutTypeID::kTensorNHWC,
            NumericTypeID::kS32,
            cutlass::conv::ConvType::kConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
@@ -63,6 +64,8 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm::get_avail
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
            m_algo_param.stage,
            special_optimization,
            m_algo_param.access_size,
            m_algo_param.access_size,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
 }
--- a/dnn/src/cuda/convolution/backward_filter/algo.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/algo.cpp
@@ -25,6 +25,7 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() {
    for (auto&& i : cudnn) {
        all_algos.push_back(&i);
    }
    fill_dwconv_algos();
    all_algos.push_back(&matmul);
    all_algos.push_back(&group);

@@ -48,6 +49,39 @@ ConvolutionBackwardFilterImpl::AlgoCUDNN* ConvolutionBackwardFilterImpl::AlgoPac
            "can not find cudnn bwd_filter algorithm %d", static_cast<int>(algo)));
 }

 void ConvolutionBackwardFilterImpl::AlgoPack::fill_dwconv_algos() {
    {
        using AlgoParam = AlgoFloat32NCHWFMAImplicitBatchedGemm::AlgoParam;
        /// preferred algo
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8, 2});
        implbmm_nchw_fma.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8, 2});
        for (auto&& algo : implbmm_nchw_fma) {
            all_algos.push_back(&algo);
        }
    }
 #if CUDA_VERSION >= 10010
    {
        using AlgoParam = AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam;
        /// preferred algo
        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 128, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 128, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 256, 32, 64, 64, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{128, 64, 32, 32, 32, 32, 8, 8, 4, 2});
        implbmm_nchw_hmma.emplace_back(AlgoParam{64, 64, 32, 32, 32, 32, 8, 8, 4, 2});
        for (auto&& algo : implbmm_nchw_hmma) {
            all_algos.push_back(&algo);
        }
    }
 #endif
 }

 ConvolutionBackwardFilterImpl::AlgoPack ConvolutionBackwardFilterImpl::sm_algo_pack;

 ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
--- a/dnn/src/cuda/convolution/backward_filter/algo.h
+++ b/dnn/src/cuda/convolution/backward_filter/algo.h
@@ -37,6 +37,8 @@ public:
        CUDA_CHANWISE,
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -210,9 +212,86 @@ private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };

 class ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
        : public AlgoBase {
 public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

 private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
 };

 class ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
        : public AlgoBase {
 public:
    /// add instruction shape as member of algo param, because f16 tensor core has 2
    /// different matrix shapes (i.e. mma.884 and mma.1688)
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int instruction_m;
        int instruction_n;
        int instruction_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
                    threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
                    instruction_n, instruction_k, stage);
        }
    };
    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

 private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
 };

 class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj {
    // defined in cudnn.cpp
    void fill_cudnn_algos();
    void fill_dwconv_algos();

    AlgoBase::Mapper m_all_algos_map;

@@ -224,6 +303,8 @@ public:
    AlgoChanwise chanwise;
    AlgoGroupConvGeneral group;
    AlgoBFloat16 bfloat16;
    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;

    std::vector<AlgoBase*>
            //! all algorithms
--- a/dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp
@@ -0,0 +1,172 @@
 /**
 * \file
 * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float16_nchw_hmma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/cuda/convolution/backward_filter/algo.h"
 #include "src/cuda/cutlass/singleton.h"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace cutlass::library;

 const void* ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
        get_available_op(const SizeArgs& args) const {
    auto get_alignment = [](const TensorLayout& layout) {
        int alignment = 0;
        int width = layout.dtype.size(layout[3]);
        for (int candidate : {16, 4, 2}) {
            if (width % candidate == 0) {
                alignment = candidate;
                break;
            }
        }
        alignment /= layout.dtype.size(1);
        return alignment;
    };
    int alignment_src = get_alignment(*args.src_layout);
    int alignment_diff = get_alignment(*args.diff_layout);
    megdnn_assert(alignment_src >= 1 && alignment_diff >= 1);
    NumericTypeID accumulator_dtype =
            args.opr->param().compute_mode == param::Convolution::ComputeMode::DEFAULT
                    ? NumericTypeID::kF16
                    : NumericTypeID::kF32;
    ConvolutionKey key{
            cutlass::conv::Operator::kWgrad,
            NumericTypeID::kF16,        // src tensor data type
            LayoutTypeID::kTensorNCHW,  // src tensor layout
            NumericTypeID::kF16,        // diff tensor data type
            LayoutTypeID::kTensorNCHW,  // diff tensor layout
            NumericTypeID::kF32,        // grad tensor data type
            LayoutTypeID::kTensorNCHW,  // grad tensor layout
            NumericTypeID::kF32,        // dummy argument, not used.
            LayoutTypeID::kTensorNCHW,  // dummy argument, not used
            accumulator_dtype,
            cutlass::conv::ConvType::kDepthwiseConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            m_algo_param.instruction_m,
            m_algo_param.instruction_n,
            m_algo_param.instruction_k,
            cutlass::epilogue::EpilogueType::kLinearCombination,  // no bias
            m_algo_param.stage,
            cutlass::conv::SpecialOptimizeDesc::NONE,
            alignment_src,
            alignment_diff,
            true};
    return (void*)Singleton::get().operation_table.find_op(key);
 }

 bool ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
        is_available(const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(7, 0));
    RETURN_IF_FALSE(
            args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
            args.grad_layout->is_contiguous());
    using Param = param::Convolution;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using ComputeMode = Param::ComputeMode;
    auto&& param = args.opr->param();
    auto&& fm = args.grad_filter_meta;
    RETURN_IF_FALSE(param.compute_mode == ComputeMode::FLOAT32);
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.diff_layout->dtype.enumv() == DTypeEnum::Float16 &&
            args.grad_layout->dtype.enumv() == DTypeEnum::Float16);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_available_op(args);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 size_t ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::
        get_workspace_in_bytes(const SizeArgs& args) const {
    auto layout = *args.grad_layout;
    // modify data type
    layout.modify_dtype_inplace(dtype::Float32());
    return layout.span().dist_byte();
 }

 void ConvolutionBackwardFilterImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.grad_filter_meta;
    int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
        wo = args.diff_layout->operator[](3);
    int co = fm.group, ci = co, groups = co;
    int fh = fm.spatial[0], fw = fm.spatial[1];
    int sh = fm.stride[0], sw = fm.stride[1];
    int ph = fm.padding[0], pw = fm.padding[1];
    int dh = param.dilate_h, dw = param.dilate_w;

    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = 0.f;

    const Operation* op = (const Operation*)get_available_op(args);

    cutlass::conv::Conv2dProblemSize problem_size{
            n,      hi, wi, ci, co, fh, fw, ho,
            wo,     ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
            1,       // split k slices, always 1
            groups,  // groups
    };

    cutlass::library::ConvolutionArguments conv_args{
            problem_size,
            args.src_tensor->raw_ptr(),
            args.diff_tensor->raw_ptr(),
            nullptr,
            nullptr,
            args.workspace.raw_ptr,
            &alpha,
            &beta,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr};

    cutlass_check(op->run(&conv_args, nullptr, stream));

    after_kernel_launch();

    auto&& typecvt = args.opr->handle()->create_operator<TypeCvt>();
    auto f32_grad_layout = *args.grad_layout;
    // modify data type
    f32_grad_layout.modify_dtype_inplace(dtype::Float32());
    TensorND src{args.workspace.raw_ptr, f32_grad_layout},
            dst{args.grad_tensor->raw_ptr(), *args.grad_layout};
    typecvt->exec(src, dst);
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp
@@ -0,0 +1,135 @@
 /**
 * \file
 * dnn/src/cuda/convolution/backward_filter/implicit_batched_gemm_float32_nchw_fma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

 #include "src/cuda/convolution/backward_filter/algo.h"
 #include "src/cuda/cutlass/singleton.h"
 #include "src/cuda/utils.h"

 using namespace megdnn;
 using namespace cuda;
 using namespace cutlass::library;

 const void* ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::
        get_available_op(const SizeArgs& args) const {
    ConvolutionKey key{
            cutlass::conv::Operator::kWgrad,
            NumericTypeID::kF32,        // src tensor data type
            LayoutTypeID::kTensorNCHW,  // src tensor layout
            NumericTypeID::kF32,        // diff tensor data type
            LayoutTypeID::kTensorNCHW,  // diff tensor layout
            NumericTypeID::kF32,        // grad tensor data type
            LayoutTypeID::kTensorNCHW,  // grad tensor layout
            NumericTypeID::kF32,        // dummy argument, not used.
            LayoutTypeID::kTensorNCHW,  // dummy argument, not used
            NumericTypeID::kF32,
            cutlass::conv::ConvType::kDepthwiseConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            1,
            1,
            1,
            cutlass::epilogue::EpilogueType::kLinearCombination,  // no bias
            m_algo_param.stage,
            cutlass::conv::SpecialOptimizeDesc::NONE,
            1,
            1,
            true};
    return (void*)Singleton::get().operation_table.find_op(key);
 }

 bool ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
        const SizeArgs& args) const {
 #define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;
    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
    RETURN_IF_FALSE(
            args.src_layout->is_contiguous() && args.diff_layout->is_contiguous() &&
            args.grad_layout->is_contiguous());
    using Param = param::Convolution;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    auto&& param = args.opr->param();
    auto&& fm = args.grad_filter_meta;
    RETURN_IF_FALSE(
            param.format == Format::NCHW &&
            args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.diff_layout->dtype.enumv() == DTypeEnum::Float32 &&
            args.grad_layout->dtype.enumv() == DTypeEnum::Float32);
    RETURN_IF_FALSE(param.sparse == Sparse::GROUP);
    RETURN_IF_FALSE(param.mode == Mode::CROSS_CORRELATION);
    // check if channelwise convolution
    RETURN_IF_FALSE(fm.icpg == 1 && fm.ocpg == 1);
    RETURN_IF_FALSE(param.dilate_h == 1 && param.dilate_w == 1);
    const auto* op = get_available_op(args);
    RETURN_IF_FALSE(op != nullptr);
    return true;
 #undef RETURN_IF_FALSE
 }

 void ConvolutionBackwardFilterImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.grad_filter_meta;
    int hi = args.src_layout->operator[](2), wi = args.src_layout->operator[](3);
    int n = args.diff_layout->operator[](0), ho = args.diff_layout->operator[](2),
        wo = args.diff_layout->operator[](3);
    int co = fm.group, ci = co, groups = co;
    int fh = fm.spatial[0], fw = fm.spatial[1];
    int sh = fm.stride[0], sw = fm.stride[1];
    int ph = fm.padding[0], pw = fm.padding[1];
    int dh = param.dilate_h, dw = param.dilate_w;

    // check if channelwise convolution
    megdnn_assert(fm.icpg == 1 && fm.ocpg == 1);
    auto&& stream = cuda_stream(args.opr->handle());

    float alpha = 1.f;
    float beta = 0.f;

    const Operation* op = (const Operation*)get_available_op(args);

    cutlass::conv::Conv2dProblemSize problem_size{
            n,      hi, wi, ci, co, fh, fw, ho,
            wo,     ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation,
            1,       // split k slices, always 1
            groups,  // groups
    };

    cutlass::library::ConvolutionArguments conv_args{
            problem_size,
            args.src_tensor->raw_ptr(),
            args.diff_tensor->raw_ptr(),
            nullptr,
            nullptr,
            args.grad_tensor->raw_ptr(),
            &alpha,
            &beta,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr};

    cutlass_check(op->run(&conv_args, nullptr, stream));

    after_kernel_launch();
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution/chanwise/bwd_large_filter.cu
+++ b/dnn/src/cuda/convolution/chanwise/bwd_large_filter.cu
@@ -0,0 +1,54 @@
 /**
 * \file dnn/src/cuda/conv_bias/chanwise/bwd_large_filter.cu
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

 #include "./kern.cuh"
 #include "./kern_helper.cuh"
 #include "cuda.h"
 #include "cuda_fp16.h"
 #include "src/cuda/convolution/chanwise/launch_config.cuh"
 #include "src/cuda/fp16_help.cuh"

 using namespace megdnn;
 using namespace cuda;
 using namespace convolution;
 using namespace chanwise;

 #include "src/cuda/conv_bias/chanwise/depthwise_large_filter_algo.cuh"

 namespace megdnn {
 namespace cuda {
 namespace convolution {
 namespace chanwise {

 // =====================================fwd=====================================

 template <>
 void run_bwd_depthwise_large_filter(
        float* dst, const float* src, const float* flt, const Param& param,
        cudaStream_t stream) {
    INSTANCE(float, float2, DepthwiseConv2dDirection::DIRECTION_BACKWARD)
 }

 #if CUDA_VERSION >= 9000
 template <>
 void run_bwd_depthwise_large_filter(
        __half* dst, const __half* src, const __half* flt, const Param& param,
        cudaStream_t stream) {
    INSTANCE(__half, __half2, DepthwiseConv2dDirection::DIRECTION_BACKWARD)
 }
 #endif

 }  // namespace chanwise
 }  // namespace convolution
 }  // namespace cuda
 }  // namespace megdnn

 // vim: syntax=cuda.doxygen
--- a/dnn/src/cuda/convolution/chanwise/kern.cuh
+++ b/dnn/src/cuda/convolution/chanwise/kern.cuh
@@ -27,8 +27,10 @@ namespace chanwise {
 struct Param {
    uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h,
            pad_w, stride_h, stride_w, dilation_h, dilation_w;
    bool is_compute_deafult;
 #if MEGDNN_CC_HOST
    static Param from_fwd_args(const ForwardSizeArgs& args) {
    static Param from_fwd_args(
            const ForwardSizeArgs& args, bool is_compute_deafult_ = true) {
 #define U(v) static_cast<uint32_t>(v)
        auto&& src = args.src_layout->shape;
        auto&& dst = args.dst_layout->shape;
@@ -42,11 +44,12 @@ struct Param {
            hw_pos = 1;
        }
        return {
                U(src[0]),          U(src[c_pos]),     U(src[hw_pos]),
                U(src[hw_pos + 1]), U(fm.ocpg),        U(fm.spatial[0]),
                U(fm.spatial[1]),   U(dst[hw_pos]),    U(dst[hw_pos + 1]),
                U(fm.padding[0]),   U(fm.padding[1]),  U(fm.stride[0]),
                U(fm.stride[1]),    U(fm.dilation[0]), U(fm.dilation[1]),
                U(src[0]),           U(src[c_pos]),     U(src[hw_pos]),
                U(src[hw_pos + 1]),  U(fm.ocpg),        U(fm.spatial[0]),
                U(fm.spatial[1]),    U(dst[hw_pos]),    U(dst[hw_pos + 1]),
                U(fm.padding[0]),    U(fm.padding[1]),  U(fm.stride[0]),
                U(fm.stride[1]),     U(fm.dilation[0]), U(fm.dilation[1]),
                is_compute_deafult_,
        };
 #undef U
    }
@@ -63,6 +66,10 @@ void run_bwd_data(
        T* src_grad, const T* dst_grad, const T* flt, const Param& param,
        cudaStream_t stream);

 template <typename T>
 void run_bwd_depthwise_large_filter(
        T* dst, const T* src, const T* flt, const Param& param, cudaStream_t stream);

 template <typename T>
 void run_bwd_filter(
        T* filter_grad, const T* src, const T* dst_grad, const Param& param,
--- a/dnn/src/cuda/convolution/opr_impl.cpp
+++ b/dnn/src/cuda/convolution/opr_impl.cpp
@@ -115,6 +115,22 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
                const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);

    //! choose for large kernel cases
    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
    size_t ho = diff[2], wo = diff[3];
    const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
                                       ho <= 2 * fh && wo <= 2 * fw;
    if (prefer_dnn_lk_implbmm) {
 #if CUDA_VERSION >= 10020
        if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
            return &sm_algo_pack.implbmm_nchw_hmma[0];
 #endif
        if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
            return &sm_algo_pack.implbmm_nchw_fma[0];
    }

    if (args.filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
@@ -242,6 +258,23 @@ ConvolutionBackwardFilterImpl::Algorithm* ConvolutionBackwardFilterImpl::
                const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, src, diff, grad);

    //! choose for large kernel cases
    size_t fh = args.grad_filter_meta.spatial[0], fw = args.grad_filter_meta.spatial[1];
    size_t ho = diff[2], wo = diff[3];
    const bool prefer_dnn_lk_implbmm =
            args.grad_filter_meta.format == Param::Format::NCHW && ho <= 2 * fh &&
            wo <= 2 * fw;
    if (prefer_dnn_lk_implbmm) {
 #if CUDA_VERSION >= 10020
        if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
            return &sm_algo_pack.implbmm_nchw_hmma[0];
 #endif
        if (sm_algo_pack.implbmm_nchw_fma[0].is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
            return &sm_algo_pack.implbmm_nchw_fma[0];
    }

    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
--- a/dnn/src/cuda/convolution/opr_impl.h
+++ b/dnn/src/cuda/convolution/opr_impl.h
@@ -97,11 +97,14 @@ public:
    class AlgoMatmul;
    class AlgoChanwise;
    class AlgoChanwiseSmall;
    class AlgoDepthwiseLargeFilter;
    class AlgoGroupConvGeneral;
    class AlgoBFloat16;
    class AlgoInt8NCHW4DotProdImplicitGemm;
    class AlgoInt8NCHWDotProdImplicitGemm;
    class AlgoInt8NHWCIMMAImplicitGemm;
    class AlgoFloat32NCHWFMAImplicitBatchedGemm;
    class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

    class AlgoPack;

@@ -154,6 +157,8 @@ public:
    class AlgoChanwise;
    class AlgoGroupConvGeneral;
    class AlgoBFloat16;
    class AlgoFloat32NCHWFMAImplicitBatchedGemm;
    class AlgoFloat16NCHWHMMAImplicitBatchedGemm;

    class AlgoPack;

--- a/dnn/src/cuda/cutlass/convolution_operation.h
+++ b/dnn/src/cuda/cutlass/convolution_operation.h
@@ -135,6 +135,15 @@ namespace detail {
 template <typename EpilogueOp, epilogue::EpilogueType type>
 struct init_epilogue_param_;

 template <typename EpilogueOp>
 struct init_epilogue_param_<EpilogueOp, epilogue::EpilogueType::kLinearCombination> {
    using ElementCompute = typename EpilogueOp::ElementCompute;
    typename EpilogueOp::Params get(ConvolutionArguments const* conv_args) {
        return {*static_cast<ElementCompute const*>(conv_args->alpha),
                *static_cast<ElementCompute const*>(conv_args->beta)};
    }
 };

 template <typename EpilogueOp>
 struct init_epilogue_param_<
        EpilogueOp, epilogue::EpilogueType::kBiasAddLinearCombination> {
@@ -290,6 +299,159 @@ public:

 ///////////////////////////////////////////////////////////////////////////////////////////////////

 /// We add a new template class to handle convolution backward filter operation, because
 /// the device-level convolution operator of backward filter is different from the
 /// others (convolution forward and convolution backward data).
 /// But the description object is reused in this wrapper of convolution backward filter.
 /// The reason is that we do not want to introduce an another unnecessary structure.
 /// TODO: Maybe the device-level operator in cutlass for convoluton forward, backward
 /// data and backward filter should be combined.
 template <typename Operator_>
 class ConvolutionBackwardFilterOperationBase : public Operation {
 public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementDiff = typename Operator::ElementDiff;
    using LayoutDiff = typename Operator::LayoutDiff;
    using ElementGrad = typename Operator::ElementGrad;
    using LayoutGrad = typename Operator::LayoutGrad;
    using ElementAccumulator = typename Operator::ElementAccumulator;

    ConvolutionBackwardFilterOperationBase(char const* name = "unknown_convolution") {
        m_description.name = name;
        m_description.provider = Provider::kCUTLASS;
        m_description.kind = OperationKind::kConvolution;
        m_description.conv_op = Operator::kConvolutionalOperator;

        m_description.tile_description.threadblock_shape = make_Coord(
                Operator::ThreadblockShape::kM, Operator::ThreadblockShape::kN,
                Operator::ThreadblockShape::kK);

        m_description.tile_description.threadblock_stages = Operator::kStages;

        m_description.tile_description.warp_count = make_Coord(
                Operator::ConvolutionKernel::WarpCount::kM,
                Operator::ConvolutionKernel::WarpCount::kN,
                Operator::ConvolutionKernel::WarpCount::kK);

        m_description.tile_description.math_instruction.instruction_shape = make_Coord(
                Operator::InstructionShape::kM, Operator::InstructionShape::kN,
                Operator::InstructionShape::kK);

        m_description.tile_description.math_instruction.element_accumulator =
                NumericTypeMap<ElementAccumulator>::kId;

        m_description.tile_description.math_instruction.opcode_class =
                OpcodeClassMap<typename Operator::OperatorClass>::kId;

        m_description.tile_description.math_instruction.math_operation =
                MathOperationMap<typename Operator::Operator>::kId;

        m_description.tile_description.minimum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMin;

        m_description.tile_description.maximum_compute_capability =
                ArchMap<typename Operator::ArchTag,
                        typename Operator::OperatorClass>::kMax;

        /// src in description -> src in C++ template
        m_description.src =
                make_TensorDescription<ElementSrc, LayoutSrc>(Operator::kAlignmentSrc);
        /// filter in description -> diff in C++ template
        m_description.filter = make_TensorDescription<ElementDiff, LayoutDiff>(
                Operator::kAlignmentDiff);
        /// dst in description -> grad in C++ template
        m_description.dst = make_TensorDescription<ElementGrad, LayoutGrad>(
                Operator::kAlignmentGrad);
        /// because bias tensor is not used in ConvolutionBackwardFilter operation, the
        /// following tensor description is a dummy arguments
        m_description.bias = make_TensorDescription<ElementGrad, LayoutGrad>(
                Operator::kAlignmentGrad);

        m_description.convolution_type = Operator::kConvolutionType;
        m_description.arch_tag = ArchTagMap<typename Operator::ArchTag>::kId;

        m_description.epilogue_type = Operator::EpilogueOutputOp::kType;
        m_description.epilogue_count = Operator::EpilogueOutputOp::kCount;

        m_description.threadblock_swizzle =
                ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId;

        m_description.special_optimization = Operator::kSpecialOpt;
        m_description.gemm_mode = Operator::kGemmMode;
        /// ConvolutionBackwardFilter operation is only used for depthwise convolution,
        /// so the option without_shared_load is always true
        m_description.without_shared_load = true;
    }

    virtual OperationDescription const& description() const { return m_description; }

 protected:
    ConvolutionDescription m_description;
 };

 ///////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename Operator_>
 class ConvolutionBackwardFilterOperation
        : public ConvolutionBackwardFilterOperationBase<Operator_> {
 public:
    using Operator = Operator_;
    using ElementSrc = typename Operator::ElementSrc;
    using LayoutSrc = typename Operator::LayoutSrc;
    using ElementDiff = typename Operator::ElementDiff;
    using LayoutDiff = typename Operator::LayoutDiff;
    using ElementGrad = typename Operator::ElementGrad;
    using LayoutGrad = typename Operator::LayoutGrad;
    using ElementAccumulator = typename Operator::ElementAccumulator;
    using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;

    using OperatorArguments = typename Operator::Arguments;

    ConvolutionBackwardFilterOperation(char const* name = "unknown_gemm")
            : ConvolutionBackwardFilterOperationBase<Operator_>(name) {}

    virtual Status run(
            void const* arguments_ptr, void* device_workspace = nullptr,
            cudaStream_t stream = nullptr) const {
        cutlass::conv::Operator conv_op = this->m_description.conv_op;
        ConvolutionArguments const* conv_args =
                reinterpret_cast<ConvolutionArguments const*>(arguments_ptr);
        const auto& ps = conv_args->problem_size;

        OperatorArguments args;
        args.problem_size = ps;
        /// src in convolution arguments -> ref_src
        args.ref_src = {
                static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)),
                LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))};
        /// filter in convolution arguments -> ref_diff
        args.ref_diff = {
                static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)),
                LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))};
        /// dst in convolution arguments -> ref_grad
        args.ref_grad = {
                static_cast<ElementGrad*>(conv_args->dst),
                LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))};

        args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get(
                conv_args);

        Operator op;
        Status status = op.initialize(args, device_workspace);

        if (status != Status::kSuccess) {
            return status;
        }

        return op.run(stream);
    }
 };

 ///////////////////////////////////////////////////////////////////////////////////////////////////

 }  // namespace library
 }  // namespace cutlass

--- a/dnn/src/cuda/cutlass/initialize_all.cu
+++ b/dnn/src/cuda/cutlass/initialize_all.cu
@@ -44,6 +44,11 @@ namespace cutlass {
 namespace library {

 /////////////////////////////////////////////////////////////////////////////////////////////////
 #if ((__CUDACC_VER_MAJOR__ > 10) || \
     (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))
 #define CUTLASS_ARCH_MMA_SM70_SUPPORTED 1
 #endif

 #if ((__CUDACC_VER_MAJOR__ > 10) || \
     (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))
 #define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1
@@ -54,8 +59,16 @@ namespace library {
 void initialize_all_gemm_simt_operations(Manifest& manifest);
 void initialize_all_conv2d_simt_operations(Manifest& manifest);
 void initialize_all_deconv_simt_operations(Manifest& manifest);
 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
 void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest);
 void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest);
 void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest);
 #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
 void initialize_all_gemm_tensorop884_operations(Manifest& manifest);
 void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest);
 void initialize_all_dwconv2d_dgrad_tensorop884_operations(Manifest& manifest);
 void initialize_all_dwconv2d_wgrad_tensorop884_operations(Manifest& manifest);
 #endif
 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
 void initialize_all_gemm_tensorop1688_operations(Manifest& manifest);
 void initialize_all_conv2d_tensorop8816_operations(Manifest& manifest);
 void initialize_all_conv2d_tensorop8832_operations(Manifest& manifest);
@@ -66,8 +79,16 @@ void initialize_all(Manifest& manifest) {
    initialize_all_gemm_simt_operations(manifest);
    initialize_all_conv2d_simt_operations(manifest);
    initialize_all_deconv_simt_operations(manifest);
 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
    initialize_all_dwconv2d_fprop_simt_operations(manifest);
    initialize_all_dwconv2d_dgrad_simt_operations(manifest);
    initialize_all_dwconv2d_wgrad_simt_operations(manifest);
 #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED
    initialize_all_gemm_tensorop884_operations(manifest);
    initialize_all_dwconv2d_fprop_tensorop884_operations(manifest);
    initialize_all_dwconv2d_dgrad_tensorop884_operations(manifest);
    initialize_all_dwconv2d_wgrad_tensorop884_operations(manifest);
 #endif
 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) && CUTLASS_ARCH_MMA_SM75_SUPPORTED
    initialize_all_gemm_tensorop1688_operations(manifest);
    initialize_all_conv2d_tensorop8816_operations(manifest);
    initialize_all_conv2d_tensorop8832_operations(manifest);
--- a/dnn/src/cuda/cutlass/library.h
+++ b/dnn/src/cuda/cutlass/library.h
@@ -223,6 +223,9 @@ enum class ThreadblockSwizzleID {
    kConvolutionFpropTrans,
    kConvolutionDgradNCxHWx,
    kConvolutionDgradTrans,
    kDepthwiseConvolutionFprop,
    kDepthwiseConvolutionDgrad,
    kDepthwiseConvolutionWgrad,
    kInvalid
 };

--- a/dnn/src/cuda/cutlass/library_internal.h
+++ b/dnn/src/cuda/cutlass/library_internal.h
@@ -570,6 +570,27 @@ struct ThreadblockSwizzleMap<
            ThreadblockSwizzleID::kConvolutionDgradTrans;
 };

 template <>
 struct ThreadblockSwizzleMap<
        conv::threadblock::DepthwiseConvolutionFpropThreadblockSwizzle> {
    static ThreadblockSwizzleID const kId =
            ThreadblockSwizzleID::kDepthwiseConvolutionFprop;
 };

 template <>
 struct ThreadblockSwizzleMap<
        conv::threadblock::DepthwiseConvolutionDgradThreadblockSwizzle> {
    static ThreadblockSwizzleID const kId =
            ThreadblockSwizzleID::kDepthwiseConvolutionDgrad;
 };

 template <>
 struct ThreadblockSwizzleMap<
        conv::threadblock::DepthwiseConvolutionWgradThreadblockSwizzle> {
    static ThreadblockSwizzleID const kId =
            ThreadblockSwizzleID::kDepthwiseConvolutionWgrad;
 };

 /////////////////////////////////////////////////////////////////////////////////////////////////

 template <typename Element, typename Layout>
--- a/dnn/src/cuda/cutlass/operation_table.cpp
+++ b/dnn/src/cuda/cutlass/operation_table.cpp
@@ -99,6 +99,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)
    key.layout_dst = desc.dst.layout;
    key.element_bias = desc.bias.element;
    key.layout_bias = desc.bias.layout;
    key.element_accumulator =
            desc.tile_description.math_instruction.element_accumulator;

    key.convolution_type = desc.convolution_type;

@@ -124,6 +126,8 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc)

    key.stages = desc.tile_description.threadblock_stages;
    key.special_optimization = desc.special_optimization;
    key.alignment_src = desc.src.alignment;
    key.alignment_filter = desc.filter.alignment;
    key.without_shared_load = desc.without_shared_load;

    return key;
--- a/dnn/src/cuda/cutlass/operation_table.h
+++ b/dnn/src/cuda/cutlass/operation_table.h
@@ -188,6 +188,7 @@ struct ConvolutionKey {
    library::LayoutTypeID layout_dst;
    library::NumericTypeID element_bias;
    library::LayoutTypeID layout_bias;
    NumericTypeID element_accumulator;

    conv::ConvType convolution_type;

@@ -206,6 +207,10 @@ struct ConvolutionKey {
    epilogue::EpilogueType epilogue_type;
    int stages;
    conv::SpecialOptimizeDesc special_optimization;

    int alignment_src;
    int alignment_filter;

    bool without_shared_load;

    inline bool operator==(ConvolutionKey const& rhs) const {
@@ -215,6 +220,7 @@ struct ConvolutionKey {
               (layout_filter == rhs.layout_filter) &&
               (element_dst == rhs.element_dst) && (layout_dst == rhs.layout_dst) &&
               (element_bias == rhs.element_bias) && (layout_bias == rhs.layout_bias) &&
               (element_accumulator == rhs.element_accumulator) &&
               (convolution_type == rhs.convolution_type) &&
               (threadblock_shape_m == rhs.threadblock_shape_m) &&
               (threadblock_shape_n == rhs.threadblock_shape_n) &&
@@ -227,6 +233,8 @@ struct ConvolutionKey {
               (instruction_shape_k == rhs.instruction_shape_k) &&
               (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
               (special_optimization == rhs.special_optimization) &&
               (alignment_src == rhs.alignment_src) &&
               (alignment_filter == rhs.alignment_filter) &&
               (without_shared_load == rhs.without_shared_load);
    }

@@ -254,6 +262,7 @@ struct ConvolutionKey {
               "\n    layout_dst: " + to_string(layout_dst) +
               "\n    element_bias: " + to_string(element_bias) +
               "\n    layout_bias: " + to_string(layout_bias) +
               "\n    element_accumulator: " + to_string(element_accumulator) +
               "\n    convolution_type: " + to_string(convolution_type) +
               "\n    threadblock_shape: " + threadblock_shape_str +
               "\n    warp_shape: " + warp_shape_str +
@@ -261,6 +270,8 @@ struct ConvolutionKey {
               "\n    epilogue_type: " + to_string(epilogue_type) +
               "\n    stages: " + std::to_string(stages) +
               "\n    special_optimization: " + to_string(special_optimization) +
               "\n    alignment_src: " + std::to_string(alignment_src) +
               "\n    alignment_filter: " + std::to_string(alignment_filter) +
               "\n    without_shared_load: " + to_string(without_shared_load) + "\n}";
    }
 };
@@ -268,7 +279,6 @@ struct ConvolutionKey {
 struct ConvolutionKeyHasher {
    inline size_t operator()(ConvolutionKey const& key) const {
        return Hash()
                .update(&key.conv_op, sizeof(key.conv_op))
                .update(&key.conv_op, sizeof(key.conv_op))
                .update(&key.element_src, sizeof(key.element_src))
                .update(&key.layout_src, sizeof(key.layout_src))
@@ -278,6 +288,7 @@ struct ConvolutionKeyHasher {
                .update(&key.layout_dst, sizeof(key.layout_dst))
                .update(&key.element_bias, sizeof(key.element_bias))
                .update(&key.layout_bias, sizeof(key.layout_bias))
                .update(&key.element_accumulator, sizeof(key.element_accumulator))
                .update(&key.convolution_type, sizeof(key.convolution_type))
                .update(&key.threadblock_shape_m, sizeof(key.threadblock_shape_m))
                .update(&key.threadblock_shape_n, sizeof(key.threadblock_shape_n))
@@ -291,6 +302,8 @@ struct ConvolutionKeyHasher {
                .update(&key.epilogue_type, sizeof(key.epilogue_type))
                .update(&key.stages, sizeof(key.stages))
                .update(&key.special_optimization, sizeof(key.special_optimization))
                .update(&key.alignment_src, sizeof(key.alignment_src))
                .update(&key.alignment_filter, sizeof(key.alignment_filter))
                .update(&key.without_shared_load, sizeof(key.without_shared_load))
                .digest();
    }
--- a/dnn/src/cuda/cutlass/util.cu
+++ b/dnn/src/cuda/cutlass/util.cu
@@ -1322,6 +1322,8 @@ static struct {
        {"batch_convolution", "BatchConvolution", conv::ConvType::kBatchConvolution},
        {"local", "Local", conv::ConvType::kLocal},
        {"local_share", "LocalShare", conv::ConvType::kLocalShare},
        {"depthwise_convolution", "DepthwiseConvolution",
         conv::ConvType::kDepthwiseConvolution},
 };

 /// Converts a ConvType enumerant to a string
--- a/dnn/src/cuda/fp16_help.cuh
+++ b/dnn/src/cuda/fp16_help.cuh
@@ -45,6 +45,21 @@ fma2(const __half2 a, const __half2 b, const __half2 c) {
 #endif
 }

 __device__ __forceinline__ __half2 hadd2(const __half2 a, const __half2 b) {
 #if __CUDA_ARCH__ >= 530
    return __hadd2(a, b);
 #else
    return {__float2half(__half2float(a.x) + __half2float(b.x)),
            __float2half(__half2float(a.y) + __half2float(b.y))};
 #endif
 }

 __device__ __forceinline__ float2
 fma2(const __half2 a, const __half2 b, const float2 c) {
    return {__half2float(a.x) * __half2float(b.x) + c.x,
            __half2float(a.y) * __half2float(b.y) + c.y};
 }

 #endif  // CUDA_VERSION >= 9000

 }  // namespace cuda
--- a/dnn/src/cuda/matrix_mul/algos.cpp
+++ b/dnn/src/cuda/matrix_mul/algos.cpp
@@ -44,7 +44,7 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
    for (auto&& algo : simt_float32_gemv_batched_strided) {
        all_algos.push_back(&algo);
    }
 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
    for (auto&& algo : tensorop_float16) {
        all_algos.push_back(&algo);
    }
@@ -113,21 +113,26 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
    simt_float32_gemv_batched_strided.emplace_back(128);
    simt_float32_gemv_batched_strided.emplace_back(64);
    simt_float32_gemv_batched_strided.emplace_back(32);
 #define FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb) \
    cb(256, 128, 32, 64, 64, 32, 8, 8, 4);    \
    cb(128, 256, 32, 64, 64, 32, 8, 8, 4);    \
    cb(128, 128, 32, 64, 64, 32, 8, 8, 4);    \
    cb(256, 128, 32, 64, 64, 32, 16, 8, 8);   \
    cb(128, 256, 32, 64, 64, 32, 16, 8, 8);   \
 #define FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb) \
    cb(256, 128, 32, 64, 64, 32, 8, 8, 4);         \
    cb(128, 256, 32, 64, 64, 32, 8, 8, 4);         \
    cb(128, 128, 32, 64, 64, 32, 8, 8, 4);
 #define FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb) \
    cb(256, 128, 32, 64, 64, 32, 16, 8, 8);        \
    cb(128, 256, 32, 64, 64, 32, 16, 8, 8);        \
    cb(128, 128, 32, 64, 64, 32, 16, 8, 8);
 #define cb(...)                                            \
    tensorop_float16.emplace_back(AlgoParam{__VA_ARGS__}); \
    tensorop_float16_split_k.emplace_back(AlgoParam{__VA_ARGS__});
 #if CUDA_VERSION >= 10010
    FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES(cb)
 #endif
 #if CUDA_VERSION >= 10020
    FOREACH_CUTLASS_MATMUL_F16_SHAPES(cb)
    FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES(cb)
 #endif
 #undef cb
 #undef FOREACH_CUTLASS_MATMUL_F16_SHAPES
 #undef FOREACH_CUTLASS_MATMUL_MMA_SM70_SHAPES
 #undef FOREACH_CUTLASS_MATMUL_MMA_SM75_SHAPES
 }
 #endif

--- a/dnn/src/cuda/matrix_mul/algos.h
+++ b/dnn/src/cuda/matrix_mul/algos.h
@@ -350,7 +350,7 @@ private:
    std::string m_name;
 };

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 class MatrixMulForwardImpl::AlgoFloat16TensorOp final
        : public AlgoCutlassMatrixMulBase {
 public:
@@ -418,7 +418,7 @@ public:
    std::vector<AlgoFloat32SIMT> simt_float32;
    std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
    std::vector<AlgoFloat32SIMTGemvBatchedStrided> simt_float32_gemv_batched_strided;
 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
    std::vector<AlgoFloat16TensorOp> tensorop_float16;
    std::vector<AlgoFloat16TensorOpSplitK> tensorop_float16_split_k;
 #endif
--- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp
+++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop.cpp
@@ -15,7 +15,7 @@
 #include "src/cuda/matrix_mul/algos.h"
 #include "src/cuda/utils.h"

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 using namespace megdnn;
 using namespace cuda;

--- a/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp
+++ b/dnn/src/cuda/matrix_mul/cutlass_float16_tensorop_split_k.cpp
@@ -15,7 +15,7 @@
 #include "src/cuda/matrix_mul/algos.h"
 #include "src/cuda/utils.h"

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 using namespace megdnn;
 using namespace cuda;

--- a/dnn/src/cuda/matrix_mul/opr_impl.h
+++ b/dnn/src/cuda/matrix_mul/opr_impl.h
@@ -46,8 +46,10 @@ public:
    class AlgoFloat32SIMT;
    class AlgoFloat32SIMTSplitK;
    class AlgoFloat32SIMTGemvBatchedStrided;
 #if CUDA_VERSION >= 10010
    class AlgoFloat16TensorOp;
    class AlgoFloat16TensorOpSplitK;
 #endif
 #endif
    class AlgoPack;

--- a/dnn/src/cuda/padding/opr_impl.cpp
+++ b/dnn/src/cuda/padding/opr_impl.cpp
@@ -35,6 +35,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
                param().padding_val, stream);                            \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
    MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
 }

--- a/dnn/src/cuda/padding/padding.cu
+++ b/dnn/src/cuda/padding/padding.cu
@@ -60,7 +60,8 @@ __global__ void paddingConst_kernel(
                        params.src_stride[dim].divisor();
            */
        }
        dst[out_index] = in_src_valid_area ? src[in_index] : padding_val;
        dst[out_index] =
                in_src_valid_area ? src[in_index] : static_cast<T>(padding_val);
    }
 }

@@ -256,6 +257,7 @@ void padding_backward_proxy(
            const float_t padding_val, cudaStream_t stream);
 #define cb(DType) INST(typename DTypeTrait<DType>::ctype)
 MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
 MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
 #undef INST

--- a/dnn/src/naive/padding/opr_impl.cpp
+++ b/dnn/src/naive/padding/opr_impl.cpp
@@ -171,7 +171,7 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
    switch (param().padding_mode) {
        case param::Padding::PaddingMode::CONSTANT:
 #define cb(DType)                                                       \
    if (src.layout.dtype == DType()) {                                  \
    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {         \
        using T = typename DTypeTrait<DType>::ctype;                    \
        MEGDNN_DISPATCH_CPU_KERN_OPR(exec_const_internal<T>(            \
                src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params, \
@@ -179,28 +179,31 @@ void PaddingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
        return;                                                         \
    }
            MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
            MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
            break;
        case param::Padding::PaddingMode::REPLICATE:
 #define cb(DType)                                                         \
    if (src.layout.dtype == DType()) {                                    \
    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {           \
        using T = typename DTypeTrait<DType>::ctype;                      \
        MEGDNN_DISPATCH_CPU_KERN_OPR(exec_replicate_internal<T>(          \
                src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \
        return;                                                           \
    }
            MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
            MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
            break;
        case param::Padding::PaddingMode::REFLECT:
 #define cb(DType)                                                         \
    if (src.layout.dtype == DType()) {                                    \
    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {           \
        using T = typename DTypeTrait<DType>::ctype;                      \
        MEGDNN_DISPATCH_CPU_KERN_OPR(exec_reflect_internal<T>(            \
                src.layout.ndim, n, src.ptr<T>(), dst.ptr<T>(), params)); \
        return;                                                           \
    }
            MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
            MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
            break;
        default:
--- a/dnn/test/common/checker.h
+++ b/dnn/test/common/checker.h
@@ -569,6 +569,7 @@ public:
            });
            return ret;
        }
        megdnn_assert(false, "Expected algo not found: %s\n", policy_name.name.c_str());
        return ret;
    }

--- a/dnn/test/cuda/chanwise_convolution.cpp
+++ b/dnn/test/cuda/chanwise_convolution.cpp
@@ -20,6 +20,7 @@
 #include "test/common/workspace_wrapper.h"
 #include "test/cuda/benchmark.h"
 #include "test/cuda/fixture.h"
 #include "test/cuda/utils.h"

 #include <cuda_profiler_api.h>
 #include <cuda_runtime_api.h>
@@ -38,8 +39,10 @@ bool check_need_full_bench() {
 }
 #endif

 Convolution::Param gconv_param(Convolution::Param p) {
 Convolution::Param gconv_param(Convolution::Param p, bool io16xc32 = false) {
    p.sparse = Convolution::Param::Sparse::GROUP;
    if (io16xc32)
        p.compute_mode = Convolution::Param::ComputeMode::FLOAT32;
    return p;
 }

@@ -421,6 +424,209 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER) {
    }
 }

 namespace {
 template <typename Op>
 struct AlgoCheckerMaker {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<Op>(name, require_algo);
    }
 };

 template <>
 struct AlgoCheckerMaker<ConvolutionForward> {
    static auto make(const char* name, bool* require_algo) {
        return AlgoChecker<ConvolutionForward>(
                ExecutionPolicyAlgoName{
                        "DEFAULT",
                        {{ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                  name, {})
                                  .c_str(),
                          {}}}},
                require_algo);
    }
 };

 template <typename Op>
 void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char* name) {
    Checker<Op> checker(handle);
    bool require_algo = false;
    checker.set_before_exec_callback(AlgoCheckerMaker<Op>::make(name, &require_algo));
    checker.set_dtype(0, io_type).set_dtype(1, io_type).set_dtype(2, io_type);
    bool io16xc32 = false;
    if (io_type == dtype::Float16()) {
        if (comp_type == dtype::Float16()) {
            checker.set_epsilon(1e-1);
        } else {
            io16xc32 = true;
        }
    }
    // dispatch testcase by operation
    if (std::is_same<Op, ConvolutionForward>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {2, 1, 1, 15, 15}, {}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {2, 1, 1, 15, 15}, {}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 16, 16}, {2, 1, 1, 15, 15}, {}});
    } else if (std::is_same<Op, ConvolutionBackwardData>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 16, 16}, {8, 2, 16, 16}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 15, 15}, {8, 2, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 14, 14}, {8, 2, 14, 14}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 8, 8}, {8, 2, 16, 16}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{2, 1, 1, 15, 15}, {8, 2, 7, 7}, {8, 2, 14, 14}});
    } else if (std::is_same<Op, ConvolutionBackwardFilter>::value) {
        // align 8
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {8, 2, 16, 16}, {2, 1, 1, 15, 15}});
        // align 1
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 15, 15}, {8, 2, 15, 15}, {2, 1, 1, 15, 15}});
        // align 2
        checker.set_param(gconv_param({M, 7, 7, 1, 1}, io16xc32))
                .execs({{8, 2, 14, 14}, {8, 2, 14, 14}, {2, 1, 1, 15, 15}});
        // custom padding
        checker.set_param(gconv_param({M, 3, 3, 1, 1}, io16xc32))
                .execs({{8, 2, 16, 16}, {8, 2, 8, 8}, {2, 1, 1, 15, 15}});
        // custom stride
        checker.set_param(gconv_param({M, 7, 7, 2, 2}, io16xc32))
                .execs({{8, 2, 14, 14}, {8, 2, 7, 7}, {2, 1, 1, 15, 15}});
    }
 }
 }  // namespace

 #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb) \
    cb(1, 128, 128, 8, 32, 64, 8);                          \
    cb(2, 128, 64, 8, 64, 32, 8);                           \
    cb(3, 128, 32, 8, 64, 32, 8);                           \
    cb(4, 64, 128, 8, 32, 64, 8);                           \
    cb(5, 32, 128, 8, 32, 64, 8);                           \
    cb(6, 64, 64, 8, 32, 64, 8);                            \
    cb(7, 32, 64, 8, 32, 64, 8);                            \
    cb(8, 32, 32, 8, 32, 32, 8);                            \
    cb(9, 64, 32, 8, 64, 32, 8);

 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {               \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionForward>(                                      \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

 #undef cb

 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {         \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionBackwardData>(                                 \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

 #undef cb

 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                       \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_FMA_##tag) {       \
        require_compute_capability(6, 1);                                        \
        check_chanwise<ConvolutionBackwardFilter>(                               \
                dtype::Float32(), dtype::Float32(), handle_cuda(),               \
                "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                              \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)

 #undef cb

 #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL

 #if CUDA_VERSION >= 10010
 #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb) \
    cb(1, 128, 128, 32, 32, 32, 32);                         \
    cb(2, 128, 256, 32, 64, 64, 32);                         \
    cb(3, 128, 64, 32, 32, 32, 32);                          \
    cb(4, 64, 128, 32, 32, 32, 32);                          \
    cb(5, 64, 64, 32, 32, 32, 32);
 #else
 // hmma instruction need cuda version >= 10.2, disable hmma testcases in this path
 #define MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
 #endif

 // check both ioc16 and io16xc32
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {               \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
        check_chanwise<ConvolutionForward>(                                       \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_2stage");                               \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

 #undef cb

 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) {         \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float16(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

 #undef cb

 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                        \
    TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_FILTER_CUTLASS_HMMA_##tag) {       \
        require_compute_capability(7, 0);                                         \
        check_chanwise<ConvolutionBackwardData>(                                  \
                dtype::Float16(), dtype::Float32(), handle_cuda(),                \
                "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
                "_" #wm "X" #wn "X" #wk "_mma8X8X4_2stage");                      \
    }

 MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)

 #undef cb

 #undef MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FWD_HMMA_KERNEL

 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_BENCH_CHECK) {
    auto handle = handle_cuda();
@@ -1123,6 +1329,228 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BWD_FILTER) {
    // clang-format on
 }

 TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_FORWARD_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionForward> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionForward>> proxy{
            new OprProxy<ConvolutionForward>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwith = static_cast<float>(
                                 src.total_nr_elems() + filter.total_nr_elems() +
                                 dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
 }

 TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardData> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
            new OprProxy<ConvolutionBackwardData>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwith = static_cast<float>(
                                 src.total_nr_elems() + filter.total_nr_elems() +
                                 dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({filter, src, src}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp16 = bencher.execs({filter, src, src}) / RUNS;

        bencher.proxy()->target_execution_policy.algo.reset();
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "float16: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) %.2f (fp16/pseudo fp16)\n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwith * 4 / time_in_ms_fp32, time_in_ms_fp16,
               bandwith * 2 / time_in_ms_fp16, time_in_ms_pseudo_fp16,
               bandwith * 2 / time_in_ms_pseudo_fp16, time_in_ms_fp32 / time_in_ms_fp16,
               time_in_ms_pseudo_fp16 / time_in_ms_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
 }

 TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_FILTER_LARGE_KERNEL) {
    CUBenchmarker<ConvolutionBackwardFilter> bencher(handle_cuda());
    size_t RUNS = 100;
    bencher.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    bencher.set_proxy(proxy);

    Convolution::Param param;
    param.format = ConvBias::Param::Format::NCHW;
    param.sparse = Convolution::Param::Sparse::GROUP;
    NormalRNG rng;

    auto run = [&](size_t batch, size_t c, size_t ih, size_t iw, size_t f, size_t s) {
        param.pad_h = f / 2;
        param.pad_w = f / 2;
        param.stride_h = s;
        param.stride_w = s;
        param.compute_mode = param::Convolution::ComputeMode::DEFAULT;

        TensorShape src = {batch, c, ih, iw}, filter = {c, 1, 1, f, f};

        TensorLayout dst_layout;
        auto opr = handle_cuda()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {src, dtype::Float32()}, {filter, dtype::Float32()}, dst_layout);
        float bandwith = static_cast<float>(
                                 src.total_nr_elems() + filter.total_nr_elems() +
                                 dst_layout.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        auto time_in_ms_fp32 = bencher.execs({src, src, filter}) / RUNS;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng);
        bencher.proxy()->target_execution_policy = {};
        param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
        bencher.set_param(param);
        auto time_in_ms_pseudo_fp16 = bencher.execs({src, src, filter}) / RUNS;

        printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
               "pseudo float16: %.2fms %.2fGB/s "
               "speedup: "
               "%0.2f (fp16/fp32) \n",
               s, src.to_string().c_str(), filter.to_string().c_str(), time_in_ms_fp32,
               bandwith * 4 / time_in_ms_fp32, time_in_ms_pseudo_fp16,
               bandwith * 2 / time_in_ms_pseudo_fp16,
               time_in_ms_fp32 / time_in_ms_pseudo_fp16);
    };

    // clang-format off
    for (size_t b : {32, 64})
    for (size_t f : {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}) {
        run(b, 384, 32, 32, f, 1);
        run(b, 384, 64, 64, f, 1);
    }
    // clang-format on
 }
 #endif

 // vim: syntax=cpp.doxygen
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -695,6 +695,78 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_SMALL) {
    }
 }

 TEST_F(CUDA, CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBias>(
            ConvBiasForward::algo_name<ConvBias::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));
    for (auto dtype : std::vector<DType> {
             dtype::Float32(),
 #if CUDA_VERSION >= 9000
                     dtype::Float16()
 #endif
         }) {
        auto run = [&checker, &dtype](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            param::ConvBias cur_param;
            cur_param.mode = param::ConvBias::Mode::CROSS_CORRELATION;
            cur_param.sparse = ConvBias::Param::Sparse::GROUP;
            checker.set_dtype(0, dtype)
                    .set_dtype(1, dtype)
                    .set_dtype(2, dtype)
                    .set_dtype(3, dtype)
                    .set_dtype(4, dtype);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, 2 * scale);
            checker.set_rng(0, &rng)
                    .set_rng(1, &rng)
                    .set_rng(2, &rng)
                    .set_rng(3, &rng)
                    .set_rng(4, &rng);
            if (dtype.enumv() == DTypeEnum::Float16) {
                checker.set_epsilon(1e-1);
            }

            cur_param.pad_h = cur_param.pad_w = padding;
            cur_param.stride_h = cur_param.stride_w = stride;
            checker.set_param(cur_param).execs(
                    {{n, g, h, h}, {g, 1, 1, fh, fh}, {}, {}, {}});
        };
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
        run(4, 8, 64, 5, 5 / 3, 2);
        run(4, 8, 64, 7, 7 / 3, 2);
        run(4, 8, 64, 9, 9 / 3, 2);
        run(4, 8, 64, 11, 11 / 3, 2);
        run(4, 8, 64, 13, 13 / 3, 2);
        run(4, 8, 64, 15, 15 / 3, 2);
        run(4, 8, 64, 17, 17 / 3, 2);
        run(4, 8, 64, 19, 19 / 3, 2);
        run(4, 8, 64, 21, 21 / 3, 2);
        run(4, 8, 64, 23, 23 / 3, 2);
        run(4, 8, 64, 25, 25 / 3, 2);
        run(4, 8, 64, 27, 27 / 3, 2);
        run(4, 8, 64, 29, 29 / 3, 2);
        run(4, 8, 64, 31, 31 / 3, 2);
        run(1, 2, 128, 31, 10, 2);
        run(1, 2, 256, 31, 10, 2);
    }
 }

 TEST_F(CUDA, CONV_BIAS_FORWARD_CHANWISE_8x8x32) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
@@ -1033,7 +1105,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
                            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                                    "CUDA:GROUP_CONV", {})
                                    .c_str(),
                            {{"CUDNN", {}}}}));
                            {{"DEFAULT:CUDNN", {}}}}));
            ConvBias::Param param;
            param.sparse = ConvBias::Param::Sparse::GROUP;
            param.nonlineMode = mode;
@@ -1093,8 +1165,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
        run(2, 32, 7, 7, 3, 3, 64, 1, 1, 1, 1, 1, 1, 4, nlmode);
        // strided case
        run(2, 32, 7, 7, 3, 3, 64, 0, 0, 2, 2, 1, 1, 8, nlmode);
        // dilate conv is supported in CUDNN since version 7.5.0
 #if CUDNN_VERSION >= 7500
        // dilated case
        run(2, 32, 7, 7, 3, 3, 64, 0, 0, 1, 1, 2, 2, 8, nlmode);
 #endif
    }
 }

@@ -1471,6 +1546,132 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_TENSORCORE_INT8) {
    run_bench(256, 512, 7, 7, 512, 3, 3, 1, 1, 1000);
    run_bench(256, 512, 7, 7, 2048, 1, 1, 1, 1, 1000);
 }

 TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP16) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;

    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_dtype(4, dtype::Float16());
        bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo};

        float bandwith = static_cast<float>(
                                 inp.total_nr_elems() + kern.total_nr_elems() +
                                 out.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: "
               "%.2fms, "
               "perf: %.2f Tops bandwidth: %.2fGB/s.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, ops, bandwith * 4 / time_in_ms);
    };

    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
 }

 TEST_F(CUDA, BENCHMARK_CONV_BIAS_FORWARD_DEPTHWISE_LARGE_FILTER_FP32) {
    require_compute_capability(7, 5);
    Benchmarker<ConvBiasForward> bencher(handle_cuda());
    bencher.set_display(false);
    bencher.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                    "DEPTHWISE_LARGE_FILTER", {})
                    .c_str()));

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW;

    using NonlineMode = ConvBias::Param::NonlineMode;
    param.nonlineMode = NonlineMode::IDENTITY;
    param.sparse = ConvBias::Param::Sparse::GROUP;
    auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh,
                         size_t fw, size_t sh, size_t sw, size_t nr_times) {
        param.pad_h = fh / 2;
        param.pad_w = fw / 2;
        param.stride_h = sh;
        param.stride_w = sw;

        bencher.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        bencher.set_times(nr_times);
        size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h);
        size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w);
        TensorShape inp{batch, g, hi, wi}, kern{g, 1, 1, fh, fw}, out{batch, g, ho, wo};

        float bandwith = static_cast<float>(
                                 inp.total_nr_elems() + kern.total_nr_elems() +
                                 out.total_nr_elems()) /
                         (1024 * 1024 * 1024) * 1e3;

        auto time_in_ms = bencher.execs({inp, kern, {}, {}, out}) / nr_times;
        auto ops = 2.0 * batch * g * ho * wo * fh * fw / (time_in_ms * 1e-3) * 1e-12;
        printf("chanwise_depthwise_large_filter: inp=%s, kern=%s, out=%s, time: "
               "%.2fms, "
               "perf: %.2f Tops bandwidth: %.2fGB/s.\n",
               inp.to_string().c_str(), kern.to_string().c_str(),
               out.to_string().c_str(), time_in_ms, ops, bandwith * 4 / time_in_ms);
    };

    run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10);
    run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10);
    run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10);
    run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10);
    run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10);
    run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10);
    run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10);
    run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10);
    run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10);
    run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10);
    run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10);
    run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10);
    run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10);
    run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10);
    run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10);
 }
 #endif
 #endif

--- a/dnn/test/cuda/convolution.cpp
+++ b/dnn/test/cuda/convolution.cpp
@@ -724,6 +724,70 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) {
            TensorLayoutArray{filter, dst, src});
 }

 TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
    Checker<ConvolutionBackwardData> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));
    for (auto dtype : std::vector<DType> {
             dtype::Float32(),
 #if CUDA_VERSION >= 9000
                     dtype::Float16()
 #endif
         }) {
        auto run = [&checker, &dtype](
                           size_t n, size_t g, size_t h, size_t fh, size_t padding,
                           size_t stride) {
            param::Convolution param;
            param.stride_h = param.stride_w = stride;
            param.pad_h = param.pad_w = padding;
            param.mode = Convolution::Mode::CROSS_CORRELATION;
            param.sparse = param::Convolution::Sparse::GROUP;
            checker.set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
            float scale = 64.f / sqrt(fh * fh);
            UniformFloatRNG rng(scale, scale * 2);
            checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &rng);
            if (dtype.enumv() == DTypeEnum::Float16)
                checker.set_epsilon(1e-1);

            checker.set_param(param).execs(
                    {{g, 1, 1, fh, fh},
                     {n, g, (h + 2 * padding - fh + 1) / stride,
                      (h + 2 * padding - fh + 1) / stride},
                     {n, g, h, h}});
        };
        run(4, 8, 32, 5, 5 / 2, 1);
        run(4, 8, 32, 7, 7 / 2, 1);
        run(4, 8, 32, 9, 9 / 2, 1);
        run(4, 8, 32, 11, 11 / 2, 1);
        run(4, 8, 32, 13, 13 / 2, 1);
        run(4, 8, 32, 15, 15 / 2, 1);
        run(4, 8, 32, 17, 17 / 2, 1);
        run(4, 8, 32, 19, 19 / 2, 1);
        run(4, 8, 32, 21, 21 / 2, 1);
        run(4, 8, 32, 23, 23 / 2, 1);
        run(4, 8, 32, 25, 25 / 2, 1);
        run(4, 8, 32, 27, 27 / 2, 1);
        run(4, 8, 32, 29, 29 / 2, 1);
        run(4, 8, 32, 31, 31 / 2, 1);
        run(4, 8, 64, 5, 5 / 2, 2);
        run(4, 8, 64, 7, 7 / 3, 2);
        run(4, 8, 64, 9, 9 / 3, 2);
        run(4, 8, 64, 11, 11 / 3, 2);
        run(4, 8, 64, 13, 13 / 3, 2);
        run(4, 8, 64, 15, 15 / 3, 2);
        run(4, 8, 64, 17, 17 / 3, 2);
        run(4, 8, 64, 19, 19 / 3, 2);
        run(4, 8, 64, 21, 21 / 3, 2);
        run(4, 8, 64, 23, 23 / 3, 2);
        run(4, 8, 64, 25, 25 / 3, 2);
        run(4, 8, 64, 27, 27 / 3, 2);
        run(4, 8, 64, 29, 29 / 3, 2);
        run(4, 8, 64, 31, 31 / 3, 2);
        run(1, 2, 128, 31, 31 / 3, 2);
        run(1, 2, 256, 31, 31 / 3, 2);
    }
 }

 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, CONV_FWD_BENCHMARK) {
    auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
@@ -901,6 +965,104 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) {
    run(32, 64, 64, 56, 56, 1, 1, 0);
 }

 TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP32) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));

    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
 }

 TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_DEPTHWISE_LARGE_FILTER_FP16) {
    CUBenchmarker<ConvolutionBackwardData> bencher{handle_cuda()};
    bencher.set_display(false);
    bencher.set_before_exec_callback(
            AlgoChecker<ConvolutionBackwardData>("DEPTHWISE_LARGE_FILTER"));

    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t nr_times) {
        bencher.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bencher.set_param(param);
        bencher.set_times(nr_times);
        TensorLayout src{{N, g, IH, IW}, dtype::Float16()},
                filter{{g, 1, 1, FH, FH}, dtype::Float16()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp16 = bencher.execl({filter, dst, src}) / nr_times;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp16=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp16,
               (flo / (time_ms_fp16 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 3, 1, 10);
    run(64, 384, 384, 32, 32, 5, 1, 10);
    run(64, 384, 384, 32, 32, 7, 1, 10);
    run(64, 384, 384, 32, 32, 9, 1, 10);
    run(64, 384, 384, 32, 32, 11, 1, 10);
    run(64, 384, 384, 32, 32, 13, 1, 10);
    run(64, 384, 384, 32, 32, 15, 1, 10);
    run(64, 384, 384, 32, 32, 17, 1, 10);
    run(64, 384, 384, 32, 32, 19, 1, 10);
    run(64, 384, 384, 32, 32, 21, 1, 10);
    run(64, 384, 384, 32, 32, 23, 1, 10);
    run(64, 384, 384, 32, 32, 25, 1, 10);
    run(64, 384, 384, 32, 32, 27, 1, 10);
    run(64, 384, 384, 32, 32, 29, 1, 10);
    run(64, 384, 384, 32, 32, 31, 1, 10);
 }

 TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) {
    CUBenchmarker<ConvolutionBackwardData> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardData>> proxy{
@@ -1065,6 +1227,46 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) {
    run(32, 512, 1024, 14, 14, 1, 2, 0);
    run(32, 64, 64, 56, 56, 1, 1, 0);
 }

 TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_FILTER_DEPTHWISE_LARGE_FILTER) {
    CUBenchmarker<ConvolutionBackwardFilter> bench{handle_cuda()};
    std::unique_ptr<OprProxy<ConvolutionBackwardFilter>> proxy{
            new OprProxy<ConvolutionBackwardFilter>{true}};
    size_t RUNS = 10;
    bench.set_proxy(proxy).set_times(RUNS);

    bench.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>(
            "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFTv7.6.3"));

    auto run = [&](size_t N, size_t OC, size_t g, size_t IH, size_t IW, size_t FH,
                   size_t SH, size_t PH) {
        bench.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        param::Convolution param;
        param.stride_h = param.stride_w = SH;
        param.pad_h = param.pad_w = FH / 2;
        param.sparse = param::Convolution::Sparse::GROUP;
        bench.set_param(param);
        bench.proxy()->target_execution_policy.algo.reset();
        TensorLayout src{{N, g, IH, IW}, dtype::Float32()},
                filter{{g, 1, 1, FH, FH}, dtype::Float32()};
        TensorLayout dst;
        {
            auto&& opr = handle_cuda()->create_operator<Convolution>();
            opr->param() = param;
            opr->deduce_layout(src, filter, dst);
        }
        auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS;
        float flo = 2.0 * N * g * dst[2] * dst[3] * FH * FH;
        printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(),
               filter.to_string().c_str(), dst.to_string().c_str());
        printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", time_ms_fp32,
               (flo / (time_ms_fp32 * 1e9)));
    };
    run(64, 384, 384, 32, 32, 31, 1, 15);
 }

 #endif

 #undef CUDNN_VERSION_STRING
--- a/dnn/test/cuda/cutlass_matmul.cpp
+++ b/dnn/test/cuda/cutlass_matmul.cpp
@@ -213,7 +213,7 @@ std::vector<BenchArgs> get_feat_model_args() {
    return args;
 }

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 std::vector<BenchArgs> get_f16_feat_model_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -367,7 +367,7 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
 #undef cb
 #undef MEGDNN_FOREACH_CUTLASS_KERNEL

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb)     \
    cb(1, 256, 128, 32, 64, 64, 32, 8, 8, 4); \
    cb(2, 128, 256, 32, 64, 64, 32, 8, 8, 4); \
@@ -403,7 +403,9 @@ MEGDNN_FOREACH_CUTLASS_KERNEL(cb)
 #undef cb

 #undef MEGDNN_FOREACH_CUTLASS_KERNEL
 #endif

 #if CUDA_VERSION >= 10020
 #define MEGDNN_FOREACH_CUTLASS_KERNEL(cb)      \
    cb(1, 256, 128, 32, 64, 64, 32, 16, 8, 8); \
    cb(2, 128, 256, 32, 64, 64, 32, 16, 8, 8); \
@@ -454,7 +456,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_MATMUL_FEAT) {
            dtype::Float32(), "CUTLASS_FLOAT32_SIMT");
 }

 #if CUDA_VERSION >= 10020
 #if CUDA_VERSION >= 10010
 TEST_F(CUDA, BENCHMARK_CUTLASS_F16_MATMUL_FEAT) {
    benchmark_matrix_mul(
            handle_cuda(), get_f16_feat_model_args(), dtype::Float16(),
--- a/dnn/test/cuda/padding.cpp
+++ b/dnn/test/cuda/padding.cpp
@@ -101,6 +101,36 @@ TEST_F(CUDA, PADDING_REFLECT2) {
                                                             4, 1, 6, 3, 6, 1, 6, 3})});
 }

 TEST_F(CUDA, PADDING_REFLECT2_QUANTIZED) {
    Checker<Padding> checker(handle_cuda(), false);
    param::Padding param;
    param.padding_mode = param::Padding::PaddingMode::REFLECT;
    param.front_offset_dim0 = 2;
    param.front_offset_dim1 = 1;
    param.front_offset_dim2 = 0;
    param.front_offset_dim3 = 0;
    param.front_offset_dim4 = 0;
    param.front_offset_dim5 = 0;
    param.front_offset_dim6 = 0;
    param.back_offset_dim0 = 0;
    param.back_offset_dim1 = 2;
    param.back_offset_dim2 = 0;
    param.back_offset_dim3 = 0;
    param.back_offset_dim4 = 0;
    param.back_offset_dim5 = 0;
    param.back_offset_dim6 = 0;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}),
                    {}},
            Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5,
                                                                    4, 5, 6, 5, 4, 2, 1,
                                                                    2, 3, 2, 1, 5, 4, 5,
                                                                    6, 5, 4, 8, 7, 8, 9,
                                                                    8, 7})});
 }

 TEST_F(CUDA, PADDING_REPLICATE) {
    Checker<Padding> checker(handle_cuda(), false);
    param::Padding param;
--- a/dnn/test/naive/padding.cpp
+++ b/dnn/test/naive/padding.cpp
@@ -83,6 +83,36 @@ TEST_F(NAIVE, PADDING_REFLECT) {
                            {10}, dtype::Float32(), {3, 2, 1, 2, 3, 4, 5, 4, 3, 2})});
 }

 TEST_F(NAIVE, PADDING_REFLECT2) {
    Checker<Padding> checker(handle(), false);
    param::Padding param;
    param.padding_mode = param::Padding::PaddingMode::REFLECT;
    param.front_offset_dim0 = 2;
    param.front_offset_dim1 = 1;
    param.front_offset_dim2 = 0;
    param.front_offset_dim3 = 0;
    param.front_offset_dim4 = 0;
    param.front_offset_dim5 = 0;
    param.front_offset_dim6 = 0;
    param.back_offset_dim0 = 0;
    param.back_offset_dim1 = 2;
    param.back_offset_dim2 = 0;
    param.back_offset_dim3 = 0;
    param.back_offset_dim4 = 0;
    param.back_offset_dim5 = 0;
    param.back_offset_dim6 = 0;
    checker.set_param(param).exect(
            Testcase{
                    TensorValue(
                            {3, 3}, dtype::QuantizedS8(), {1, 2, 3, 4, 5, 6, 7, 8, 9}),
                    {}},
            Testcase{{}, TensorValue({5, 6}, dtype::QuantizedS8(), {8, 7, 8, 9, 8, 7, 5,
                                                                    4, 5, 6, 5, 4, 2, 1,
                                                                    2, 3, 2, 1, 5, 4, 5,
                                                                    6, 5, 4, 8, 7, 8, 9,
                                                                    8, 7})});
 }

 TEST_F(NAIVE, PADDING_REPLICATE) {
    Checker<Padding> checker(handle(), false);
    param::Padding param;
--- a/imperative/python/megengine/core/tensor/indexing.py
+++ b/imperative/python/megengine/core/tensor/indexing.py
@@ -119,12 +119,16 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
                else 1
            )
    else:
        if ndim_indexed > inp.ndim:
            raise IndexError(
                "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format(
                    inp.ndim, len(tuple_val)
        try:
            if ndim_indexed > inp.ndim:
                raise IndexError(
                    "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format(
                        inp.ndim, len(tuple_val)
                    )
                )
            )
        except ValueError:
            # ignore
            pass

    tuple_val = remove_ellipsis(inp, tuple_val)
    use_subtensor = True
--- a/imperative/python/megengine/module/conv.py
+++ b/imperative/python/megengine/module/conv.py
@@ -18,6 +18,7 @@ from ..functional import (
    conv_transpose3d,
    deformable_conv2d,
    local_conv2d,
    pad,
    relu,
 )
 from ..tensor import Parameter
@@ -126,7 +127,7 @@ class Conv1d(_ConvNd):
        kernel_size: size of weight on spatial dimensions.
        stride: stride of the 1D convolution operation.
        padding: size of the paddings added to the input on both sides of its
            spatial dimensions. Only zero-padding is supported. Default: 0
            spatial dimensions. Default: 0
        dilation: dilation of the 1D convolution operation. Default: 1
        groups: number of groups to divide input and output channels into,
            so as to perform a "grouped convolution". When ``groups`` is not 1,
@@ -139,6 +140,8 @@ class Conv1d(_ConvNd):
            placed on the precision of intermediate results. When set to "float32",
            "float32" would be used for accumulator and intermediate result, but only
            effective when input and output are of float16 dtype.
        padding_mode: "zeros", "reflect" or "replicate". Default: "zeros".
            Refer to :class:`~.module.padding.Pad` for more information.

    Note:
        * ``weight`` usually has shape ``(out_channels, in_channels, kernel_size)`` ,
@@ -177,6 +180,7 @@ class Conv1d(_ConvNd):
        bias: bool = True,
        conv_mode: str = "cross_correlation",
        compute_mode: str = "default",
        padding_mode: str = "zeros",
        **kwargs
    ):
        kernel_size = kernel_size
@@ -185,6 +189,7 @@ class Conv1d(_ConvNd):
        dilation = dilation
        self.conv_mode = conv_mode
        self.compute_mode = compute_mode
        self.padding_mode = padding_mode
        super().__init__(
            in_channels,
            out_channels,
@@ -223,7 +228,27 @@ class Conv1d(_ConvNd):
        # Assume format is NCH(W=1)
        return (1, self.out_channels, 1)

    def get_pad_witdth(self):
        return ((0, 0), (0, 0), (self.padding, self.padding))

    def calc_conv(self, inp, weight, bias):
        assert self.padding_mode in [
            "zeros",
            "reflect",
            "replicate",
        ]
        if self.padding_mode != "zeros":
            return conv1d(
                pad(inp, self.get_pad_witdth(), self.padding_mode),
                weight,
                bias,
                self.stride,
                0,
                self.dilation,
                self.groups,
                self.conv_mode,
                self.compute_mode,
            )
        return conv1d(
            inp,
            weight,
@@ -287,7 +312,7 @@ class Conv2d(_ConvNd):
            ``(kernel_size, kernel_size)``.
        stride: stride of the 2D convolution operation. Default: 1
        padding: size of the paddings added to the input on both sides of its
            spatial dimensions. Only zero-padding is supported. Default: 0
            spatial dimensions. Default: 0
        dilation: dilation of the 2D convolution operation. Default: 1
        groups: number of groups into which the input and output channels are divided,
            so as to perform a ``grouped convolution``. When ``groups`` is not 1,
@@ -300,6 +325,8 @@ class Conv2d(_ConvNd):
            placed on the precision of intermediate results. When set to "float32",
            "float32" would be used for accumulator and intermediate result, but only
            effective when input and output are of float16 dtype.
        padding_mode: "zeros", "reflect" or "replicate". Default: "zeros".
            Refer to :class:`~.module.padding.Pad` for more information.

    Note:
        * ``weight`` usually has shape ``(out_channels, in_channels, height, width)`` ,
@@ -338,6 +365,7 @@ class Conv2d(_ConvNd):
        bias: bool = True,
        conv_mode: str = "cross_correlation",
        compute_mode: str = "default",
        padding_mode: str = "zeros",
        **kwargs
    ):
        kernel_size = _pair_nonzero(kernel_size)
@@ -346,6 +374,7 @@ class Conv2d(_ConvNd):
        dilation = _pair_nonzero(dilation)
        self.conv_mode = conv_mode
        self.compute_mode = compute_mode
        self.padding_mode = padding_mode
        super().__init__(
            in_channels,
            out_channels,
@@ -384,7 +413,32 @@ class Conv2d(_ConvNd):
        # Assume format is NCHW
        return (1, self.out_channels, 1, 1)

    def get_pad_witdth(self):
        return (
            (0, 0),
            (0, 0),
            (self.padding[0], self.padding[0]),
            (self.padding[1], self.padding[1]),
        )

    def calc_conv(self, inp, weight, bias):
        assert self.padding_mode in [
            "zeros",
            "reflect",
            "replicate",
        ]
        if self.padding_mode != "zeros":
            return conv2d(
                pad(inp, self.get_pad_witdth(), self.padding_mode),
                weight,
                bias,
                self.stride,
                0,
                self.dilation,
                self.groups,
                self.conv_mode,
                self.compute_mode,
            )
        return conv2d(
            inp,
            weight,
--- a/imperative/python/megengine/module/conv_bn.py
+++ b/imperative/python/megengine/module/conv_bn.py
@@ -30,6 +30,7 @@ class _ConvBnActivation2d(Module):
        momentum=0.9,
        affine=True,
        track_running_stats=True,
        padding_mode: str = "zeros",
        **kwargs
    ):
        super().__init__(**kwargs)
@@ -44,6 +45,7 @@ class _ConvBnActivation2d(Module):
            bias,
            conv_mode,
            compute_mode,
            padding_mode,
            **kwargs,
        )
        self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats)
--- a/imperative/python/megengine/module/module.py
+++ b/imperative/python/megengine/module/module.py
@@ -138,11 +138,7 @@ class Module(metaclass=ABCMeta):
        return HookHandler(self._forward_hooks, hook)

    def __call__(self, *inputs, **kwargs):
        AutoNaming.push_scope(
            self.name
            if self.name is not None
            else (self._short_name if hasattr(self, "_short_name") else self._name)
        )
        AutoNaming.push_scope(self.name if self.name is not None else self._short_name)
        for hook in self._forward_pre_hooks.values():
            modified_inputs = hook(self, inputs)
            if modified_inputs is not None:
@@ -685,6 +681,12 @@ class Module(metaclass=ABCMeta):
            set_name(self, prefix, k, v)
        super().__setattr__(name, value)

    def __setstate__(self, state):
        if "_short_name" not in state:
            state["_short_name"] = state["_name"]
            state["_name"] = None
        self.__dict__.update(state)

    def __delattr__(self, name: str):
        if name in self.__dict__ and _is_module(self.__dict__[name]):
            modules = self.__dict__.get("_modules")
--- a/imperative/python/megengine/module/qat/conv.py
+++ b/imperative/python/megengine/module/qat/conv.py
@@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QATModule):
            float_module.bias is not None,
            float_module.conv_mode,
            float_module.compute_mode,
            float_module.padding_mode,
            name=float_module.name,
        )
        qat_module.weight = float_module.weight
--- a/imperative/python/megengine/module/qat/conv_bn.py
+++ b/imperative/python/megengine/module/qat/conv_bn.py
@@ -147,6 +147,7 @@ class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule):
            float_module.conv.bias is not None,
            float_module.conv.conv_mode,
            float_module.conv.compute_mode,
            padding_mode=float_module.conv.padding_mode,
            name=float_module.name,
        )
        qat_module.conv.weight = float_module.conv.weight
--- a/imperative/python/megengine/module/quantized/conv.py
+++ b/imperative/python/megengine/module/quantized/conv.py
@@ -11,7 +11,7 @@ import numpy as np

 from ... import module as Float
 from ...core.tensor import dtype
 from ...functional.nn import conv_bias_activation
 from ...functional.nn import conv_bias_activation, pad
 from ...functional.quantized import conv_transpose2d
 from ...tensor import Parameter
 from ..qat import conv as QAT
@@ -38,6 +38,7 @@ class Conv2d(Float.Conv2d, QuantizedModule):
        conv_mode: str = "cross_correlation",
        compute_mode: str = "default",
        dtype=None,
        padding_mode: str = "zeros",
        **kwargs
    ):
        super().__init__(
@@ -51,13 +52,33 @@ class Conv2d(Float.Conv2d, QuantizedModule):
            True,
            conv_mode,
            compute_mode,
            padding_mode,
        )
        self.output_dtype = dtype

    def calc_conv_quantized(self, inp, nonlinear_mode="identity"):
        assert self.padding_mode in [
            "zeros",
            "reflect",
            "replicate",
        ]
        inp_scale = dtype.get_scale(inp.dtype)
        w_scale = dtype.get_scale(self.weight.dtype)
        bias_scale = inp_scale * w_scale
        if self.padding_mode != "zeros":
            return conv_bias_activation(
                pad(inp, self.get_pad_witdth(), self.padding_mode),
                self.weight,
                self.bias.astype(dtype.qint32(bias_scale)),
                self.output_dtype,
                self.stride,
                0,
                self.dilation,
                self.groups,
                conv_mode=self.conv_mode,
                compute_mode=self.compute_mode,
                nonlinear_mode=nonlinear_mode,
            )
        return conv_bias_activation(
            inp,
            self.weight,
@@ -88,6 +109,7 @@ class Conv2d(Float.Conv2d, QuantizedModule):
            qat_module.dilation,
            qat_module.groups,
            dtype=output_dtype,
            padding_mode=qat_module.padding_mode,
            name=qat_module.name,
        )
        weight = qat_module.weight.astype(qat_module.get_weight_dtype())
--- a/imperative/python/megengine/module/quantized/conv_bn.py
+++ b/imperative/python/megengine/module/quantized/conv_bn.py
@@ -31,6 +31,7 @@ class _ConvBnActivation2d(Conv2d):
            qat_module.conv.groups,
            dtype=output_dtype,
            name=qat_module.name,
            padding_mode=qat_module.conv.padding_mode,
        )
        w_fold, b_fold = qat_module.fold_weight_bias(
            qat_module.bn.running_mean, qat_module.bn.running_var
--- a/imperative/python/megengine/traced_module/compat.py
+++ b/imperative/python/megengine/traced_module/compat.py
@@ -126,6 +126,9 @@ def convbn2d_module_loader(expr):
        module = expr.inputs[0].owner
        if not hasattr(module.bn, "param_dim"):
            module.bn.param_dim = "dim_1c11"
    module = expr.inputs[0].owner
    if not hasattr(module.conv, "padding_mode"):
        module.conv.padding_mode = "zeros"


@register_opdef_loader(BatchNorm)
@@ -162,3 +165,36 @@ def tensor_gen_func_loader(expr):
        else:
            device = None
        expr.set_args_kwargs(shape, dtype=dtype, device=device)


@register_functional_loader(("megengine.functional.nn", "pad"))
 def pad_func_loader(expr):
    if "pad_witdth" in expr.kwargs:
        kwargs = expr.kwargs
        kwargs["pad_width"] = kwargs.pop("pad_witdth")
        expr.set_args_kwargs(*expr.args, **kwargs)


@register_module_loader(
    ("megengine.module.conv", "Conv1d"),
    ("megengine.module.conv", "Conv2d"),
    ("megengine.module.conv", "ConvRelu2d"),
    ("megengine.module.qat.conv", "Conv2d"),
    ("megengine.module.qat.conv", "ConvRelu2d"),
    ("megengine.module.quantized.conv", "Conv2d"),
    ("megengine.module.quantized.conv", "ConvRelu2d"),
 )
 def conv2d_module_loader(expr):
    module = expr.inputs[0].owner
    if not hasattr(module, "padding_mode"):
        module.padding_mode = "zeros"


@register_module_loader(
    ("megengine.module.quantized.conv_bn", "ConvBn2d"),
    ("megengine.module.quantized.conv_bn", "ConvBnRelu2d"),
 )
 def quantized_convbn2d_module_loader(expr):
    module = expr.inputs[0].owner
    if not hasattr(module, "padding_mode"):
        module.padding_mode = "zeros"
--- a/imperative/python/megengine/traced_module/serialization.py
+++ b/imperative/python/megengine/traced_module/serialization.py
@@ -50,7 +50,7 @@ class _ModuleState:
        if self.obj is None:
            typem = getattr(import_module(self.module[0]), self.module[1])
            m_obj = typem.__new__(typem)
            m_obj.__dict__.update(self.state)
            m_obj.__setstate__(self.state)
            self.obj = m_obj
        return self.obj

--- a/imperative/python/megengine/traced_module/traced_module.py
+++ b/imperative/python/megengine/traced_module/traced_module.py
@@ -1681,11 +1681,13 @@ class TracedModuleBuilder(NodeMixin):

                if isinstance(wrapped, TracedModuleBuilder):
                    if not isinstance(mod_attr, (List, Dict, QATModule)):
                        assert mod_attr is wrapped._mod
                else:
                        assert (
                            mod_attr is wrapped._mod
                        ), "TracedModule do not support modify module attributes, please check your code."
                if isinstance(wrapped, RawTensor):
                    assert (
                        mod_attr is wrapped
                    ), "TracedModule do not support modify attributes, please check your code."
                    ), "TracedModule do not support modify tensor attributes, please check your code."

                if isinstance(wrapped, (NodeMixin, RawTensor)):
                    NodeMixin.wrap(
@@ -2296,7 +2298,7 @@ class TracedModule(Module):
        for k, v in state.items():
            if isinstance(v, _ModuleState):
                state[k] = v.to_module()
        self.__dict__.update(state)
        super().__setstate__(state)
        self._update_ref()

        for _, graph in self.argdef_graph_map.items():
--- a/imperative/python/src/tensor.cpp
+++ b/imperative/python/src/tensor.cpp
@@ -272,16 +272,12 @@ PyObject* TensorWrapper::device() {

 PyObject* TensorWrapper::numpy() {
    auto hv = m_tensor->numpy();
    // if (!hv) {
    //     PyErr_SetString(PyExc_ValueError, "tensor invalid");
    //     return nullptr;
    // }
    auto arr = py::reinterpret_steal<py::array>(
            npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE));
    if (!arr) {
    if (!hv) {
        PyErr_SetString(PyExc_ValueError, "tensor invalid");
        return nullptr;
    }
    auto arr = py::reinterpret_steal<py::array>(
            npy::ndarray_from_tensor(hv->as_nd(true), npy::ShareType::TRY_SHARE));
    if (hv->shape().is_scalar()) {
        mgb_assert(PyArray_Check(arr.ptr()));
        return PyArray_Squeeze(reinterpret_cast<PyArrayObject*>(arr.ptr()));
--- a/imperative/python/test/unit/core/test_indexing_op.py
+++ b/imperative/python/test/unit/core/test_indexing_op.py
@@ -7,6 +7,8 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import collections
 import platform
 from tempfile import NamedTemporaryFile

 import numpy as np
 import pytest
@@ -16,6 +18,8 @@ import megengine
 import megengine.core.tensor.megbrain_graph as G
 import megengine.functional as F
 import megengine.jit as jit
 import megengine.random as rand
 import megengine.utils.comp_graph_tools as cgtools
 from megengine.core._imperative_rt.core2 import apply
 from megengine.core._trace_option import use_symbolic_shape
 from megengine.core.ops import builtin
@@ -724,3 +728,26 @@ def test_nd_int_indexing(symbolic):
            np.testing.assert_equal(out.numpy(), npy_out)

    run_test([inp, idx], lambda inp, idx: inp[idx])


@pytest.mark.skipif(
    platform.system() == "Windows", reason="windows temp file issue, fixme later"
 )
 def test_subtensor_when_shape_invalid():
    @jit.trace(symbolic=True, capture_as_const=True)
    def fun(inp):
        shape = inp.shape
        H = shape[-1]
        NH = H * 8 + 4
        arr = F.arange(4, NH, 8)
        arr_shape = arr.shape
        return arr_shape[0]

    inp = rand.uniform(size=[1, 3, 224, 224])
    fun(inp)

    with NamedTemporaryFile() as f:
        fun.dump(f.name, arg_names=["data"], optimize_for_inference=True)
        inp = rand.uniform(size=[1, 3, 512, 512])
        net = cgtools.GraphInference(f.name)
        net.run(inp_dict={"data": inp})
--- a/imperative/python/test/unit/core/test_serialization.py
+++ b/imperative/python/test/unit/core/test_serialization.py
@@ -87,3 +87,17 @@ def test_compatibility():

    test_old_tensor("tensor_v1_1.mge")
    test_old_tensor("tensor_v1_2.mge")

    t = mge.tensor([1])
    getattr(t, "qparams")
    new_args = t.__getnewargs__()
    assert (
        len(new_args) == 3
        and isinstance(new_args[0], np.ndarray)
        and new_args[1] == np.int32
        and isinstance(new_args[2], str)
    ), "Modify Tensor __getnewargs__ may break pickle serialization compatible"
    state = t.__getstate__()
    assert set(state.keys()) == set(
        ["qparams"]
    ), "Modify Tensor __getstate__ may break pickle serialization compatible"
--- a/imperative/python/test/unit/module/test_module.py
+++ b/imperative/python/test/unit/module/test_module.py
@@ -681,3 +681,27 @@ def test_repr_module_reset_attr():
    m1 = ResetAttrModule(False)
    output = [m0.__repr__(), m1.__repr__()]
    assert output == ground_truth


 def test_module_compatible():
    class Empty(Module):
        def forward(self):
            pass

    empty_module = Empty()
    old_attributes = set(
        [
            "_modules",
            "name",
            "training",
            "quantize_disabled",
            "_forward_pre_hooks",
            "_forward_hooks",
            "_name",
            "_short_name",
        ]
    )
    current_attributes = set(empty_module.__dict__.keys())
    assert (
        old_attributes == current_attributes
    ), "Add or delete attributes in Module class may break compatibility of pickle serialization"
--- a/imperative/python/test/unit/module/test_qat.py
+++ b/imperative/python/test/unit/module/test_qat.py
@@ -60,7 +60,18 @@ def test_qat_convbn2d():
        )


 def test_qat_conv():
@pytest.mark.parametrize(
    "padding, padding_mode",
    [
        (0, "zeros"),
        ((1, 2), "zeros"),
        (3, "reflect"),
        ((1, 2), "reflect"),
        (4, "replicate"),
        ((1, 2), "replicate"),
    ],
 )
 def test_qat_conv(padding, padding_mode):

    in_channels = 32
    out_channels = 64
@@ -72,7 +83,13 @@ def test_qat_conv():
            self.quant = QuantStub()
            self.dequant = DequantStub()
            self.conv = Conv2d(
                in_channels, out_channels, kernel_size, groups=groups, bias=bias
                in_channels,
                out_channels,
                kernel_size,
                groups=groups,
                bias=bias,
                padding=padding,
                padding_mode=padding_mode,
            )
            self.conv_relu = ConvRelu2d(
                out_channels, in_channels, kernel_size, groups=groups, bias=bias
--- a/imperative/python/test/unit/quantization/test_module.py
+++ b/imperative/python/test/unit/quantization/test_module.py
@@ -236,11 +236,16 @@ def test_linear():


@pytest.mark.parametrize("module", ["Conv2d", "ConvBn2d", "ConvBnRelu2d"])
 def test_conv(module):
    normal_net = getattr(Float, module)(3, 3, 3, 1, 1, 1, bias=True)
@pytest.mark.parametrize("padding_mode", ["zeros", "reflect", "replicate"])
 def test_conv(module, padding_mode):
    normal_net = getattr(Float, module)(
        3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode
    )
    normal_net.eval()

    qat_net = getattr(QAT, module)(3, 3, 3, 1, 1, 1, bias=True)
    qat_net = getattr(QAT, module)(
        3, 3, 3, 1, 1, 1, bias=True, padding_mode=padding_mode
    )
    qat_net.eval()
    disable_observer(qat_net)

--- a/imperative/src/impl/transformations/scalar.cpp
+++ b/imperative/src/impl/transformations/scalar.cpp
@@ -51,6 +51,7 @@ bool is_scalar_shape(ValueRef shape) {
    if (shape.is<ScalarValue>()) {
        return false;
    }
    // may have performance issue
    auto shape_of_shape = shape.shape();
    if (!shape_of_shape) {
        // assume not scalar
@@ -211,14 +212,21 @@ std::vector<ValueRef> subtensor_rule(
        const Subtensor& subtensor, Span<ValueRef> inputs) {
    mgb_assert(inputs.size() >= 1);
    auto input = inputs[0];
    size_t ndim = input.is<ScalarValue>() ? 0 : input.shape()->ndim;
    for (auto&& [axis, begin, end, step, idx] : subtensor.items) {
        if (idx) {
            ndim--;
    bool is_scalar;
    mgb_assert(!input.is<ScalarValue>(), "subtensor shouldn't have scalar input");
    if (auto shape = input.shape()) {
        size_t ndim = input.shape()->ndim;
        for (auto&& [axis, begin, end, step, idx] : subtensor.items) {
            if (idx) {
                ndim--;
            }
        }
        is_scalar = ndim == 0;
    } else {
        is_scalar = false;
    }
    auto output = imperative::apply(subtensor, unwrap_inputs(inputs))[0];
    if (!ndim) {
    if (is_scalar) {
        return {ScalarValue::make(output)};
    } else {
        return {output};
@@ -261,8 +269,7 @@ std::vector<ValueRef> fastpath_copy_rule(

 std::vector<ValueRef> reshape_rule(const Reshape& reshape, Span<ValueRef> inputs) {
    mgb_assert(inputs.size() == 2);
    bool is_scalar =
            (!inputs[1].is<ScalarValue>()) && *inputs[1].shape() == ValueShape{0};
    bool is_scalar = is_scalar_shape(inputs[1]);
    auto unwrapped_input = inputs[0].is<ScalarValue>()
                                 ? inputs[0].cast<ScalarValue>().value()
                                 : inputs[0];
--- a/src/core/include/megbrain/version.h
+++ b/src/core/include/megbrain/version.h
@@ -15,7 +15,7 @@

 #define MGE_MAJOR 1
 #define MGE_MINOR 8
 #define MGE_PATCH 0
 #define MGE_PATCH 2

 // for rc version, could be like "rc1", "rc2", etc
 #define MGE_EXTRA_NAME ""
--- a/src/megbrain_build_config.h.in
+++ b/src/megbrain_build_config.h.in
@@ -57,6 +57,7 @@
 #cmakedefine01 MEGDNN_64_BIT
 #cmakedefine01 MEGDNN_THREADS_512
 #cmakedefine01 MEGDNN_ENABLE_MULTI_THREADS
 #cmakedefine01 MEGDNN_WITH_BENCHMARK

 // whether atlas is available
 #ifndef MGB_ATLAS
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
 Subproject commit 31798848e40c2752d4b3db193491a63b77455029
 Subproject commit 5a639171ed8489d877a56bfd28cfd9e6df8c5403
Author	SHA1	Message	Date
megvii-mge	c42ce93705	feat(mge/third_party): update cutlass version	3 years ago
温娟	9902ccfcb0	chore(release): bump version	3 years ago
Megvii Engine Team	8e5410e41f	feat(cuda): add fp16 compute 16 kernel GitOrigin-RevId: `e03435be02`	3 years ago
Megvii Engine Team	472e2f9655	refactor(cuda): depthwish large kernel GitOrigin-RevId: `dade8710b4`	3 years ago
Megvii Engine Team	e698ec20c2	feat(cuda): float16 depthwise large kernel conv compute fp32 GitOrigin-RevId: `3050d48f26`	3 years ago
Megvii Engine Team	48406382ce	feat(cuda): support float16 depthwise large kernel conv GitOrigin-RevId: `fdc1b15fbc`	4 years ago
Megvii Engine Team	7042f76b34	perf(cuda): speedup conv backward data with small feature map and large filter size GitOrigin-RevId: `85592bca6b`	4 years ago
Megvii Engine Team	87a2aeebb1	perf(cuda): speedup chanwise conv with small feature map and large filter size GitOrigin-RevId: `e65b2ce856`	4 years ago
Megvii Engine Team	2293385e93	feat(mge): add conv padding mode GitOrigin-RevId: `147ced856e`	4 years ago
Megvii Engine Team	afe9c4b50d	feat(dnn/cuda): add implicit bmm kernels for large kernel depthwise convolution backward filter opr GitOrigin-RevId: `932e7689e8`	4 years ago
Megvii Engine Team	e8a169292f	feat(dnn/cuda): add heuristic rule for implicit batched gemm large kernel dwconv2d kernels GitOrigin-RevId: `2d2c213bfd`	4 years ago
Megvii Engine Team	38067472d2	fix(dnn/cuda): fix ci GitOrigin-RevId: `8267e5f9dd`	4 years ago
Megvii Engine Team	1da58ae17a	feat(dnn/cuda): add implicit bmm large kernel dwconv2d dgrad kernels GitOrigin-RevId: `fcb7974d62`	4 years ago
Megvii Engine Team	96050073a2	feat(dnn/cuda): add implicit bmm large kernel dwconv2d fprop impl GitOrigin-RevId: `feb09ebb58`	4 years ago
温娟	19fe2e94e7	chore(release): bump version	4 years ago
Megvii Engine Team	1add4517ad	test(trace): test subtensor on unknown shape GitOrigin-RevId: `1b5cfa4e0a`	4 years ago
Megvii Engine Team	54eef55871	fix(trace): assume result is not scalar when shape is valid GitOrigin-RevId: `beee2d0f28`	4 years ago
Megvii Engine Team	84d99d1cc4	fix(traced_module): fix Module compatible issue and traced module getattr check GitOrigin-RevId: `62eb3bfb10`	4 years ago