GitOrigin-RevId: 0da9b3bca8
tags/v1.11.1
| @@ -19,6 +19,7 @@ genrule( | |||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_dgrad --type tensorop884 $(@D) | |||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type simt $(@D) | |||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations dwconv2d_wgrad --type tensorop884 $(@D) | |||
| CUTLASS_WITH_LONG_PATH=true python3 $$GEN --operations rrconv2d_wgrad --type simt $(@D) | |||
| """, | |||
| tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], | |||
| visibility = ["//visibility:public"], | |||
| @@ -35,6 +35,8 @@ class Conv2dOperation: | |||
| without_shared_load=False, | |||
| required_cuda_ver_major=9, | |||
| required_cuda_ver_minor=2, | |||
| rin=None, | |||
| rout=None, | |||
| ): | |||
| self.operation_kind = OperationKind.Conv2d | |||
| @@ -54,6 +56,8 @@ class Conv2dOperation: | |||
| self.without_shared_load = without_shared_load | |||
| self.required_cuda_ver_major = required_cuda_ver_major | |||
| self.required_cuda_ver_minor = required_cuda_ver_minor | |||
| self.rin = rin | |||
| self.rout = rout | |||
| # | |||
| def accumulator_type(self): | |||
| @@ -95,6 +99,8 @@ class Conv2dOperation: | |||
| conv_type_name = "" | |||
| if self.conv_type == ConvType.DepthwiseConvolution: | |||
| conv_type_name = "dw" | |||
| elif self.conv_type == ConvType.RegionRestrictedConvolution: | |||
| conv_type_name = "rr" | |||
| return "%s%s%s%s%s%s%s_%s" % ( | |||
| ShortDataTypeNames[self.accumulator_type()], | |||
| @@ -125,6 +131,9 @@ class Conv2dOperation: | |||
| elif self.src.element == self.flt.element: | |||
| extended_name = "${core_name}_${element_src}" | |||
| if self.rin is not None: | |||
| extended_name += "_${element_rin}" | |||
| extended_name = SubstituteTemplate( | |||
| extended_name, | |||
| { | |||
| @@ -132,6 +141,7 @@ class Conv2dOperation: | |||
| "element_flt": DataTypeNames[self.flt.element], | |||
| "element_dst": DataTypeNames[self.dst.element], | |||
| "core_name": self.core_name(), | |||
| "element_rin": DataTypeNames[self.rin.element], | |||
| }, | |||
| ) | |||
| @@ -512,6 +522,115 @@ using Convolution_${operation_name} = | |||
| return SubstituteTemplate(self.template, values) | |||
| class EmitRegionRestrictedConvolutionBackwardFilterInstance: | |||
| def __init__(self): | |||
| self.template = """ | |||
| // kernel instance "${operation_name}" generated by cutlass generator | |||
| using Convolution_${operation_name} = | |||
| typename cutlass::conv::device::RegionRestrictedConvolutionBackwardFilter< | |||
| ${element_src}, | |||
| ${layout_src}, | |||
| ${element_diff}, | |||
| ${layout_diff}, | |||
| ${element_src_mask}, | |||
| ${layout_src_mask}, | |||
| ${element_output_mask}, | |||
| ${layout_output_mask}, | |||
| ${element_grad}, | |||
| ${layout_grad}, | |||
| ${element_accumulator}, | |||
| ${conv_type}, | |||
| ${opcode_class}, | |||
| ${arch}, | |||
| cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, | |||
| cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, | |||
| cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, | |||
| ${epilogue_functor}< | |||
| ${element_grad}, | |||
| ${epilogue_vector_length}, | |||
| ${element_accumulator}, | |||
| ${element_epilogue} | |||
| >, | |||
| ${swizzling_functor}, | |||
| ${stages}, | |||
| ${alignment_src}, | |||
| ${alignment_diff}, | |||
| ${alignment_src_mask}, | |||
| ${alignment_output_mask}, | |||
| ${special_optimization}, | |||
| ${math_operator}, | |||
| ${implicit_gemm_mode}>; | |||
| """ | |||
| def emit(self, operation): | |||
| warp_shape = [ | |||
| int( | |||
| operation.tile_description.threadblock_shape[idx] | |||
| / operation.tile_description.warp_count[idx] | |||
| ) | |||
| for idx in range(3) | |||
| ] | |||
| epilogue_vector_length = int( | |||
| min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) | |||
| / DataTypeSize[operation.dst.element] | |||
| ) | |||
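| # number of dst elements per vectorized epilogue access, capped at a 128-bit load/store; | |||
| # e.g. f32 (32-bit) with alignment 4 gives min(4 * 32, 128) / 32 = 4 elements per access | |||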
| values = { | |||
| "operation_name": operation.procedural_name(), | |||
| "conv_type": ConvTypeTag[operation.conv_type], | |||
| "element_src": DataTypeTag[operation.src.element], | |||
| "layout_src": LayoutTag[operation.src.layout], | |||
| "element_diff": DataTypeTag[operation.flt.element], | |||
| "layout_diff": LayoutTag[operation.flt.layout], | |||
| "element_src_mask": DataTypeTag[operation.rin.element], | |||
| "layout_src_mask": LayoutTag[operation.rin.layout], | |||
| "element_output_mask": DataTypeTag[operation.rout.element], | |||
| "layout_output_mask": LayoutTag[operation.rout.layout], | |||
| "element_grad": DataTypeTag[operation.dst.element], | |||
| "layout_grad": LayoutTag[operation.dst.layout], | |||
| "element_accumulator": DataTypeTag[operation.accumulator_type()], | |||
| "opcode_class": OpcodeClassTag[ | |||
| operation.tile_description.math_instruction.opcode_class | |||
| ], | |||
| "arch": "cutlass::arch::Sm%d" % operation.arch, | |||
| "threadblock_shape_m": str(operation.tile_description.threadblock_shape[0]), | |||
| "threadblock_shape_n": str(operation.tile_description.threadblock_shape[1]), | |||
| "threadblock_shape_k": str(operation.tile_description.threadblock_shape[2]), | |||
| "warp_shape_m": str(warp_shape[0]), | |||
| "warp_shape_n": str(warp_shape[1]), | |||
| "warp_shape_k": str(warp_shape[2]), | |||
| "instruction_shape_m": str( | |||
| operation.tile_description.math_instruction.instruction_shape[0] | |||
| ), | |||
| "instruction_shape_n": str( | |||
| operation.tile_description.math_instruction.instruction_shape[1] | |||
| ), | |||
| "instruction_shape_k": str( | |||
| operation.tile_description.math_instruction.instruction_shape[2] | |||
| ), | |||
| "epilogue_vector_length": str(epilogue_vector_length), | |||
| "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor], | |||
| "element_epilogue": str(DataTypeTag[operation.element_epilogue]), | |||
| "swizzling_functor": SwizzlingFunctorTag[operation.swizzling_functor], | |||
| "stages": str(operation.tile_description.stages), | |||
| "alignment_src": str(operation.src.alignment), | |||
| "alignment_diff": str(operation.flt.alignment), | |||
| "alignment_src_mask": str(operation.rin.alignment), | |||
| "alignment_output_mask": str(operation.rout.alignment), | |||
| "special_optimization": SpecialOptimizeDescTag[ | |||
| operation.special_optimization | |||
| ], | |||
| "math_operator": MathOperationTag[ | |||
| operation.tile_description.math_instruction.math_operation | |||
| ], | |||
| "implicit_gemm_mode": ImplicitGemmModeTag[operation.implicit_gemm_mode], | |||
| } | |||
| return SubstituteTemplate(self.template, values) | |||
| ################################################################################################### | |||
| # | |||
| # Generator functions for all layouts | |||
| @@ -540,7 +659,10 @@ def GenerateConv2d( | |||
| operations = [] | |||
| element_epilogue = DataType.f32 | |||
| if conv_type == ConvType.DepthwiseConvolution: | |||
| if ( | |||
| conv_type == ConvType.DepthwiseConvolution | |||
| or conv_type == ConvType.RegionRestrictedConvolution | |||
| ): | |||
| if conv_kind == ConvKind.Fprop: | |||
| swizzling_functor = SwizzlingFunctor.DepthwiseConvolutionFprop | |||
| elif conv_kind == ConvKind.Dgrad: | |||
| @@ -680,6 +802,16 @@ def GenerateConv2d( | |||
| flt_layout, | |||
| int(flt_align / DataTypeSize[tile.math_instruction.element_a]), | |||
| ) | |||
| rin = TensorDescription( | |||
| tile.math_instruction.element_rin, | |||
| src_layout, | |||
| int(src_align / DataTypeSize[tile.math_instruction.element_rin]), | |||
| ) | |||
| rout = TensorDescription( | |||
| tile.math_instruction.element_rout, | |||
| dst_layout, | |||
| int(dst_align / DataTypeSize[tile.math_instruction.element_rout]), | |||
| ) | |||
| bias = TensorDescription( | |||
| bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])) | |||
| ) | |||
| @@ -704,6 +836,8 @@ def GenerateConv2d( | |||
| without_shared_load, | |||
| required_cuda_ver_major, | |||
| required_cuda_ver_minor, | |||
| rin, | |||
| rout, | |||
| ) | |||
| operations.append(new_operation) | |||
| if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt: | |||
| @@ -724,6 +858,8 @@ def GenerateConv2d( | |||
| without_shared_load, | |||
| required_cuda_ver_major, | |||
| required_cuda_ver_minor, | |||
| rin, | |||
| rout, | |||
| ) | |||
| operations.append(new_operation) | |||
| return operations | |||
| @@ -955,5 +1091,89 @@ void initialize_${operation_name}(Manifest &manifest) { | |||
| self.kernel_file.close() | |||
| class EmitRegionRestrictedConvSingleKernelWrapper: | |||
| def __init__(self, kernel_path, operation, short_path=False): | |||
| self.kernel_path = kernel_path | |||
| self.operation = operation | |||
| self.short_path = short_path | |||
| # Currently only wgrad is supported | |||
| assert self.operation.conv_kind == ConvKind.Wgrad | |||
| self.instance_emitter = EmitRegionRestrictedConvolutionBackwardFilterInstance() | |||
| self.convolution_name = "RegionRestrictedConvolutionBackwardFilterOperation" | |||
| self.header_template = """ | |||
| #if __CUDACC_VER_MAJOR__ > ${required_cuda_ver_major} || (__CUDACC_VER_MAJOR__ == ${required_cuda_ver_major} && __CUDACC_VER_MINOR__ >= ${required_cuda_ver_minor}) | |||
| // ignore warning of cutlass | |||
| #pragma GCC diagnostic push | |||
| #pragma GCC diagnostic ignored "-Wunused-parameter" | |||
| #pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
| #pragma GCC diagnostic ignored "-Wuninitialized" | |||
| #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" | |||
| #include "cutlass/convolution/device/convolution.h" | |||
| #include "src/cuda/cutlass/manifest.h" | |||
| #include "src/cuda/cutlass/convolution_operation.h" | |||
| """ | |||
| self.instance_template = """ | |||
| ${operation_instance} | |||
| """ | |||
| self.manifest_template = """ | |||
| namespace cutlass { | |||
| namespace library { | |||
| void initialize_${operation_name}(Manifest &manifest) { | |||
| manifest.append(new ${convolution_name}<Convolution_${operation_name}>( | |||
| "${operation_name}" | |||
| )); | |||
| } | |||
| } // namespace library | |||
| } // namespace cutlass | |||
| """ | |||
| self.epilogue_template = """ | |||
| #pragma GCC diagnostic pop | |||
| #endif | |||
| """ | |||
| # | |||
| def __enter__(self): | |||
| if self.short_path: | |||
| self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % GlobalCnt.cnt) | |||
| GlobalCnt.cnt += 1 | |||
| else: | |||
| self.kernel_path = os.path.join( | |||
| self.kernel_path, "%s.cu" % self.operation.procedural_name() | |||
| ) | |||
| self.kernel_file = open(self.kernel_path, "w") | |||
| return self | |||
| # | |||
| def emit(self): | |||
| self.kernel_file.write( | |||
| SubstituteTemplate( | |||
| self.instance_template, | |||
| {"operation_instance": self.instance_emitter.emit(self.operation)}, | |||
| ) | |||
| ) | |||
| # emit manifest helper | |||
| manifest = SubstituteTemplate( | |||
| self.manifest_template, | |||
| { | |||
| "operation_name": self.operation.procedural_name(), | |||
| "convolution_name": self.convolution_name, | |||
| }, | |||
| ) | |||
| self.kernel_file.write(manifest) | |||
| # | |||
| def __exit__(self, exception_type, exception_value, traceback): | |||
| self.kernel_file.close() | |||
| ################################################################################################### | |||
| ################################################################################################### | |||
| @@ -64,4 +64,5 @@ if __name__ == "__main__": | |||
| write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884", 4) | |||
| write_merge_file_name(f, "dwconv2d_wgrad", "simt", 2) | |||
| write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884", 4) | |||
| write_merge_file_name(f, "rrconv2d_wgrad", "simt", 2) | |||
| f.write("]") | |||
| @@ -1260,6 +1260,218 @@ def GenerateDwconv2d_Simt(args, conv_kind): | |||
| return operations | |||
| def GenerateRegionRestrictedconv2d_Simt(args, conv_kind): | |||
| ################################################################################ | |||
| # warps per threadblock | |||
| ################################################################################ | |||
| warpsPerThreadblocks = [] | |||
| for warpsPerThreadblock0 in warpsPerThreadblockEdge: | |||
| for warpsPerThreadblock1 in warpsPerThreadblockEdge: | |||
| if ( | |||
| warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio | |||
| and warpsPerThreadblock1 / warpsPerThreadblock0 | |||
| <= warpsPerThreadblockRatio | |||
| and warpsPerThreadblock0 * warpsPerThreadblock1 | |||
| <= warpsPerThreadblockMax | |||
| ): | |||
| warpsPerThreadblocks.append( | |||
| [warpsPerThreadblock0, warpsPerThreadblock1] | |||
| ) | |||
| ################################################################################ | |||
| # warp shapes | |||
| ################################################################################ | |||
| warpNumThreads = 32 | |||
| warpShapes = [] | |||
| for warp0 in warpShapeEdges: | |||
| for warp1 in warpShapeEdges: | |||
| if ( | |||
| warp0 / warp1 <= warpShapeRatio | |||
| and warp1 / warp0 <= warpShapeRatio | |||
| and warp0 * warp1 <= warpShapeMax | |||
| and warp0 * warp1 > warpShapeMin | |||
| ): | |||
| warpShapes.append([warp0, warp1]) | |||
| # sgemm | |||
| ( | |||
| precisionType, | |||
| precisionBits, | |||
| threadblockMaxElements, | |||
| threadblockTilesL0, | |||
| ) = precisions["s"] | |||
| layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)] | |||
| math_instructions = [ | |||
| MathInstruction( | |||
| [1, 1, 1], | |||
| DataType.f32, | |||
| DataType.f32, | |||
| DataType.f32, | |||
| OpcodeClass.Simt, | |||
| MathOperation.multiply_add, | |||
| DataType.s32, | |||
| DataType.s32, | |||
| ), | |||
| MathInstruction( | |||
| [1, 1, 1], | |||
| DataType.f32, | |||
| DataType.f32, | |||
| DataType.f32, | |||
| OpcodeClass.Simt, | |||
| MathOperation.multiply_add, | |||
| DataType.s8, | |||
| DataType.s8, | |||
| ), | |||
| ] | |||
| min_cc = 50 | |||
| max_cc = 1024 | |||
| dst_layouts = [LayoutType.TensorNCHW] | |||
| dst_types = [DataType.f32] | |||
| if conv_kind == ConvKind.Wgrad: | |||
| alignment_constraints = [32] | |||
| else: | |||
| alignment_constraints = [128, 32] | |||
| operations = [] | |||
| for math_inst in math_instructions: | |||
| tile_descriptions = [ | |||
| TileDescription([128, 128, 8], 1, [4, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 64, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([64, 128, 8], 1, [2, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([128, 32, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([32, 128, 8], 1, [1, 2, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([64, 64, 8], 1, [2, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([32, 64, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([64, 32, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| TileDescription([32, 32, 8], 1, [1, 1, 1], math_inst, min_cc, max_cc), | |||
| ] | |||
| for warpsPerThreadblock in warpsPerThreadblocks: | |||
| for warpShape in warpShapes: | |||
| warpThreadsM = 0 | |||
| if warpShape[0] > warpShape[1]: | |||
| warpThreadsM = 8 | |||
| else: | |||
| warpThreadsM = 4 | |||
| warpThreadsN = warpNumThreads / warpThreadsM | |||
| # skip shapes with conflicting rectangularity | |||
| # they are unlikely to be fastest | |||
| blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1] | |||
| blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1] | |||
| warpG = warpShape[0] > warpShape[1] | |||
| warpL = warpShape[0] < warpShape[1] | |||
| blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2 | |||
| blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1] | |||
| warpG2 = warpShape[0] > warpShape[1] * 2 | |||
| warpL2 = warpShape[0] * 2 < warpShape[1] | |||
| if blockG2 and warpL: | |||
| continue | |||
| if blockL2 and warpG: | |||
| continue | |||
| if warpG2 and blockL: | |||
| continue | |||
| if warpL2 and blockG: | |||
| continue | |||
| # check threadblock ratios and max | |||
| threadblockTile = [ | |||
| warpShape[0] * warpsPerThreadblock[0], | |||
| warpShape[1] * warpsPerThreadblock[1], | |||
| ] | |||
| if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: | |||
| continue | |||
| if threadblockTile[0] > threadblockEdgeMax: | |||
| continue | |||
| if threadblockTile[1] > threadblockEdgeMax: | |||
| continue | |||
| totalThreads = ( | |||
| warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1] | |||
| ) | |||
| # calculate unroll | |||
| # ensure that every iteration performs at least one full load of A and B | |||
| unrollMin = 8 | |||
| unrollMin0 = totalThreads // threadblockTile[0] | |||
| unrollMin1 = totalThreads // threadblockTile[1] | |||
| unroll = max(unrollMin, unrollMin0, unrollMin1) | |||
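| # illustrative numbers: 256 threads with a 128x64 threadblock tile give | |||
| # unrollMin0 = 256 // 128 = 2 and unrollMin1 = 256 // 64 = 4, so unroll = max(8, 2, 4) = 8 | |||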
| threadTileM = warpShape[0] // warpThreadsM | |||
| threadTileN = warpShape[1] // warpThreadsN | |||
| if threadTileM < 2 or threadTileN < 2: | |||
| continue | |||
| if threadTileM * threadTileN * precisionBits > 8 * 8 * 32: | |||
| continue | |||
| # the epilogue currently requires threadblock N >= warpNumThreads, so skip smaller tiles | |||
| if threadblockTile[1] < warpNumThreads: | |||
| continue | |||
| # limit smem | |||
| smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits | |||
| smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits | |||
| smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024 | |||
| if smemKBytes > 48: | |||
| continue | |||
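| # illustrative numbers: a 128x128 tile with unroll 8 in f32 gives | |||
| # smemBitsA = smemBitsB = 128 * 8 * 2 * 32 bits, i.e. smemKBytes = 16, within the 48KB limit | |||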
| tile = TileDescription( | |||
| [threadblockTile[0], threadblockTile[1], unroll], | |||
| 1, | |||
| [ | |||
| threadblockTile[0] // warpShape[0], | |||
| threadblockTile[1] // warpShape[1], | |||
| 1, | |||
| ], | |||
| math_inst, | |||
| min_cc, | |||
| max_cc, | |||
| ) | |||
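| # keep this candidate only if it matches one of the hand-picked tile_descriptions above | |||
| # (same threadblock shape, warp counts and stage count) | |||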
| def filter(t: TileDescription) -> bool: | |||
| nonlocal tile | |||
| return ( | |||
| t.threadblock_shape[0] == tile.threadblock_shape[0] | |||
| and t.threadblock_shape[1] == tile.threadblock_shape[1] | |||
| and t.threadblock_shape[2] == tile.threadblock_shape[2] | |||
| and t.warp_count[0] == tile.warp_count[0] | |||
| and t.warp_count[1] == tile.warp_count[1] | |||
| and t.warp_count[2] == tile.warp_count[2] | |||
| and t.stages == tile.stages | |||
| ) | |||
| if not any(t for t in tile_descriptions if filter(t)): | |||
| continue | |||
| for layout in layouts: | |||
| for dst_type, dst_layout in zip(dst_types, dst_layouts): | |||
| for alignment_src in alignment_constraints: | |||
| operations += GenerateConv2d( | |||
| ConvType.RegionRestrictedConvolution, | |||
| conv_kind, | |||
| [tile], | |||
| layout[0], | |||
| layout[1], | |||
| dst_layout, | |||
| dst_type, | |||
| min_cc, | |||
| alignment_src, | |||
| 32, | |||
| 32, | |||
| SpecialOptimizeDesc.NoneSpecialOpt, | |||
| ImplicitGemmMode.GemmNT | |||
| if conv_kind == ConvKind.Wgrad | |||
| else ImplicitGemmMode.GemmTN, | |||
| ) | |||
| return operations | |||
| # | |||
| def GenerateDwconv2d_TensorOp_884(args, conv_kind): | |||
| layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)] | |||
| @@ -1644,6 +1856,14 @@ def GenerateDwconv2dWgradOperations(args): | |||
| return GenerateDwconv2d_TensorOp_884(args, ConvKind.Wgrad) | |||
| def GenerateRegionRestrictedconv2dWgradOperations(args): | |||
| assert args.type == "simt", ( | |||
| "operation RegionRestrictedconv2d wgrad only support" | |||
| "simt. (got:{})".format(args.type) | |||
| ) | |||
| return GenerateRegionRestrictedconv2d_Simt(args, ConvKind.Wgrad) | |||
| def GenerateGemmOperations(args): | |||
| if args.type == "tensorop884": | |||
| return GeneratesGemm_TensorOp_884(args) | |||
| @@ -1698,6 +1918,8 @@ def ConcatFile( | |||
| sub_string_1 = sub_string_2 = "simt" | |||
| if "dwconv2d_" in operations: | |||
| filtered_operations = operations[:2] + operations[9:] | |||
| if "rrconv2d_" in operations: | |||
| filtered_operations = operations[:2] + operations[9:] | |||
| elif ("conv2d" in operations) or ("deconv" in operations): | |||
| filtered_operations = "cutlass" | |||
| else: | |||
| @@ -1893,6 +2115,7 @@ if __name__ == "__main__": | |||
| "dwconv2d_fprop", | |||
| "dwconv2d_dgrad", | |||
| "dwconv2d_wgrad", | |||
| "rrconv2d_wgrad", | |||
| ], | |||
| required=True, | |||
| help="Specifies the operation to generate (gemm, gemv, conv2d, deconv, dwconv2d_fprop, dwconv2d_dgrad, dwconv2d_wgrad)", | |||
| @@ -1928,9 +2151,11 @@ if __name__ == "__main__": | |||
| operations = GenerateDwconv2dFpropOperations(args) | |||
| elif args.operations == "dwconv2d_dgrad": | |||
| operations = GenerateDwconv2dDgradOperations(args) | |||
| else: | |||
| assert args.operations == "dwconv2d_wgrad", "invalid operation" | |||
| elif args.operations == "dwconv2d_wgrad": | |||
| operations = GenerateDwconv2dWgradOperations(args) | |||
| else: | |||
| assert args.operations == "rrconv2d_wgrad", "invalid operation" | |||
| operations = GenerateRegionRestrictedconv2dWgradOperations(args) | |||
| if ( | |||
| args.operations == "conv2d" | |||
| @@ -1974,6 +2199,42 @@ if __name__ == "__main__": | |||
| required_cuda_ver_minor, | |||
| epilogue, | |||
| ) | |||
| elif args.operations == "rrconv2d_wgrad": | |||
| for operation in operations: | |||
| with EmitRegionRestrictedConvSingleKernelWrapper( | |||
| args.output, operation, short_path | |||
| ) as emitter: | |||
| emitter.emit() | |||
| head = EmitRegionRestrictedConvSingleKernelWrapper( | |||
| args.output, operations[0], short_path | |||
| ).header_template | |||
| required_cuda_ver_major = operations[0].required_cuda_ver_major | |||
| required_cuda_ver_minor = operations[0].required_cuda_ver_minor | |||
| epilogue = EmitRegionRestrictedConvSingleKernelWrapper( | |||
| args.output, operations[0], short_path | |||
| ).epilogue_template | |||
| if "tensorop" in args.type: | |||
| ConcatFile( | |||
| 4, | |||
| args.output, | |||
| args.operations, | |||
| args.type, | |||
| head, | |||
| required_cuda_ver_major, | |||
| required_cuda_ver_minor, | |||
| epilogue, | |||
| ) | |||
| else: | |||
| ConcatFile( | |||
| 2, | |||
| args.output, | |||
| args.operations, | |||
| args.type, | |||
| head, | |||
| required_cuda_ver_major, | |||
| required_cuda_ver_minor, | |||
| epilogue, | |||
| ) | |||
| elif args.operations == "gemm": | |||
| for operation in operations: | |||
| with EmitGemmSingleKernelWrapper( | |||
| @@ -532,6 +532,7 @@ class ConvType(enum.Enum): | |||
| Local = enum_auto() | |||
| LocalShare = enum_auto() | |||
| DepthwiseConvolution = enum_auto() | |||
| RegionRestrictedConvolution = enum_auto() | |||
| ConvTypeTag = { | |||
| @@ -540,6 +541,8 @@ ConvTypeTag = { | |||
| ConvType.Local: "cutlass::conv::ConvType::kLocal", | |||
| ConvType.LocalShare: "cutlass::conv::ConvType::kLocalShare", | |||
| ConvType.DepthwiseConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution", | |||
| # RegionRestrictedConvolution reuses the same conv type as DepthwiseConvolution | |||
| ConvType.RegionRestrictedConvolution: "cutlass::conv::ConvType::kDepthwiseConvolution", | |||
| } | |||
| # | |||
| @@ -640,6 +643,8 @@ class MathInstruction: | |||
| element_accumulator, | |||
| opcode_class, | |||
| math_operation=MathOperation.multiply_add, | |||
| element_rin=DataType.s32, | |||
| element_rout=DataType.s32, | |||
| ): | |||
| self.instruction_shape = instruction_shape | |||
| self.element_a = element_a | |||
| @@ -647,6 +652,8 @@ class MathInstruction: | |||
| self.element_accumulator = element_accumulator | |||
| self.opcode_class = opcode_class | |||
| self.math_operation = math_operation | |||
| self.element_rin = element_rin | |||
| self.element_rout = element_rout | |||
| # | |||
| @@ -85,4 +85,7 @@ cutlass_gen_list = [ | |||
| "dwconv2d_wgrad_tensorop884_2.cu", | |||
| "dwconv2d_wgrad_tensorop884_3.cu", | |||
| "all_dwconv2d_wgrad_tensorop884_operations.cu", | |||
| "rrconv2d_wgrad_simt_0.cu", | |||
| "rrconv2d_wgrad_simt_1.cu", | |||
| "all_rrconv2d_wgrad_simt_operations.cu", | |||
| ] | |||
| @@ -188,6 +188,7 @@ if(MGE_WITH_CUDA) | |||
| gen_cutlass_kimpl(dwconv2d_dgrad tensorop884 CUTLASS_SOURCES) | |||
| gen_cutlass_kimpl(dwconv2d_wgrad simt CUTLASS_SOURCES) | |||
| gen_cutlass_kimpl(dwconv2d_wgrad tensorop884 CUTLASS_SOURCES) | |||
| gen_cutlass_kimpl(rrconv2d_wgrad simt CUTLASS_SOURCES) | |||
| list(PREPEND CUSOURCES ${CUTLASS_SOURCES}) | |||
| # Compile the following file first, the priority_compile_opr.txt is generated by | |||
| @@ -452,6 +452,86 @@ public: | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| template <typename Operator_> | |||
| class RegionRestrictedConvolutionBackwardFilterOperation | |||
| : public ConvolutionBackwardFilterOperationBase<Operator_> { | |||
| public: | |||
| using Operator = Operator_; | |||
| using ElementSrc = typename Operator::ElementSrc; | |||
| using LayoutSrc = typename Operator::LayoutSrc; | |||
| using ElementDiff = typename Operator::ElementDiff; | |||
| using LayoutDiff = typename Operator::LayoutDiff; | |||
| using ElementGrad = typename Operator::ElementGrad; | |||
| using LayoutGrad = typename Operator::LayoutGrad; | |||
| using ElementAccumulator = typename Operator::ElementAccumulator; | |||
| using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; | |||
| using OperatorArguments = typename Operator::Arguments; | |||
| using ElementRin = typename Operator::ElementMaskInput; | |||
| using LayoutRin = typename Operator::LayoutMaskInput; | |||
| using ElementRout = typename Operator::ElementMaskOutput; | |||
| using LayoutRout = typename Operator::LayoutMaskOutput; | |||
| RegionRestrictedConvolutionBackwardFilterOperation( | |||
| char const* name = "unknown_gemm") | |||
| : ConvolutionBackwardFilterOperationBase<Operator_>(name) { | |||
| /// rin in description -> rin in C++ template | |||
| this->m_description.rin = make_TensorDescription<ElementRin, LayoutRin>( | |||
| Operator::kAlignmentMaskInput); | |||
| /// rout in description -> rout in C++ template | |||
| this->m_description.rout = make_TensorDescription<ElementRout, LayoutRout>( | |||
| Operator::kAlignmentMaskOutput); | |||
| this->m_description.without_shared_load = false; | |||
| } | |||
| virtual Status run( | |||
| void const* arguments_ptr, void* device_workspace = nullptr, | |||
| cudaStream_t stream = nullptr) const { | |||
| cutlass::conv::Operator conv_op = this->m_description.conv_op; | |||
| ConvolutionArguments const* conv_args = | |||
| reinterpret_cast<ConvolutionArguments const*>(arguments_ptr); | |||
| const auto& ps = conv_args->problem_size; | |||
| OperatorArguments args; | |||
| args.problem_size = ps; | |||
| /// src in convolution arguments -> ref_src | |||
| args.ref_src = { | |||
| static_cast<ElementSrc*>(const_cast<void*>(conv_args->src)), | |||
| LayoutSrc::packed(implicit_gemm_tensor_b_extent(conv_op, ps))}; | |||
| /// filter in convolution arguments -> ref_diff | |||
| args.ref_diff = { | |||
| static_cast<ElementDiff*>(const_cast<void*>(conv_args->filter)), | |||
| LayoutDiff::packed(implicit_gemm_tensor_a_extent(conv_op, ps))}; | |||
| /// dst in convolution arguments -> ref_grad | |||
| args.ref_grad = { | |||
| static_cast<ElementGrad*>(conv_args->dst), | |||
| LayoutGrad::packed(implicit_gemm_tensor_c_extent(conv_op, ps))}; | |||
| /// rin in convolution arguments -> ref_mask_input | |||
| args.ref_mask_input = { | |||
| static_cast<ElementRin*>(const_cast<void*>(conv_args->rin)), | |||
| LayoutRin::packed(implicit_gemm_tensor_rin_extent(conv_op, ps))}; | |||
| /// rout in convolution arguments -> ref_mask_output | |||
| args.ref_mask_output = { | |||
| static_cast<ElementRout*>(const_cast<void*>(conv_args->rout)), | |||
| LayoutRout::packed(implicit_gemm_tensor_rout_extent(conv_op, ps))}; | |||
| args.output_op = init_epilogue_param<typename Operator::EpilogueOutputOp>().get( | |||
| conv_args); | |||
| Operator op; | |||
| Status status = op.initialize(args, device_workspace); | |||
| if (status != Status::kSuccess) { | |||
| return status; | |||
| } | |||
| return op.run(stream); | |||
| } | |||
| }; | |||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | |||
| } // namespace library | |||
| } // namespace cutlass | |||
| @@ -50,6 +50,7 @@ void initialize_all_deconv_simt_operations(Manifest& manifest); | |||
| void initialize_all_dwconv2d_fprop_simt_operations(Manifest& manifest); | |||
| void initialize_all_dwconv2d_dgrad_simt_operations(Manifest& manifest); | |||
| void initialize_all_dwconv2d_wgrad_simt_operations(Manifest& manifest); | |||
| void initialize_all_rrconv2d_wgrad_simt_operations(Manifest& manifest); | |||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||
| void initialize_all_gemm_tensorop884_operations(Manifest& manifest); | |||
| void initialize_all_dwconv2d_fprop_tensorop884_operations(Manifest& manifest); | |||
| @@ -70,6 +71,7 @@ void initialize_all(Manifest& manifest) { | |||
| initialize_all_dwconv2d_fprop_simt_operations(manifest); | |||
| initialize_all_dwconv2d_dgrad_simt_operations(manifest); | |||
| initialize_all_dwconv2d_wgrad_simt_operations(manifest); | |||
| initialize_all_rrconv2d_wgrad_simt_operations(manifest); | |||
| #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) && CUTLASS_ARCH_MMA_SM70_SUPPORTED | |||
| initialize_all_gemm_tensorop884_operations(manifest); | |||
| initialize_all_dwconv2d_fprop_tensorop884_operations(manifest); | |||
| @@ -471,6 +471,10 @@ struct ConvolutionDescription : public OperationDescription { | |||
| conv::SpecialOptimizeDesc special_optimization; | |||
| conv::ImplicitGemmMode gemm_mode; | |||
| bool without_shared_load; | |||
| // only used by rrconv | |||
| TensorDescription rin; | |||
| TensorDescription rout; | |||
| }; | |||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||
| @@ -499,6 +503,10 @@ struct ConvolutionArguments { | |||
| /// Host pointer to extra param struct | |||
| void const* extra_param; | |||
| // only used by rrconv, default: nullptr | |||
| void const* rin = nullptr; | |||
| void const* rout = nullptr; | |||
| }; | |||
| ///////////////////////////////////////////////////////////////////////////////////////////////// | |||
| @@ -118,6 +118,11 @@ ConvolutionKey get_convolution_key_from_desc(const ConvolutionDescription& desc) | |||
| key.alignment_filter = desc.filter.alignment; | |||
| key.without_shared_load = desc.without_shared_load; | |||
| key.element_rin = desc.rin.element; | |||
| key.layout_rin = desc.rin.layout; | |||
| key.element_rout = desc.rout.element; | |||
| key.layout_rout = desc.rout.layout; | |||
| return key; | |||
| } | |||
| @@ -201,6 +201,12 @@ struct ConvolutionKey { | |||
| bool without_shared_load; | |||
| // only used by rrconv | |||
| library::NumericTypeID element_rin = library::NumericTypeID::kInvalid; | |||
| library::LayoutTypeID layout_rin = library::LayoutTypeID::kInvalid; | |||
| library::NumericTypeID element_rout = library::NumericTypeID::kInvalid; | |||
| library::LayoutTypeID layout_rout = library::LayoutTypeID::kInvalid; | |||
| inline bool operator==(ConvolutionKey const& rhs) const { | |||
| return (conv_op == rhs.conv_op) && (element_src == rhs.element_src) && | |||
| (layout_src == rhs.layout_src) && | |||
| @@ -223,7 +229,9 @@ struct ConvolutionKey { | |||
| (special_optimization == rhs.special_optimization) && | |||
| (alignment_src == rhs.alignment_src) && | |||
| (alignment_filter == rhs.alignment_filter) && | |||
| (without_shared_load == rhs.without_shared_load); | |||
| (without_shared_load == rhs.without_shared_load) && | |||
| (element_rin == rhs.element_rin) && (layout_rin == rhs.layout_rin) && | |||
| (element_rout == rhs.element_rout) && (layout_rout == rhs.layout_rout); | |||
| } | |||
| inline bool operator!=(ConvolutionKey const& rhs) const { return !(*this == rhs); } | |||
| @@ -260,7 +268,11 @@ struct ConvolutionKey { | |||
| "\n special_optimization: " + to_string(special_optimization) + | |||
| "\n alignment_src: " + std::to_string(alignment_src) + | |||
| "\n alignment_filter: " + std::to_string(alignment_filter) + | |||
| "\n without_shared_load: " + to_string(without_shared_load) + "\n}"; | |||
| "\n without_shared_load: " + to_string(without_shared_load) + | |||
| "\n element_rin: " + to_string(element_rin) + | |||
| "\n layout_rin: " + to_string(layout_rin) + | |||
| "\n element_rout: " + to_string(element_rout) + | |||
| "\n layout_rout: " + to_string(layout_rout) + "\n}"; | |||
| } | |||
| }; | |||
| @@ -293,6 +305,10 @@ struct ConvolutionKeyHasher { | |||
| .update(&key.alignment_src, sizeof(key.alignment_src)) | |||
| .update(&key.alignment_filter, sizeof(key.alignment_filter)) | |||
| .update(&key.without_shared_load, sizeof(key.without_shared_load)) | |||
| .update(&key.element_rin, sizeof(key.element_rin)) | |||
| .update(&key.layout_rin, sizeof(key.layout_rin)) | |||
| .update(&key.element_rout, sizeof(key.element_rout)) | |||
| .update(&key.layout_rout, sizeof(key.layout_rout)) | |||
| .digest(); | |||
| } | |||
| }; | |||
| @@ -1,4 +1,5 @@ | |||
| #include "src/cuda/region_restricted_convolution/opr_impl.h" | |||
| #include "src/cuda/cutlass/singleton.h" | |||
| #include "src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter.cuh" | |||
| #include "src/cuda/region_restricted_convolution/chanwise/kern.cuh" | |||
| #include "src/cuda/utils.h" | |||
| @@ -6,6 +7,7 @@ | |||
| using namespace megdnn; | |||
| using namespace cuda; | |||
| using namespace region_restricted_convolution; | |||
| using namespace cutlass::library; | |||
| /* ============== RegionRestrictedConvolutionForwardImpl ============== */ | |||
| void RegionRestrictedConvolutionForwardImpl::exec( | |||
| @@ -113,7 +115,137 @@ size_t RegionRestrictedConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||
| void RegionRestrictedConvolutionBackwardFilterImpl::exec( | |||
| _megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_in rin, | |||
| _megdnn_tensor_in rout, _megdnn_tensor_out grad, _megdnn_workspace workspace) { | |||
| megdnn_throw("Region Restricted Conv BackwardFilter unimplemented"); | |||
| auto fm = check_exec( | |||
| src.layout, diff.layout, rin.layout, rout.layout, grad.layout, | |||
| workspace.size); | |||
| megdnn_assert( | |||
| fm.group > 1 && src.layout.dtype.category() == DTypeCategory::FLOAT && | |||
| param().compute_mode == Param::ComputeMode::DEFAULT && | |||
| fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && | |||
| fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && | |||
| param().stride_h == 1 && param().stride_w == 1); | |||
| int hi = src.layout.operator[](2), wi = src.layout.operator[](3); | |||
| int n = diff.layout.operator[](0), ho = diff.layout.operator[](2), | |||
| wo = diff.layout.operator[](3); | |||
| int co = fm.group, ci = co, groups = co; | |||
| int fh = fm.spatial[0], fw = fm.spatial[1]; | |||
| int sh = fm.stride[0], sw = fm.stride[1]; | |||
| int ph = fm.padding[0], pw = fm.padding[1]; | |||
| int dh = 0, dw = 0; | |||
| // check if channelwise convolution | |||
| megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); | |||
| auto stream = cuda_stream(handle()); | |||
| float alpha = 1.f; | |||
| float beta = 0.f; | |||
| ConvolutionKey key; | |||
| int threadblock_shape_n = 128; | |||
| int warp_shape_m = 32; | |||
| int warp_shape_n = 64; | |||
| if (grad.layout.operator[](3) % 8 < 4) { | |||
| threadblock_shape_n = 64; | |||
| warp_shape_m = 64; | |||
| warp_shape_n = 32; | |||
| } | |||
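| // tile heuristic: by default use the 128x128x8 threadblock with 32x64x8 warps | |||
| // (warp_count [4,2,1]); if the grad tensor's dim 3 is poorly 8-aligned, fall back | |||
| // to 128x64x8 with 64x32x8 warps ([2,2,1]) -- both correspond to tiles emitted by | |||
| // GenerateRegionRestrictedconv2d_Simt | |||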
| if (rin.layout.dtype == dtype::Int32() && rout.layout.dtype == dtype::Int32()) { | |||
| key = { | |||
| cutlass::conv::Operator::kWgrad, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||
| 128, | |||
| threadblock_shape_n, | |||
| 8, | |||
| warp_shape_m, | |||
| warp_shape_n, | |||
| 8, | |||
| 1, | |||
| 1, | |||
| 1, | |||
| cutlass::epilogue::EpilogueType::kLinearCombination, | |||
| 1, | |||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||
| 1, | |||
| 1, | |||
| false, | |||
| NumericTypeID::kS32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kS32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| }; | |||
| } else if ( | |||
| rin.layout.dtype == dtype::Uint8() && rout.layout.dtype == dtype::Uint8()) { | |||
| key = { | |||
| cutlass::conv::Operator::kWgrad, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kF32, | |||
| cutlass::conv::ConvType::kDepthwiseConvolution, | |||
| 128, | |||
| threadblock_shape_n, | |||
| 8, | |||
| warp_shape_m, | |||
| warp_shape_n, | |||
| 8, | |||
| 1, | |||
| 1, | |||
| 1, | |||
| cutlass::epilogue::EpilogueType::kLinearCombination, | |||
| 1, | |||
| cutlass::conv::SpecialOptimizeDesc::NONE, | |||
| 1, | |||
| 1, | |||
| false, | |||
| NumericTypeID::kS8, | |||
| LayoutTypeID::kTensorNCHW, | |||
| NumericTypeID::kS8, | |||
| LayoutTypeID::kTensorNCHW, | |||
| }; | |||
| } else { | |||
| megdnn_throw(ssprintf( | |||
| "don't support region restricted type rin: %s, rout: %s", | |||
| rin.layout.dtype.name(), rout.layout.dtype.name()) | |||
| .c_str()); | |||
| } | |||
| const Operation* op = | |||
| (const Operation*)Singleton::get().operation_table.find_op(key); | |||
| cutlass::conv::Conv2dProblemSize problem_size{ | |||
| n, hi, wi, ci, co, fh, fw, ho, | |||
| wo, ph, pw, sh, sw, dh, dw, cutlass::conv::Mode::kCrossCorrelation, | |||
| 1, // split k slices, always 1 | |||
| groups, // groups | |||
| }; | |||
| cutlass::library::ConvolutionArguments conv_args{ | |||
| problem_size, src.raw_ptr(), diff.raw_ptr(), nullptr, | |||
| nullptr, grad.raw_ptr(), &alpha, &beta, | |||
| nullptr, nullptr, nullptr, nullptr, | |||
| nullptr, nullptr, rin.raw_ptr(), rout.raw_ptr()}; | |||
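| // the trailing rin/rout pointers fill the mask fields appended to ConvolutionArguments | |||
| // above; the remaining nullptr entries are arguments left unset for this backward-filter kernel | |||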
| cutlass_check(op->run(&conv_args, nullptr, stream)); | |||
| after_kernel_launch(); | |||
| } | |||
| // vim: syntax=cpp.doxygen | |||
| @@ -465,6 +465,206 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) { | |||
| run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); | |||
| } | |||
| TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) { | |||
| require_compute_capability(7, 5); | |||
| Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda()); | |||
| bencher.set_display(false); | |||
| bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>( | |||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage")); | |||
| Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda()); | |||
| rr_bencher.set_display(false); | |||
| ConvolutionBackwardFilter::Param param; | |||
| param.format = ConvolutionBackwardFilter::Param::Format::NCHW; | |||
| param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| RegionRestrictedConvolutionBackwardFilter::Param rr_param; | |||
| rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW; | |||
| rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| UniformIntRNG r_rng{1, 3}; | |||
| auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, | |||
| size_t fw, size_t sh, size_t sw, size_t nr_times) { | |||
| param.pad_h = fh / 2; | |||
| param.pad_w = fw / 2; | |||
| param.stride_h = sh; | |||
| param.stride_w = sw; | |||
| rr_param.pad_h = fh / 2; | |||
| rr_param.pad_w = fw / 2; | |||
| rr_param.stride_h = sh; | |||
| rr_param.stride_w = sw; | |||
| bencher.set_param(param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Float32()) | |||
| .set_dtype(4, dtype::Float32()); | |||
| bencher.proxy()->target_execution_policy = {}; | |||
| bencher.set_times(nr_times); | |||
| rr_bencher.set_param(rr_param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Int32()) | |||
| .set_dtype(3, dtype::Int32()); | |||
| rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng); | |||
| rr_bencher.set_times(nr_times); | |||
| size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); | |||
| size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); | |||
| TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi}, | |||
| rout{batch, ho, wo}, grad{g, 1, 1, fh, fw}; | |||
| float bandwith = static_cast<float>( | |||
| src.total_nr_elems() + diff.total_nr_elems() + | |||
| grad.total_nr_elems()) / | |||
| (1024 * 1024 * 1024) * 1e3; | |||
| float rr_bandwith = static_cast<float>( | |||
| src.total_nr_elems() + diff.total_nr_elems() + | |||
| rin.total_nr_elems() + rout.total_nr_elems() + | |||
| grad.total_nr_elems()) / | |||
| (1024 * 1024 * 1024) * 1e3; | |||
| auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times; | |||
| auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12; | |||
| auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; | |||
| auto rr_ops = | |||
| 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; | |||
| printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " | |||
| "src=%s, " | |||
| "diff=%s, grad=%s\n" | |||
| "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" | |||
| "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n", | |||
| src.to_string().c_str(), diff.to_string().c_str(), | |||
| grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops, | |||
| bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms, | |||
| time_in_ms / rr_time_in_ms); | |||
| }; | |||
| run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); | |||
| } | |||
| TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) { | |||
| require_compute_capability(7, 5); | |||
| Benchmarker<ConvolutionBackwardFilter> bencher(handle_cuda()); | |||
| bencher.set_display(false); | |||
| bencher.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>( | |||
| "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_128X128X8_32X64X8_2stage")); | |||
| Benchmarker<RegionRestrictedConvolutionBackwardFilter> rr_bencher(handle_cuda()); | |||
| rr_bencher.set_display(false); | |||
| ConvolutionBackwardFilter::Param param; | |||
| param.format = ConvolutionBackwardFilter::Param::Format::NCHW; | |||
| param.sparse = ConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| RegionRestrictedConvolutionBackwardFilter::Param rr_param; | |||
| rr_param.format = RegionRestrictedConvolutionBackwardFilter::Param::Format::NCHW; | |||
| rr_param.sparse = RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| UniformIntRNG r_rng{1, 3}; | |||
| auto run_bench = [&](size_t batch, size_t g, size_t hi, size_t wi, size_t fh, | |||
| size_t fw, size_t sh, size_t sw, size_t nr_times) { | |||
| param.pad_h = fh / 2; | |||
| param.pad_w = fw / 2; | |||
| param.stride_h = sh; | |||
| param.stride_w = sw; | |||
| rr_param.pad_h = fh / 2; | |||
| rr_param.pad_w = fw / 2; | |||
| rr_param.stride_h = sh; | |||
| rr_param.stride_w = sw; | |||
| bencher.set_param(param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Float32()) | |||
| .set_dtype(4, dtype::Float32()); | |||
| bencher.proxy()->target_execution_policy = {}; | |||
| bencher.set_times(nr_times); | |||
| rr_bencher.set_param(rr_param) | |||
| .set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dtype::Uint8()) | |||
| .set_dtype(3, dtype::Uint8()); | |||
| rr_bencher.set_rng(2, &r_rng).set_rng(3, &r_rng); | |||
| rr_bencher.set_times(nr_times); | |||
| size_t ho = infer_conv_shape(hi, fh, sh, param.pad_h); | |||
| size_t wo = infer_conv_shape(wi, fw, sw, param.pad_w); | |||
| TensorShape src{batch, g, hi, wi}, diff{batch, g, ho, wo}, rin{batch, hi, wi}, | |||
| rout{batch, ho, wo}, grad{g, 1, 1, fh, fw}; | |||
| float bandwith = static_cast<float>( | |||
| src.total_nr_elems() + diff.total_nr_elems() + | |||
| grad.total_nr_elems()) / | |||
| (1024 * 1024 * 1024) * 1e3; | |||
| float rr_bandwith = static_cast<float>( | |||
| src.total_nr_elems() + diff.total_nr_elems() + | |||
| rin.total_nr_elems() + rout.total_nr_elems() + | |||
| grad.total_nr_elems()) / | |||
| (1024 * 1024 * 1024) * 1e3; | |||
| auto time_in_ms = bencher.execs({src, diff, grad}) / nr_times; | |||
| auto ops = 2.0 * batch * g * hi * wi * fh * fw / (time_in_ms * 1e-3) * 1e-12; | |||
| auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; | |||
| auto rr_ops = | |||
| 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; | |||
| printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " | |||
| "src=%s, " | |||
| "diff=%s, grad=%s\n" | |||
| "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" | |||
| "bandwidth: %.2fGB/s, bandwidth(rr): %.2fGB/s, speedup: %.2f.\n", | |||
| src.to_string().c_str(), diff.to_string().c_str(), | |||
| grad.to_string().c_str(), time_in_ms, rr_time_in_ms, ops, rr_ops, | |||
| bandwith * 4 / time_in_ms, rr_bandwith * 4 / rr_time_in_ms, | |||
| time_in_ms / rr_time_in_ms); | |||
| }; | |||
| run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); | |||
| run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); | |||
| } | |||
| #endif | |||
| TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) { | |||
| @@ -585,6 +785,125 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) { | |||
| } | |||
| } | |||
| TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) { | |||
| Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda()); | |||
| for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) { | |||
| auto run = [&checker, &dt]( | |||
| size_t n, size_t g, size_t ih, size_t fh, size_t padding, | |||
| size_t stride) { | |||
| RegionRestrictedConvolutionBackwardFilter::Param cur_param; | |||
| cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode:: | |||
| CROSS_CORRELATION; | |||
| cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param:: | |||
| ComputeMode::DEFAULT; | |||
| cur_param.sparse = | |||
| RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| checker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dt) | |||
| .set_dtype(3, dt); | |||
| float scale = 64.f / sqrt(fh * fh); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| UniformIntRNG r_rng{1, 2}; | |||
| checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng( | |||
| 3, &r_rng); | |||
| cur_param.pad_h = cur_param.pad_w = padding; | |||
| cur_param.stride_h = cur_param.stride_w = stride; | |||
| size_t oh = (ih + 2 * padding - fh + 1) / stride; | |||
| checker.set_param(cur_param).execs({ | |||
| {n, g * 1, ih, ih}, // src | |||
| {n, g * 1, oh, oh}, // diff | |||
| {n, ih, ih}, // rin | |||
| {n, oh, oh}, // rout | |||
| {g, 1, 1, fh, fh} // grad | |||
| }); | |||
| }; | |||
| if (dt == dtype::Int32()) { | |||
| run(4, 8, 32, 5, 5 / 2, 1); | |||
| run(1, 2, 2, 2, 0, 1); | |||
| run(1, 2, 3, 3, 0, 1); | |||
| run(1, 2, 4, 4, 0, 1); | |||
| run(1, 2, 5, 5, 0, 1); | |||
| run(1, 2, 6, 6, 0, 1); | |||
| run(1, 2, 7, 7, 0, 1); | |||
| } | |||
| run(4, 8, 32, 7, 7 / 2, 1); | |||
| run(4, 8, 32, 9, 9 / 2, 1); | |||
| run(4, 8, 32, 11, 11 / 2, 1); | |||
| run(4, 8, 32, 13, 13 / 2, 1); | |||
| run(4, 8, 32, 15, 15 / 2, 1); | |||
| run(4, 8, 32, 17, 17 / 2, 1); | |||
| run(4, 8, 32, 19, 19 / 2, 1); | |||
| run(4, 8, 32, 21, 21 / 2, 1); | |||
| run(4, 8, 32, 23, 23 / 2, 1); | |||
| run(4, 8, 32, 25, 25 / 2, 1); | |||
| run(4, 8, 32, 27, 27 / 2, 1); | |||
| run(4, 8, 32, 29, 29 / 2, 1); | |||
| run(4, 8, 32, 31, 31 / 2, 1); | |||
| } | |||
| } | |||
| TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) { | |||
| Checker<RegionRestrictedConvolutionBackwardFilter> checker(handle_cuda()); | |||
| for (auto dt : std::vector<DType>{dtype::Int32(), dtype::Uint8()}) { | |||
| auto run = [&checker, &dt]( | |||
| size_t n, size_t g, size_t ih, size_t fh, size_t padding, | |||
| size_t stride) { | |||
| RegionRestrictedConvolutionBackwardFilter::Param cur_param; | |||
| cur_param.mode = RegionRestrictedConvolutionBackwardFilter::Param::Mode:: | |||
| CROSS_CORRELATION; | |||
| cur_param.compute_mode = RegionRestrictedConvolutionBackwardFilter::Param:: | |||
| ComputeMode::DEFAULT; | |||
| cur_param.sparse = | |||
| RegionRestrictedConvolutionBackwardFilter::Param::Sparse::GROUP; | |||
| checker.set_dtype(0, dtype::Float32()) | |||
| .set_dtype(1, dtype::Float32()) | |||
| .set_dtype(2, dt) | |||
| .set_dtype(3, dt); | |||
| float scale = 64.f / sqrt(fh * fh); | |||
| UniformFloatRNG rng(scale, 2 * scale); | |||
| UniformIntRNG r_rng{1, 1}; | |||
| checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng( | |||
| 3, &r_rng); | |||
| cur_param.pad_h = cur_param.pad_w = padding; | |||
| cur_param.stride_h = cur_param.stride_w = stride; | |||
| size_t oh = (ih + 2 * padding - fh + 1) / stride; | |||
| checker.set_param(cur_param).execs({ | |||
| {n, g * 1, ih, ih}, // src | |||
| {n, g * 1, oh, oh}, // diff | |||
| {n, ih, ih}, // rin | |||
| {n, oh, oh}, // rout | |||
| {g, 1, 1, fh, fh} // grad | |||
| }); | |||
| }; | |||
| if (dt == dtype::Int32()) { | |||
| run(4, 8, 32, 5, 5 / 2, 1); | |||
| run(1, 2, 2, 2, 0, 1); | |||
| run(1, 2, 3, 3, 0, 1); | |||
| run(1, 2, 4, 4, 0, 1); | |||
| run(1, 2, 5, 5, 0, 1); | |||
| run(1, 2, 6, 6, 0, 1); | |||
| run(1, 2, 7, 7, 0, 1); | |||
| } | |||
| run(4, 8, 32, 7, 7 / 2, 1); | |||
| run(4, 8, 32, 9, 9 / 2, 1); | |||
| run(4, 8, 32, 11, 11 / 2, 1); | |||
| run(4, 8, 32, 13, 13 / 2, 1); | |||
| run(4, 8, 32, 15, 15 / 2, 1); | |||
| run(4, 8, 32, 17, 17 / 2, 1); | |||
| run(4, 8, 32, 19, 19 / 2, 1); | |||
| run(4, 8, 32, 21, 21 / 2, 1); | |||
| run(4, 8, 32, 23, 23 / 2, 1); | |||
| run(4, 8, 32, 25, 25 / 2, 1); | |||
| run(4, 8, 32, 27, 27 / 2, 1); | |||
| run(4, 8, 32, 29, 29 / 2, 1); | |||
| run(4, 8, 32, 31, 31 / 2, 1); | |||
| } | |||
| } | |||
| } // namespace test | |||
| } // namespace megdnn | |||