@@ -213,7 +213,7 @@ class EmitConv2dInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
   typename cutlass::conv::device::Convolution<
     ${element_src},
     ${layout_src},
@@ -317,7 +317,7 @@ class EmitDeconvInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
   typename cutlass::conv::device::Deconvolution<
     ${element_src},
     ${layout_src},
@@ -419,7 +419,7 @@ class EmitConvolutionBackwardFilterInstance:
     def __init__(self):
         self.template = """
 // kernel instance "${operation_name}" generated by cutlass generator
-using Convolution =
+using Convolution_${operation_name} =
   typename cutlass::conv::device::ConvolutionBackwardFilter<
     ${element_src},
     ${layout_src},
@@ -905,7 +905,7 @@ namespace cutlass {
 namespace library {

 void initialize_${operation_name}(Manifest &manifest) {
-  manifest.append(new ${convolution_name}<Convolution>(
+  manifest.append(new ${convolution_name}<Convolution_${operation_name}>(
     "${operation_name}"
   ));
 }
@@ -929,19 +929,6 @@ void initialize_${operation_name}(Manifest &manifest) {
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -965,7 +952,6 @@ void initialize_${operation_name}(Manifest &manifest) {
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -1347,19 +1347,6 @@ void initialize_${operation_name}(Manifest &manifest) {
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -1379,7 +1366,6 @@ void initialize_${operation_name}(Manifest &manifest) {
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -1435,20 +1421,6 @@ ${operation_instance}
             self.kernel_path, "%s.cu" % self.operation.procedural_name()
         )
         self.kernel_file = open(self.kernel_path, "w")
-        self.kernel_file.write(
-            SubstituteTemplate(
-                self.header_template,
-                {
-                    "wrapper_path": self.wrapper_path,
-                    "required_cuda_ver_major": str(
-                        self.operation.required_cuda_ver_major
-                    ),
-                    "required_cuda_ver_minor": str(
-                        self.operation.required_cuda_ver_minor
-                    ),
-                },
-            )
-        )
         return self

     #
@@ -1468,7 +1440,6 @@ ${operation_instance}
     #
     def __exit__(self, exception_type, exception_value, traceback):
-        self.kernel_file.write(self.epilogue_template)
         self.kernel_file.close()
@@ -35,24 +35,31 @@ def write_op_list(f, gen_op, gen_type):
     if gen_op != "gemv":
         f.write('    "all_%s_%s_operations.cu",\n' % (gen_op, gen_type))

+
+# Write down a list of merged filenames
+def write_merge_file_name(f, gen_op, gen_type):
+    f.write('    "{}_{}_1.cu",\n'.format(gen_op,gen_type))
+    f.write('    "{}_{}_2.cu",\n'.format(gen_op,gen_type))
+    if gen_op != "gemv":
+        f.write('    "all_{}_{}_operations.cu",\n'.format(gen_op,gen_type))
+
+
 if __name__ == "__main__":
     with open("list.bzl", "w") as f:
         f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
         f.write("cutlass_gen_list = [\n")
-        write_op_list(f, "gemm", "simt")
-        write_op_list(f, "gemm", "tensorop1688")
-        write_op_list(f, "gemm", "tensorop884")
-        write_op_list(f, "gemv", "simt")
-        write_op_list(f, "deconv", "simt")
-        write_op_list(f, "deconv", "tensorop8816")
-        write_op_list(f, "conv2d", "simt")
-        write_op_list(f, "conv2d", "tensorop8816")
-        write_op_list(f, "conv2d", "tensorop8832")
-        write_op_list(f, "dwconv2d_fprop", "simt")
-        write_op_list(f, "dwconv2d_fprop", "tensorop884")
-        write_op_list(f, "dwconv2d_dgrad", "simt")
-        write_op_list(f, "dwconv2d_dgrad", "tensorop884")
-        write_op_list(f, "dwconv2d_wgrad", "simt")
-        write_op_list(f, "dwconv2d_wgrad", "tensorop884")
+        write_merge_file_name(f, "gemm", "simt")
+        write_merge_file_name(f, "gemm", "tensorop1688")
+        write_merge_file_name(f, "gemm", "tensorop884")
+        write_merge_file_name(f, "gemv", "simt")
+        write_merge_file_name(f, "deconv", "simt")
+        write_merge_file_name(f, "deconv", "tensorop8816")
+        write_merge_file_name(f, "conv2d", "simt")
+        write_merge_file_name(f, "conv2d", "tensorop8816")
+        write_merge_file_name(f, "conv2d", "tensorop8832")
+        write_merge_file_name(f, "dwconv2d_fprop", "simt")
+        write_merge_file_name(f, "dwconv2d_fprop", "tensorop884")
+        write_merge_file_name(f, "dwconv2d_dgrad", "simt")
+        write_merge_file_name(f, "dwconv2d_dgrad", "tensorop884")
+        write_merge_file_name(f, "dwconv2d_wgrad", "simt")
+        write_merge_file_name(f, "dwconv2d_wgrad", "tensorop884")
         f.write("]")
@@ -9,7 +9,7 @@ import os.path
 import shutil
 import argparse
 import platform
+import string

 from library import *
 from manifest import *
@@ -1657,6 +1657,108 @@ def GenerateGemvOperations(args):
     return GenerateGemv_Simt(args)

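+
+# Merge the per-kernel .cu files emitted under file_path into two larger files,
+# "<file_name_first>_<file_name_last>_1.cu" and "<file_name_first>_<file_name_last>_2.cu",
+# so the build has far fewer translation units to compile.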
+def concat_file(file_path:str,file_name_first:str,file_name_last:str,head:str,required_cuda_ver_major:str, required_cuda_ver_minor:str, epilogue:str, wrapper_path = None):
+    import os
+    meragefiledir = file_path
+    filenames=os.listdir(meragefiledir)
+    file1=open(file_path + '/{}_{}_1.cu'.format(file_name_first,file_name_last),'w')
+    file2=open(file_path + '/{}_{}_2.cu'.format(file_name_first,file_name_last),'w')
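+    # Both merged files start from the same generated header; when wrapper_path
+    # is given, it is substituted into the header as well.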
+    if wrapper_path is None:
+        file1.write(
+            SubstituteTemplate(
+                head,
+                {
+                    "required_cuda_ver_major": str(
+                        required_cuda_ver_major
+                    ),
+                    "required_cuda_ver_minor": str(
+                        required_cuda_ver_minor
+                    ),
+                },
+            )
+        )
+        file2.write(
+            SubstituteTemplate(
+                head,
+                {
+                    "required_cuda_ver_major": str(
+                        required_cuda_ver_major
+                    ),
+                    "required_cuda_ver_minor": str(
+                        required_cuda_ver_minor
+                    ),
+                },
+            )
+        )
+    else:
+        file1.write(
+            SubstituteTemplate(
+                head,
+                {
+                    "wrapper_path": wrapper_path,
+                    "required_cuda_ver_major": str(
+                        required_cuda_ver_major
+                    ),
+                    "required_cuda_ver_minor": str(
+                        required_cuda_ver_minor
+                    ),
+                },
+            )
+        )
+        file2.write(
+            SubstituteTemplate(
+                head,
+                {
+                    "wrapper_path": wrapper_path,
+                    "required_cuda_ver_major": str(
+                        required_cuda_ver_major
+                    ),
+                    "required_cuda_ver_minor": str(
+                        required_cuda_ver_minor
+                    ),
+                },
+            )
+        )
+    flag = 0
+    if "tensorop" in file_name_last:
+        sub_string_1 = "tensorop"
+        sub_string_2 = file_name_last[8:]
+    else:
+        sub_string_1 = sub_string_2 = "simt"
+    if "dwconv2d_" in file_name_first:
+        file_name_first = file_name_first[:2]+file_name_first[9:]
+    elif ("conv2d" in file_name_first) or ("deconv" in file_name_first):
+        file_name_first = "cutlass"
+    for filename in filenames:
+        if (file_name_first in filename) and (sub_string_1 in filename) and (sub_string_2 in filename) and ("all_" not in filename):
+            flag += 1
+            filepath=meragefiledir+'/'+filename
+            if flag <= len(filenames)/2:
+                for line in open(filepath):
+                    file1.writelines(line)
+            else:
+                for line in open(filepath):
+                    file2.writelines(line)
+            os.remove(filepath)
+            file1.write('\n')
+            file2.write('\n')
+        elif filename[0].isdigit() and ("all_" not in filename):
+            flag += 1
+            filepath=meragefiledir+'/'+filename
+            if flag <= len(filenames)/2:
+                for line in open(filepath):
+                    file1.writelines(line)
+            else:
+                for line in open(filepath):
+                    file2.writelines(line)
+            os.remove(filepath)
+            file1.write('\n')
+            file2.write('\n')
+    file1.write(epilogue)
+    file2.write(epilogue)
+    file1.close()
+    file2.close()

 ###################################################################################################
 ###################################################################################################
@@ -1727,18 +1829,33 @@ if __name__ == "__main__":
                 args.output, operation, short_path
             ) as emitter:
                 emitter.emit()
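+        # merge the per-kernel files just emitted into two combined .cu files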
+        head = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitConvSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
+        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemm":
         for operation in operations:
             with EmitGemmSingleKernelWrapper(
                 args.output, operation, short_path
             ) as emitter:
                 emitter.emit()
+        head = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitGemmSingleKernelWrapper(args.output, operations[0], short_path).epilogue_template
+        concat_file(args.output, args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue)
     elif args.operations == "gemv":
         for operation in operations:
             with EmitGemvSingleKernelWrapper(
                 args.output, operation, gemv_wrapper_path, short_path
             ) as emitter:
                 emitter.emit()
+        head = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).header_template
+        required_cuda_ver_major = operations[0].required_cuda_ver_major
+        required_cuda_ver_minor = operations[0].required_cuda_ver_minor
+        epilogue = EmitGemvSingleKernelWrapper(args.output, operations[0], gemv_wrapper_path, short_path).epilogue_template
+        concat_file(args.output,args.operations, args.type, head,required_cuda_ver_major, required_cuda_ver_minor, epilogue, wrapper_path = gemv_wrapper_path)

     if args.operations != "gemv":
         GenerateManifest(args, operations, args.output)
@@ -1,3 +1,5 @@
+#pragma once
+
 #include "cutlass/gemm/kernel/default_gemv.h"
 #include "cutlass/gemm/kernel/gemv_batched_strided.h"
 #include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"