| @@ -442,17 +442,38 @@ class trace: | |||||
| Serialize trace to file system. | Serialize trace to file system. | ||||
| :param fpath: positional only argument. Path of output file. | :param fpath: positional only argument. Path of output file. | ||||
| :param arg_names: names of the input tensors in the traced function | |||||
| :param append: whether output is appended to ``fpath`` | |||||
| :param f16_io_f32_comp: whether to use float16 for I/O between oprs and use | |||||
| :param arg_names: names of the input tensors in the traced function. | |||||
| :param append: whether output is appended to ``fpath``. | |||||
| :param optimize_for_inference: whether to enable optimize_for_inference | |||||
| pass before dump. | |||||
| :param enable_io16xc32: whether to use float16 for I/O between oprs and use | |||||
| float32 as internal computation precision. Note the output var would be | float32 as internal computation precision. Note the output var would be | ||||
| changed to float16 | |||||
| :param f16_io_comp: whether to use float16 for both I/O and computation | |||||
| precision | |||||
| :param use_nhwcd4: whether to use NHWCD4 data format. This is faster on some | |||||
| OpenCL devices | |||||
| :param fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty | |||||
| into one opr. This is supported only in NHWCD4 format. | |||||
| changed to float16. | |||||
| :param enable_ioc16: whether to use float16 for both I/O and computation | |||||
| precision. | |||||
| :param enable_hwcd4: whether to use NHWCD4 data layout. This is faster on some | |||||
| OpenCL backends. | |||||
| :param enable_nchw88: whether to use NCHW88 data layout. It is currently | |||||
| used in X86 AVX backend. | |||||
| :param enable_nchw44: whether to use NCHW44 data layout. It is currently | |||||
| used in arm backend. | |||||
| :param enable_nchw44_dot: whether to use NCHW44_DOT data layout. It is currently | |||||
| used in armv8.2+dotprod backend. | |||||
| :param enable_nchw4: whether to use NCHW4 data layout. It is currently | |||||
| used in nvidia backend(based on cudnn). | |||||
| :param enable_nchw32: whether to use NCHW32 data layout. It is currently | |||||
| used in nvidia backend with tensorcore(based on cudnn). | |||||
| :param enable_chwn4: whether to use CHWN4 data layout. It is currently | |||||
| used in nvidia backend with tensorcore. | |||||
| :param enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearity | |||||
| into one opr. | |||||
| :param enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z | |||||
| input for inference on nvidia backend(this optimization pass will | |||||
| result in mismatch of the precision of output of training and | |||||
| inference) | |||||
| """ | """ | ||||
| if self._status != self._FINISHED: | if self._status != self._FINISHED: | ||||
| raise ValueError("not traced") | raise ValueError("not traced") | ||||
| @@ -475,6 +496,7 @@ class trace: | |||||
| "enable_nchw88": "use_nchw88", | "enable_nchw88": "use_nchw88", | ||||
| "enable_nchw32": "use_nchw32", | "enable_nchw32": "use_nchw32", | ||||
| "enable_nchw44": "use_nchw44", | "enable_nchw44": "use_nchw44", | ||||
| "enable_nchw44_dot": "use_nchw44_dot", | |||||
| "enable_chwn4": "use_chwn4", | "enable_chwn4": "use_chwn4", | ||||
| "enable_fuse_conv_bias_nonlinearity": "fuse_conv_bias_nonlinearity", | "enable_fuse_conv_bias_nonlinearity": "fuse_conv_bias_nonlinearity", | ||||
| "enable_fuse_conv_bias_with_z": "fuse_conv_bias_with_z", | "enable_fuse_conv_bias_with_z": "fuse_conv_bias_with_z", | ||||