| @@ -23,6 +23,7 @@ equal_op_info = AkgGpuRegOp("Equal") \ | |||||
| .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ | |||||
| .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | ||||
| @@ -23,6 +23,7 @@ greater_equal_op_info = AkgGpuRegOp("GreaterEqual") \ | |||||
| .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ | |||||
| .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | ||||
| @@ -23,6 +23,7 @@ lessequal_op_info = AkgGpuRegOp("LessEqual") \ | |||||
| .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F64_Default, DataType.F64_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ | |||||
| .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | ||||
| @@ -22,6 +22,7 @@ notequal_op_info = AkgGpuRegOp("NotEqual") \ | |||||
| .output(0, "output") \ | .output(0, "output") \ | ||||
| .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F16_Default, DataType.F16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.F32_Default, DataType.F32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I64_Default, DataType.I64_Default, DataType.BOOL_Default) \ | |||||
| .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I32_Default, DataType.I32_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.I16_Default, DataType.I16_Default, DataType.BOOL_Default) \ | ||||
| .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | .dtype_format(DataType.U8_Default, DataType.U8_Default, DataType.BOOL_Default) \ | ||||
| @@ -277,7 +277,7 @@ sh run_standalone_train_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATA | |||||
| sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [CHECKPOINT_PATH] | ||||
| # gpu benchmark example | # gpu benchmark example | ||||
| sh run_gpu_resnet_benchmark.sh [IMAGENET_DATASET_PATH] [BATCH_SIZE](optional) [DEVICE_NUM](optional) | |||||
| sh run_gpu_resnet_benchmark.sh [IMAGENET_DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional) [DEVICE_NUM](optional) | |||||
| ``` | ``` | ||||
| #### Running parameter server mode training | #### Running parameter server mode training | ||||
| @@ -39,6 +39,8 @@ parser.add_argument('--epoch_size', type=str, default="2", help='Epoch_size: def | |||||
| parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20') | parser.add_argument('--print_per_steps', type=str, default="20", help='Print loss and time per steps: default 20') | ||||
| parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') | parser.add_argument('--run_distribute', type=ast.literal_eval, default=False, help='Run distribute') | ||||
| parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path') | parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dataset path') | ||||
| parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\ | |||||
| help='Compute data type fp32 or fp16: default fp16') | |||||
| args_opt = parser.parse_args() | args_opt = parser.parse_args() | ||||
| set_seed(1) | set_seed(1) | ||||
| @@ -60,7 +62,7 @@ def pad(image): | |||||
| output = np.concatenate((image, zeros), axis=2) | output = np.concatenate((image, zeros), axis=2) | ||||
| return output | return output | ||||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU"): | |||||
| def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16"): | |||||
| ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) | ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) | ||||
| image_size = 224 | image_size = 224 | ||||
| @@ -81,9 +83,11 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" | |||||
| C.CenterCrop(image_size), | C.CenterCrop(image_size), | ||||
| C.Normalize(mean=mean, std=std), | C.Normalize(mean=mean, std=std), | ||||
| ] | ] | ||||
| if dtype == "fp32": | |||||
| trans.append(C.HWC2CHW()) | |||||
| ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=4) | ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=4) | ||||
| ds = ds.map(operations=pad, input_columns="image", num_parallel_workers=4) | |||||
| if dtype == "fp16": | |||||
| ds = ds.map(operations=pad, input_columns="image", num_parallel_workers=4) | |||||
| # apply batch operations | # apply batch operations | ||||
| ds = ds.batch(batch_size, drop_remainder=True) | ds = ds.batch(batch_size, drop_remainder=True) | ||||
| # apply dataset repeat operation | # apply dataset repeat operation | ||||
| @@ -112,6 +116,7 @@ if __name__ == '__main__': | |||||
| epoch_size = int(args_opt.epoch_size) | epoch_size = int(args_opt.epoch_size) | ||||
| total_batch = int(args_opt.batch_size) | total_batch = int(args_opt.batch_size) | ||||
| print_per_steps = int(args_opt.print_per_steps) | print_per_steps = int(args_opt.print_per_steps) | ||||
| compute_type = str(args_opt.dtype).lower() | |||||
| # init context | # init context | ||||
| context.set_context(mode=context.GRAPH_MODE, device_target=dev, save_graphs=False) | context.set_context(mode=context.GRAPH_MODE, device_target=dev, save_graphs=False) | ||||
| @@ -122,14 +127,14 @@ if __name__ == '__main__': | |||||
| # create dataset | # create dataset | ||||
| dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, | dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, | ||||
| batch_size=total_batch, target=dev) | |||||
| batch_size=total_batch, target=dev, dtype=compute_type) | |||||
| step_size = dataset.get_dataset_size() | step_size = dataset.get_dataset_size() | ||||
| if (print_per_steps > step_size or print_per_steps < 1): | if (print_per_steps > step_size or print_per_steps < 1): | ||||
| print("Arg: print_per_steps should lessequal to dataset_size ", step_size) | print("Arg: print_per_steps should lessequal to dataset_size ", step_size) | ||||
| print("Change to default: 20") | print("Change to default: 20") | ||||
| print_per_steps = 20 | print_per_steps = 20 | ||||
| # define net | # define net | ||||
| net = resnet(class_num=1001) | |||||
| net = resnet(class_num=1001, dtype=compute_type) | |||||
| # init weight | # init weight | ||||
| for _, cell in net.cells_and_names(): | for _, cell in net.cells_and_names(): | ||||
| @@ -163,10 +168,11 @@ if __name__ == '__main__': | |||||
| loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") | ||||
| opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) | opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) | ||||
| loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) | loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) | ||||
| model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) | |||||
| # Mixed precision | # Mixed precision | ||||
| model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, | |||||
| amp_level="O2", keep_batchnorm_fp32=False) | |||||
| if compute_type == "fp16": | |||||
| model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, | |||||
| amp_level="O2", keep_batchnorm_fp32=False) | |||||
| # define callbacks | # define callbacks | ||||
| time_cb = MyTimeMonitor(total_batch, print_per_steps) | time_cb = MyTimeMonitor(total_batch, print_per_steps) | ||||
| loss_cb = LossMonitor() | loss_cb = LossMonitor() | ||||
| @@ -14,10 +14,11 @@ | |||||
| # limitations under the License. | # limitations under the License. | ||||
| # ============================================================================ | # ============================================================================ | ||||
| if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] | |||||
| if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ] && [ $# != 4 ] | |||||
| then | then | ||||
| echo "Usage: sh run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DEVICE_NUM](optional)" | |||||
| echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 8" | |||||
| echo "Usage: sh run_gpu_resnet_benchmark.sh [DATASET_PATH] [BATCH_SIZE](optional) [DTYPE](optional)\ | |||||
| [DEVICE_NUM](optional)" | |||||
| echo "Example: sh run_gpu_resnet_benchmark.sh /path/imagenet/train 256 FP16 8" | |||||
| exit 1 | exit 1 | ||||
| fi | fi | ||||
| @@ -44,6 +45,12 @@ fi | |||||
| if [ $# == 3 ] | if [ $# == 3 ] | ||||
| then | then | ||||
| mpirun --allow-run-as-root -n $3 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ | |||||
| --dataset_path=$DATAPATH --batch_size=$2 | |||||
| python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True --dtype=$3 \ | |||||
| --dataset_path=$DATAPATH --batch_size=$2 | |||||
| fi | fi | ||||
| if [ $# == 4 ] | |||||
| then | |||||
| mpirun --allow-run-as-root -n $4 python ${self_path}/../gpu_resnet_benchmark.py --run_distribute=True \ | |||||
| --dataset_path=$DATAPATH --batch_size=$2 --dtype=$3 | |||||
| fi | |||||
| @@ -20,6 +20,13 @@ from mindspore.ops import operations as P | |||||
| from mindspore.common.tensor import Tensor | from mindspore.common.tensor import Tensor | ||||
| from scipy.stats import truncnorm | from scipy.stats import truncnorm | ||||
| format_ = "NHWC" | |||||
| # transpose shape to NCHW, default init is NHWC. | |||||
| def _trans_shape(shape, shape_format): | |||||
| if shape_format == "NCHW": | |||||
| return (shape[0], shape[3], shape[1], shape[2]) | |||||
| return shape | |||||
| def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): | def _conv_variance_scaling_initializer(in_channel, out_channel, kernel_size): | ||||
| fan_in = in_channel * kernel_size * kernel_size | fan_in = in_channel * kernel_size * kernel_size | ||||
| scale = 1.0 | scale = 1.0 | ||||
| @@ -37,30 +44,33 @@ def _weight_variable(shape, factor=0.01): | |||||
| def _conv3x3(in_channel, out_channel, stride=1): | def _conv3x3(in_channel, out_channel, stride=1): | ||||
| weight_shape = (out_channel, 3, 3, in_channel) | weight_shape = (out_channel, 3, 3, in_channel) | ||||
| weight_shape = _trans_shape(weight_shape, format_) | |||||
| weight = _weight_variable(weight_shape) | weight = _weight_variable(weight_shape) | ||||
| return nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=stride, | return nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=stride, | ||||
| padding=1, pad_mode='pad', weight_init=weight, data_format="NHWC") | |||||
| padding=1, pad_mode='pad', weight_init=weight, data_format=format_) | |||||
| def _conv1x1(in_channel, out_channel, stride=1): | def _conv1x1(in_channel, out_channel, stride=1): | ||||
| weight_shape = (out_channel, 1, 1, in_channel) | weight_shape = (out_channel, 1, 1, in_channel) | ||||
| weight_shape = _trans_shape(weight_shape, format_) | |||||
| weight = _weight_variable(weight_shape) | weight = _weight_variable(weight_shape) | ||||
| return nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride, | return nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride, | ||||
| padding=0, pad_mode='pad', weight_init=weight, data_format="NHWC") | |||||
| padding=0, pad_mode='pad', weight_init=weight, data_format=format_) | |||||
| def _conv7x7(in_channel, out_channel, stride=1): | def _conv7x7(in_channel, out_channel, stride=1): | ||||
| weight_shape = (out_channel, 7, 7, in_channel) | weight_shape = (out_channel, 7, 7, in_channel) | ||||
| weight_shape = _trans_shape(weight_shape, format_) | |||||
| weight = _weight_variable(weight_shape) | weight = _weight_variable(weight_shape) | ||||
| return nn.Conv2d(in_channel, out_channel, kernel_size=7, stride=stride, | return nn.Conv2d(in_channel, out_channel, kernel_size=7, stride=stride, | ||||
| padding=3, pad_mode='pad', weight_init=weight, data_format="NHWC") | |||||
| padding=3, pad_mode='pad', weight_init=weight, data_format=format_) | |||||
| def _bn(channel): | def _bn(channel): | ||||
| return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=1, beta_init=0, | return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=1, beta_init=0, | ||||
| moving_mean_init=0, moving_var_init=1, data_format="NHWC") | |||||
| moving_mean_init=0, moving_var_init=1, data_format=format_) | |||||
| def _bn_last(channel): | def _bn_last(channel): | ||||
| return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=0, beta_init=0, | return nn.BatchNorm2d(channel, eps=1e-4, momentum=0.9, gamma_init=0, beta_init=0, | ||||
| moving_mean_init=0, moving_var_init=1, data_format="NHWC") | |||||
| moving_mean_init=0, moving_var_init=1, data_format=format_) | |||||
| def _fc(in_channel, out_channel): | def _fc(in_channel, out_channel): | ||||
| weight_shape = (out_channel, in_channel) | weight_shape = (out_channel, in_channel) | ||||
| @@ -165,10 +175,13 @@ class ResNet(nn.Cell): | |||||
| if not len(layer_nums) == len(in_channels) == len(out_channels) == 4: | if not len(layer_nums) == len(in_channels) == len(out_channels) == 4: | ||||
| raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!") | raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!") | ||||
| self.conv1 = _conv7x7(4, 64, stride=2) | |||||
| input_data_channel = 4 | |||||
| if format_ == "NCHW": | |||||
| input_data_channel = 3 | |||||
| self.conv1 = _conv7x7(input_data_channel, 64, stride=2) | |||||
| self.bn1 = _bn(64) | self.bn1 = _bn(64) | ||||
| self.relu = P.ReLU() | self.relu = P.ReLU() | ||||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same", data_format="NHWC") | |||||
| self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same", data_format=format_) | |||||
| self.layer1 = self._make_layer(block, | self.layer1 = self._make_layer(block, | ||||
| layer_nums[0], | layer_nums[0], | ||||
| in_channel=in_channels[0], | in_channel=in_channels[0], | ||||
| @@ -190,7 +203,7 @@ class ResNet(nn.Cell): | |||||
| out_channel=out_channels[3], | out_channel=out_channels[3], | ||||
| stride=strides[3]) | stride=strides[3]) | ||||
| self.avg_pool = P.AvgPool(7, 1, data_format="NHWC") | |||||
| self.avg_pool = P.AvgPool(7, 1, data_format=format_) | |||||
| self.flatten = nn.Flatten() | self.flatten = nn.Flatten() | ||||
| self.end_point = _fc(out_channels[3], num_classes) | self.end_point = _fc(out_channels[3], num_classes) | ||||
| @@ -237,7 +250,7 @@ class ResNet(nn.Cell): | |||||
| return out | return out | ||||
| def resnet50(class_num=1001): | |||||
| def resnet50(class_num=1001, dtype="fp16"): | |||||
| """ | """ | ||||
| Get ResNet50 neural network. | Get ResNet50 neural network. | ||||
| @@ -250,6 +263,9 @@ def resnet50(class_num=1001): | |||||
| Examples: | Examples: | ||||
| >>> net = resnet50(1001) | >>> net = resnet50(1001) | ||||
| """ | """ | ||||
| global format_ | |||||
| if dtype == "fp32": | |||||
| format_ = "NCHW" | |||||
| return ResNet(ResidualBlock, | return ResNet(ResidualBlock, | ||||
| [3, 4, 6, 3], | [3, 4, 6, 3], | ||||
| [64, 256, 512, 1024], | [64, 256, 512, 1024], | ||||