Here we benchmark the training speed of a Mask R-CNN in detectron2,
with some other popular open source Mask R-CNN implementations.
+-----------------------------+--------------------+
| Implementation | Throughput (img/s) |
+=============================+====================+
| Detectron2 | 59 |
+-----------------------------+--------------------+
| maskrcnn-benchmark_ | 51 |
+-----------------------------+--------------------+
| tensorpack_ | 50 |
+-----------------------------+--------------------+
| mmdetection_ | 41 |
+-----------------------------+--------------------+
| simpledet_ | 39 |
+-----------------------------+--------------------+
| Detectron_ | 19 |
+-----------------------------+--------------------+
| `matterport/Mask_RCNN`__ | 14 |
+-----------------------------+--------------------+
.. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark/
.. _tensorpack: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN
.. _mmdetection: https://github.com/open-mmlab/mmdetection/
.. _simpledet: https://github.com/TuSimple/simpledet/
.. _Detectron: https://github.com/facebookresearch/Detectron
__ https://github.com/matterport/Mask_RCNN/
Details for each implementation:
Detectron2:
python tools/train_net.py --config-file configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml --num-gpus 8
maskrcnn-benchmark: use commit 0ce8f6f with sed -i 's/torch.uint8/torch.bool/g' **/*.py to make it compatible with latest PyTorch.
Then, run training with
python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
The speed we observed is faster than its model zoo, likely due to different software versions.
tensorpack: at commit caafda, export TF_CUDNN_USE_AUTOTUNE=0, then run
mpirun -np 8 ./train.py --config DATA.BASEDIR=/data/coco TRAINER=horovod BACKBONE.STRIDE_1X1=True TRAIN.STEPS_PER_EPOCH=50 --load ImageNet-R50-AlignPadding.npz
mmdetection: at commit 4d9a5f, apply the following diff, then run
./tools/dist_train.sh configs/mask_rcnn_r50_fpn_1x.py 8
The speed we observed is faster than its model zoo, likely due to different software versions.
<details>
<summary>
(diff to make it use the same architecture - click to expand)
</summary>

```diff
diff --git i/configs/mask_rcnn_r50_fpn_1x.py w/configs/mask_rcnn_r50_fpn_1x.py
index 04f6d22..ed721f2 100644
--- i/configs/mask_rcnn_r50_fpn_1x.py
+++ w/configs/mask_rcnn_r50_fpn_1x.py
@@ -1,14 +1,15 @@
 # model settings
 model = dict(
     type='MaskRCNN',
-    pretrained='torchvision://resnet50',
+    pretrained='open-mmlab://resnet50_caffe',
     backbone=dict(
         type='ResNet',
         depth=50,
         num_stages=4,
         out_indices=(0, 1, 2, 3),
         frozen_stages=1,
-        style='pytorch'),
+        norm_cfg=dict(type="BN", requires_grad=False),
+        style='caffe'),
     neck=dict(
         type='FPN',
         in_channels=[256, 512, 1024, 2048],
@@ -115,7 +116,7 @@ test_cfg = dict(
 dataset_type = 'CocoDataset'
 data_root = 'data/coco/'
 img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+    mean=[123.675, 116.28, 103.53], std=[1.0, 1.0, 1.0], to_rgb=False)
 train_pipeline = [
     dict(type='LoadImageFromFile'),
     dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
```

</details>
SimpleDet: at commit 9187a1, run
python detection_train.py --config config/mask_r50v1_fpn_1x.py
Detectron: run
python tools/train_net.py --cfg configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml
Note that many of its ops run on CPUs, therefore the performance is limited.
matterport/Mask_RCNN: at commit 3deaec, apply the following diff, export TF_CUDNN_USE_AUTOTUNE=0, then run
python coco.py train --dataset=/data/coco/ --model=imagenet
Note that many small details in this implementation might be different
from Detectron's standards.
<details>
<summary>
(diff to make it use the same hyperparameters - click to expand)
</summary>

```diff
diff --git i/mrcnn/model.py w/mrcnn/model.py
index 62cb2b0..61d7779 100644
--- i/mrcnn/model.py
+++ w/mrcnn/model.py
@@ -2367,8 +2367,8 @@ class MaskRCNN():
             epochs=epochs,
             steps_per_epoch=self.config.STEPS_PER_EPOCH,
             callbacks=callbacks,
-            validation_data=val_generator,
-            validation_steps=self.config.VALIDATION_STEPS,
+            #validation_data=val_generator,
+            #validation_steps=self.config.VALIDATION_STEPS,
             max_queue_size=100,
             workers=workers,
             use_multiprocessing=True,
diff --git i/mrcnn/parallel_model.py w/mrcnn/parallel_model.py
index d2bf53b..060172a 100644
--- i/mrcnn/parallel_model.py
+++ w/mrcnn/parallel_model.py
@@ -32,6 +32,7 @@ class ParallelModel(KM.Model):
         keras_model: The Keras model to parallelize
         gpu_count: Number of GPUs. Must be > 1
         """
+        super().__init__()
         self.inner_model = keras_model
         self.gpu_count = gpu_count
         merged_outputs = self.make_parallel()
diff --git i/samples/coco/coco.py w/samples/coco/coco.py
index 5d172b5..239ed75 100644
--- i/samples/coco/coco.py
+++ w/samples/coco/coco.py
@@ -81,7 +81,10 @@ class CocoConfig(Config):
     IMAGES_PER_GPU = 2

     # Uncomment to train on 8 GPUs (default is 1)
-    # GPU_COUNT = 8
+    GPU_COUNT = 8
+    BACKBONE = "resnet50"
+    STEPS_PER_EPOCH = 50
+    TRAIN_ROIS_PER_IMAGE = 512

     # Number of classes (including background)
     NUM_CLASSES = 1 + 80  # COCO has 80 classes
@@ -496,29 +499,10 @@ if __name__ == '__main__':
         # *** This training schedule is an example. Update to your needs ***

         # Training - Stage 1
         print("Training network heads")
         model.train(dataset_train, dataset_val,
                     learning_rate=config.LEARNING_RATE,
                     epochs=40,
-                    layers='heads',
-                    augmentation=augmentation)
-
-        # Training - Stage 2
-        # Finetune layers from ResNet stage 4 and up
-        print("Fine tune Resnet stage 4 and up")
-        model.train(dataset_train, dataset_val,
-                    learning_rate=config.LEARNING_RATE,
-                    epochs=120,
-                    layers='4+',
-                    augmentation=augmentation)
-
-        # Training - Stage 3
-        # Fine tune all layers
-        print("Fine tune all layers")
-        model.train(dataset_train, dataset_val,
-                    learning_rate=config.LEARNING_RATE / 10,
-                    epochs=160,
-                    layers='all',
+                    layers='3+',
                     augmentation=augmentation)

     if args.command == "evaluate":
```

</details>