diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..a757d4f
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,85 @@
+AccessModifierOffset: -1
+AlignAfterOpenBracket: AlwaysBreak
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ForEachMacros:   [ FOR_EACH, FOR_EACH_ENUMERATE, FOR_EACH_KV, FOR_EACH_R, FOR_EACH_RANGE, ]
+IncludeCategories:
+  - Regex:           '^<.*\.h(pp)?>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..74ff132
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,9 @@
+# This is an example .flake8 config, used when developing detectron2 itself.
+# Keep in sync with setup.cfg which is used for source packages.
+
+[flake8]
+ignore = W503, E203, E221, C901
+max-line-length = 100
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
+exclude = build,__init__.py
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..3d1709c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,42 @@
+FROM nvidia/cuda:10.1-cudnn7-devel
+# To use this Dockerfile:
+# 1. `nvidia-docker build -t detectron2:v0 .`
+# 2. `nvidia-docker run -it --name detectron2 detectron2:v0`
+# 
+# To enable GUI support (Linux):
+# 1. Grant the container temporary access to your x server (will be reverted at reboot of your host): 
+#    `xhost +local:`docker inspect --format='{{ .Config.Hostname }}' detectron2``
+# 2. `nvidia-docker run -it --name detectron2 --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" detectron2:v0`
+
+
+ENV DEBIAN_FRONTEND noninteractive
+RUN apt-get update && apt-get install -y \
+	libpng-dev libjpeg-dev python3-opencv ca-certificates \
+	python3-dev build-essential pkg-config git curl wget automake libtool && \
+  rm -rf /var/lib/apt/lists/*
+
+RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
+	python3 get-pip.py && \
+	rm get-pip.py
+
+# install dependencies
+# See https://pytorch.org/ for other options if you use a different version of CUDA
+RUN pip install torch torchvision cython \
+	'git+https://github.com/facebookresearch/fvcore'
+RUN pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
+
+# install detectron2
+RUN git clone https://github.com/facebookresearch/detectron2 /detectron2_repo
+ENV FORCE_CUDA="1"
+ENV TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing"
+RUN pip install -e /detectron2_repo
+
+WORKDIR /detectron2_repo
+
+# run it, for example:
+# wget http://images.cocodataset.org/val2017/000000439715.jpg -O input.jpg
+# python3 demo/demo.py  \
+	#--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+	#--input input.jpg --output outputs/ \
+	#--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
+
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
new file mode 100644
index 0000000..683e622
--- /dev/null
+++ b/GETTING_STARTED.md
@@ -0,0 +1,79 @@
+
+## Getting Started with Detectron2
+
+This document provides a brief introduction to the built-in command-line tools in detectron2.
+
+For a tutorial that involves actual coding with the API,
+see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+which covers how to run inference with an
+existing model, and how to train a builtin model on a custom dataset.
+
+For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html).
+
+
+### Inference with Pre-trained Models
+
+1. Pick a model and its config file from
+	[model zoo](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md),
+	for example, `mask_rcnn_R_50_FPN_3x.yaml`.
+2. Run the demo with
+```
+python demo/demo.py --config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
+  --input input1.jpg input2.jpg \
+	[--other-options] \
+  --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
+```
+The configs are made for training; therefore, we need to set `MODEL.WEIGHTS` to a model from the model zoo for evaluation.
+This command will run the inference and show visualizations in an OpenCV window.
+
+For details of the command line arguments, see `demo.py -h`. Some common ones are:
+* To run __on your webcam__, replace `--input files` with `--webcam`.
+* To run __on a video__, replace `--input files` with `--video-input video.mp4`.
+* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
+* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
+
+
+### Use Detectron2 in Command Line
+
+We provide a script, "tools/train_net.py", that is made to train
+all the configs provided in detectron2.
+You may want to use it as a reference to write your own training script for a new research project.
+
+To train a model with "train_net.py", first
+set up the corresponding datasets following
+[datasets/README.md](https://github.com/facebookresearch/detectron2/blob/master/datasets/README.md),
+then run:
+```
+python tools/train_net.py --num-gpus 8 \
+	--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+```
+
+The configs are made for 8-GPU training. To train on 1 GPU, change the batch size with:
+```
+python tools/train_net.py \
+	--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+
+For most models, CPU training is not supported.
+
+(Note that the 1-GPU command above applies the [linear learning rate scaling rule](https://arxiv.org/abs/1706.02677):
+the base learning rate is scaled down in proportion to the batch size.)
+
+To evaluate this model's performance, use
+```
+python tools/train_net.py \
+	--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
+	--eval-only MODEL.WEIGHTS /path/to/checkpoint_file
+```
+For more options, see `python tools/train_net.py -h`.
+
+### Use Detectron2 in Your Code
+
+See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+to learn how to use detectron2 APIs to:
+1. run inference with an existing model
+2. train a builtin model on a custom dataset
+
+See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects)
+for more ways to build your project on detectron2.
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000..b323ed0
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,69 @@
+## Installation
+
+Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has step-by-step instructions that install detectron2.
+The [Dockerfile](https://github.com/facebookresearch/detectron2/blob/master/Dockerfile)
+also installs detectron2 with a few simple commands.
+
+### Requirements
+- Linux or macOS
+- Python >= 3.6
+- PyTorch 1.3
+- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
+	You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
+- OpenCV, needed by demo and visualization
+- [fvcore](https://github.com/facebookresearch/fvcore/): `pip install git+https://github.com/facebookresearch/fvcore`
+- pycocotools: `pip install cython; pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI`
+- VS2019/CUDA10.1
+
+
+### Build Detectron2
+
+After having the above dependencies, run:
+```
+git clone https://github.com/facebookresearch/detectron2.git
+cd detectron2
+python setup.py build develop
+
+# or if you are on macOS
+# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop
+
+# or, as an alternative to `setup.py`, do
+# pip install [--editable] .
+```
+Note: you may need to rebuild detectron2 after reinstalling a different build of PyTorch.
+
+### Common Installation Issues
+
++ Undefined torch/aten symbols, or segmentation fault immediately when running the library.
+  This may be caused by the following reasons:
+
+	* detectron2 or torchvision is not compiled with the version of PyTorch you're running.
+
+		If you use a pre-built torchvision, uninstall torchvision & pytorch, and reinstall them
+		following [pytorch.org](http://pytorch.org).
+		If you manually build detectron2 or torchvision, remove the files you built (`build/`, `**/*.so`)
+		and rebuild them.
+
+	* detectron2 or torchvision is not compiled using gcc >= 4.9.
+
+	  You'll see a warning message during compilation in this case. Please remove the files you built,
+		and rebuild them.
+		Technically, you need the identical compiler that's used to build pytorch to guarantee
+		compatibility. But in practice, gcc >= 4.9 should work OK.
+
++ Undefined cuda symbols. The version of NVCC you use to build detectron2 or torchvision does
+	not match the version of cuda you are running with.
+	This happens sometimes when using anaconda.
+
++ "Not compiled with GPU support": make sure
+	```
+	python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
+	```
+	print valid outputs at the time you build detectron2.
+
++ "invalid device function" or "no kernel image is available for execution": two possibilities:
+  * You build detectron2 with one version of CUDA but run it with a different version.
+  * Detectron2 is not built with the correct compute capability for the GPU model.
+    The compute capability defaults to match the GPU found on the machine during building,
+    and can be controlled by `TORCH_CUDA_ARCH_LIST` environment variable during installation.
diff --git a/LICENSE b/LICENSE
index 0b8ae76..5a90478 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,201 @@
-Copyright (C) YEAR by AUTHOR EMAIL
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
 
-Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted.
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction,
+and distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by
+the copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all
+other entities that control, are controlled by, or are under common
+control with that entity. For the purposes of this definition,
+"control" means (i) the power, direct or indirect, to cause the
+direction or management of such entity, whether by contract or
+otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity
+exercising permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications,
+including but not limited to software source code, documentation
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical
+transformation or translation of a Source form, including but
+not limited to compiled object code, generated documentation,
+and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or
+Object form, made available under the License, as indicated by a
+copyright notice that is included in or attached to the work
+(an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object
+form, that is based on (or derived from) the Work and for which the
+editorial revisions, annotations, elaborations, or other modifications
+represent, as a whole, an original work of authorship. For the purposes
+of this License, Derivative Works shall not include works that remain
+separable from, or merely link (or bind by name) to the interfaces of,
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including
+the original version of the Work and any modifications or additions
+to that Work or Derivative Works thereof, that is intentionally
+submitted to Licensor for inclusion in the Work by the copyright owner
+or by an individual or Legal Entity authorized to submit on behalf of
+the copyright owner. For the purposes of this definition, "submitted"
+means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems,
+and issue tracking systems that are managed by, or on behalf of, the
+Licensor for the purpose of discussing and improving the Work, but
+excluding communication that is conspicuously marked or otherwise
+designated in writing by the copyright owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity
+on behalf of whom a Contribution has been received by Licensor and
+subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the
+Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+this License, each Contributor hereby grants to You a perpetual,
+worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+(except as stated in this section) patent license to make, have made,
+use, offer to sell, sell, import, and otherwise transfer the Work,
+where such license applies only to those patent claims licensable
+by such Contributor that are necessarily infringed by their
+Contribution(s) alone or by combination of their Contribution(s)
+with the Work to which such Contribution(s) was submitted. If You
+institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work
+or a Contribution incorporated within the Work constitutes direct
+or contributory patent infringement, then any patent licenses
+granted to You under this License for that Work shall terminate
+as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+Work or Derivative Works thereof in any medium, with or without
+modifications, and in Source or Object form, provided that You
+meet the following conditions:
+
+(a) You must give any other recipients of the Work or
+Derivative Works a copy of this License; and
+
+(b) You must cause any modified files to carry prominent notices
+stating that You changed the files; and
+
+(c) You must retain, in the Source form of any Derivative Works
+that You distribute, all copyright, patent, trademark, and
+attribution notices from the Source form of the Work,
+excluding those notices that do not pertain to any part of
+the Derivative Works; and
+
+(d) If the Work includes a "NOTICE" text file as part of its
+distribution, then any Derivative Works that You distribute must
+include a readable copy of the attribution notices contained
+within such NOTICE file, excluding those notices that do not
+pertain to any part of the Derivative Works, in at least one
+of the following places: within a NOTICE text file distributed
+as part of the Derivative Works; within the Source form or
+documentation, if provided along with the Derivative Works; or,
+within a display generated by the Derivative Works, if and
+wherever such third-party notices normally appear. The contents
+of the NOTICE file are for informational purposes only and
+do not modify the License. You may add Your own attribution
+notices within Derivative Works that You distribute, alongside
+or as an addendum to the NOTICE text from the Work, provided
+that such additional attribution notices cannot be construed
+as modifying the License.
+
+You may add Your own copyright statement to Your modifications and
+may provide additional or different license terms and conditions
+for use, reproduction, or distribution of Your modifications, or
+for any such Derivative Works as a whole, provided Your use,
+reproduction, and distribution of the Work otherwise complies with
+the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+any Contribution intentionally submitted for inclusion in the Work
+by You to the Licensor shall be under the terms and conditions of
+this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify
+the terms of any separate license agreement you may have executed
+with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+names, trademarks, service marks, or product names of the Licensor,
+except as required for reasonable and customary use in describing the
+origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+agreed to in writing, Licensor provides the Work (and each
+Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+implied, including, without limitation, any warranties or conditions
+of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE. You are solely responsible for determining the
+appropriateness of using or redistributing the Work and assume any
+risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+whether in tort (including negligence), contract, or otherwise,
+unless required by applicable law (such as deliberate and grossly
+negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special,
+incidental, or consequential damages of any character arising as a
+result of this License or out of the use or inability to use the
+Work (including but not limited to damages for loss of goodwill,
+work stoppage, computer failure or malfunction, or any and all
+other commercial damages or losses), even if such Contributor
+has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+the Work or Derivative Works thereof, You may choose to offer,
+and charge a fee for, acceptance of support, warranty, indemnity,
+or other liability obligations and/or rights consistent with this
+License. However, in accepting such obligations, You may act only
+on Your own behalf and on Your sole responsibility, not on behalf
+of any other Contributor, and only if You agree to indemnify,
+defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason
+of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following
+boilerplate notice, with the fields enclosed by brackets "[]"
+replaced with your own identifying information. (Don't include
+the brackets!)  The text should be enclosed in the appropriate
+comment syntax for the file format. We also recommend that a
+file or class name and description of purpose be included on the
+same "printed page" as the copyright notice for easier
+identification within third-party archives.
+
+Copyright 2019, Facebook, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md
new file mode 100644
index 0000000..f7a6030
--- /dev/null
+++ b/MODEL_ZOO.md
@@ -0,0 +1,877 @@
+# Detectron2 Model Zoo and Baselines
+
+## Introduction
+
+This file documents a large collection of baselines trained
+with detectron2 in Sep-Oct, 2019.
+The corresponding configurations for all models can be found under the `configs/` directory.
+Unless otherwise noted, the following settings are used for all runs:
+
+#### Common Settings
+* All models were trained on [Big Basin](https://engineering.fb.com/data-center-engineering/introducing-big-basin-our-next-generation-ai-hardware/)
+  servers with 8 NVIDIA V100 GPUs, with data-parallel sync SGD and a total minibatch size of 16 images.
+* All models were trained with CUDA 9.2, cuDNN 7.4.2 or 7.6.3 (the difference in speed is found to be negligible).
+* Training curves and other statistics can be found in `metrics` for each model.
+* The default settings are __not directly comparable__ with Detectron.
+  For example, our default training data augmentation uses scale jittering in addition to horizontal flipping.
+
+  For configs that are comparable to Detectron's settings, see
+  [Detectron1-Comparisons](configs/Detectron1-Comparisons/) for accuracy comparison,
+  and [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html)
+  for speed comparison.
+* Inference speed is measured by `tools/train_net.py --eval-only`, or [inference_on_dataset()](https://detectron2.readthedocs.io/modules/evaluation.html#detectron2.evaluation.inference_on_dataset),
+  with batch size 1 in detectron2 directly.
+  The actual deployment should in general be faster than the given inference
+  speed due to more optimizations.
+* Training speed is averaged across the entire training.
+	We keep updating the speed with the latest version of detectron2/pytorch/etc.,
+	so the numbers might differ from those in the `metrics` file.
+* All COCO models were trained on `train2017` and evaluated on `val2017`.
+* For Faster/Mask R-CNN, we provide baselines based on __3 different backbone combinations__:
+  * __FPN__: Use a ResNet+FPN backbone with standard conv and FC heads for mask and box prediction,
+    respectively. It obtains the best
+    speed/accuracy tradeoff, but the other two are still useful for research.
+  * __C4__: Use a ResNet conv4 backbone with conv5 head. The original baseline in the Faster R-CNN paper.
+  * __DC5__ (Dilated-C5): Use a ResNet conv5 backbone with dilations in conv5, and standard conv and FC heads
+    for mask and box prediction, respectively.
+    This is used by the Deformable ConvNet paper.
+* Most models are trained with the 3x schedule (~37 COCO epochs).
+  Although 1x models are heavily under-trained, we provide some ResNet-50 models with the 1x (~12 COCO epochs)
+  training schedule for comparison when doing quick research iteration.
+* The *model id* column is provided for ease of reference.
+  To let you check the integrity of a downloaded file, the file name of every model on this page contains its md5 prefix.
+	Each model also comes with a metrics file with all the training statistics and evaluation curves.
+
+#### ImageNet Pretrained Models
+
+We provide backbone models pretrained on ImageNet-1k dataset.
+These models are __different__ from those provided in Detectron: we do not fuse BatchNorm into an affine layer.
+* [R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl): converted copy of MSRA's original ResNet-50 model
+* [R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl): converted copy of MSRA's original ResNet-101 model
+* [X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl): ResNeXt-101-32x8d model trained with Caffe2 at FB
+
+Pretrained models in Detectron's format can still be used. For example:
+* [X-152-32x8d-IN5k.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl):
+  ResNeXt-152-32x8d model trained on ImageNet-5k with Caffe2 at FB (see ResNeXt paper for details on ImageNet-5k).
+* [R-50-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl):
+  ResNet-50 with Group Normalization.
+* [R-101-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl):
+  ResNet-101 with Group Normalization.
+
+#### License
+
+All models available for download through this document are licensed under the
+[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/).
+
+### COCO Object Detection Baselines
+
+#### Faster R-CNN:
+<!--
+(fb only) To update the table in vim:
+1. Remove the old table: d{
+2. Copy the below command to the place of the table
+3. :.!bash
+
+./gen_html_table.py --config 'COCO-Detection/faster*50*'{1x,3x}'*' 'COCO-Detection/faster*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: faster_rcnn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.551</td>
+<td align="center">0.110</td>
+<td align="center">4.8</td>
+<td align="center">35.7</td>
+<td align="center">137257644</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_DC5_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
+<td align="center">1x</td>
+<td align="center">0.380</td>
+<td align="center">0.068</td>
+<td align="center">5.0</td>
+<td align="center">37.3</td>
+<td align="center">137847829</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/model_final_51d356.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.210</td>
+<td align="center">0.055</td>
+<td align="center">3.0</td>
+<td align="center">37.9</td>
+<td align="center">137257794</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/model_final_b275ba.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.543</td>
+<td align="center">0.110</td>
+<td align="center">4.8</td>
+<td align="center">38.4</td>
+<td align="center">137849393</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.378</td>
+<td align="center">0.073</td>
+<td align="center">5.0</td>
+<td align="center">39.0</td>
+<td align="center">137849425</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/model_final_68d202.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.209</td>
+<td align="center">0.047</td>
+<td align="center">3.0</td>
+<td align="center">40.2</td>
+<td align="center">137849458</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.619</td>
+<td align="center">0.149</td>
+<td align="center">5.9</td>
+<td align="center">41.1</td>
+<td align="center">138204752</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.452</td>
+<td align="center">0.082</td>
+<td align="center">6.1</td>
+<td align="center">40.6</td>
+<td align="center">138204841</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/model_final_3e0943.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.286</td>
+<td align="center">0.063</td>
+<td align="center">4.1</td>
+<td align="center">42.0</td>
+<td align="center">137851257</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.638</td>
+<td align="center">0.120</td>
+<td align="center">6.7</td>
+<td align="center">43.0</td>
+<td align="center">139173657</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+#### RetinaNet:
+<!--
+./gen_html_table.py --config 'COCO-Detection/retina*50*' 'COCO-Detection/retina*101*' --name R50 R50 R101 --fields lr_sched train_speed inference_speed mem box_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: retinanet_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml">R50</a></td>
+<td align="center">1x</td>
+<td align="center">0.200</td>
+<td align="center">0.062</td>
+<td align="center">3.9</td>
+<td align="center">36.5</td>
+<td align="center">137593951</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/model_final_b796dc.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: retinanet_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml">R50</a></td>
+<td align="center">3x</td>
+<td align="center">0.201</td>
+<td align="center">0.063</td>
+<td align="center">3.9</td>
+<td align="center">37.9</td>
+<td align="center">137849486</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: retinanet_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml">R101</a></td>
+<td align="center">3x</td>
+<td align="center">0.280</td>
+<td align="center">0.080</td>
+<td align="center">5.1</td>
+<td align="center">39.9</td>
+<td align="center">138363263</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/model_final_59f53c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+#### RPN & Fast R-CNN:
+<!--
+./gen_html_table.py --config 'COCO-Detection/rpn*' 'COCO-Detection/fast_rcnn*' --name "RPN R50-C4" "RPN R50-FPN" "Fast R-CNN R50-FPN" --fields lr_sched train_speed inference_speed mem box_AP prop_AR
+-->
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">prop.<br/>AR</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: rpn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_C4_1x.yaml">RPN R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.130</td>
+<td align="center">0.051</td>
+<td align="center">1.5</td>
+<td align="center"></td>
+<td align="center">51.6</td>
+<td align="center">137258005</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/model_final_450694.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: rpn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_FPN_1x.yaml">RPN R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.186</td>
+<td align="center">0.045</td>
+<td align="center">2.7</td>
+<td align="center"></td>
+<td align="center">58.0</td>
+<td align="center">137258492</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: fast_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml">Fast R-CNN R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.140</td>
+<td align="center">0.035</td>
+<td align="center">2.6</td>
+<td align="center">37.8</td>
+<td align="center"></td>
+<td align="center">137635226</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Instance Segmentation Baselines with Mask R-CNN
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask*50*'{1x,3x}'*' 'COCO-InstanceSegmentation/mask*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_C4_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
+<td align="center">1x</td>
+<td align="center">0.584</td>
+<td align="center">0.117</td>
+<td align="center">5.2</td>
+<td align="center">36.8</td>
+<td align="center">32.2</td>
+<td align="center">137259246</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_DC5_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
+<td align="center">1x</td>
+<td align="center">0.471</td>
+<td align="center">0.074</td>
+<td align="center">6.5</td>
+<td align="center">38.3</td>
+<td align="center">34.2</td>
+<td align="center">137260150</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/model_final_4f86c3.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.261</td>
+<td align="center">0.053</td>
+<td align="center">3.4</td>
+<td align="center">38.6</td>
+<td align="center">35.2</td>
+<td align="center">137260431</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.575</td>
+<td align="center">0.118</td>
+<td align="center">5.2</td>
+<td align="center">39.8</td>
+<td align="center">34.4</td>
+<td align="center">137849525</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.470</td>
+<td align="center">0.075</td>
+<td align="center">6.5</td>
+<td align="center">40.0</td>
+<td align="center">35.9</td>
+<td align="center">137849551</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.055</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_C4_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
+<td align="center">3x</td>
+<td align="center">0.652</td>
+<td align="center">0.155</td>
+<td align="center">6.3</td>
+<td align="center">42.6</td>
+<td align="center">36.7</td>
+<td align="center">138363239</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/model_final_a2914c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_DC5_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
+<td align="center">3x</td>
+<td align="center">0.545</td>
+<td align="center">0.155</td>
+<td align="center">7.6</td>
+<td align="center">41.9</td>
+<td align="center">37.3</td>
+<td align="center">138363294</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/model_final_0464b7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.340</td>
+<td align="center">0.070</td>
+<td align="center">4.6</td>
+<td align="center">42.9</td>
+<td align="center">38.6</td>
+<td align="center">138205316</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/model_final_a3ec72.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.690</td>
+<td align="center">0.129</td>
+<td align="center">7.2</td>
+<td align="center">44.3</td>
+<td align="center">39.5</td>
+<td align="center">139653917</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/model_final_2d9806.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Person Keypoint Detection Baselines with Keypoint R-CNN
+<!--
+./gen_html_table.py --config 'COCO-Keypoints/*50*' 'COCO-Keypoints/*101*'  --name R50-FPN R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP keypoint_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">kp.<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.315</td>
+<td align="center">0.083</td>
+<td align="center">5.0</td>
+<td align="center">53.6</td>
+<td align="center">64.0</td>
+<td align="center">137261548</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/model_final_04e291.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.316</td>
+<td align="center">0.076</td>
+<td align="center">5.0</td>
+<td align="center">55.4</td>
+<td align="center">65.5</td>
+<td align="center">137849621</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_101_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.390</td>
+<td align="center">0.090</td>
+<td align="center">6.1</td>
+<td align="center">56.4</td>
+<td align="center">66.1</td>
+<td align="center">138363331</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/model_final_997cc7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_X_101_32x8d_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.738</td>
+<td align="center">0.142</td>
+<td align="center">8.7</td>
+<td align="center">57.3</td>
+<td align="center">66.0</td>
+<td align="center">139686956</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/model_final_5ad38f.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+### COCO Panoptic Segmentation Baselines with Panoptic FPN
+<!--
+./gen_html_table.py --config 'COCO-PanopticSegmentation/*50*' 'COCO-PanopticSegmentation/*101*'  --name R50-FPN R50-FPN R101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP PQ
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">PQ</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: panoptic_fpn_R_50_1x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.304</td>
+<td align="center">0.063</td>
+<td align="center">4.8</td>
+<td align="center">37.6</td>
+<td align="center">34.7</td>
+<td align="center">39.4</td>
+<td align="center">139514544</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/model_final_dbfeb4.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: panoptic_fpn_R_50_3x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml">R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.302</td>
+<td align="center">0.063</td>
+<td align="center">4.8</td>
+<td align="center">40.0</td>
+<td align="center">36.5</td>
+<td align="center">41.5</td>
+<td align="center">139514569</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: panoptic_fpn_R_101_3x -->
+ <tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml">R101-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.392</td>
+<td align="center">0.078</td>
+<td align="center">6.0</td>
+<td align="center">42.4</td>
+<td align="center">38.5</td>
+<td align="center">43.0</td>
+<td align="center">139514519</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+### LVIS Instance Segmentation Baselines with Mask R-CNN
+
+Mask R-CNN baselines on the [LVIS dataset](https://lvisdataset.org), v0.5.
+These baselines are described in Table 3(c) of the [LVIS paper](https://arxiv.org/abs/1908.03195).
+
+NOTE: the 1x schedule here has the same number of __iterations__ as the COCO 1x baselines,
+which is roughly 24 epochs of LVIS v0.5 data.
+The final results of these configs have large variance across different runs.
+
+<!--
+./gen_html_table.py --config 'LVIS-InstanceSegmentation/mask*50*' 'LVIS-InstanceSegmentation/mask*101*' --name R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.292</td>
+<td align="center">0.127</td>
+<td align="center">7.1</td>
+<td align="center">23.6</td>
+<td align="center">24.4</td>
+<td align="center">144219072</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/model_final_571f7c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_101_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml">R101-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.371</td>
+<td align="center">0.124</td>
+<td align="center">7.8</td>
+<td align="center">25.6</td>
+<td align="center">25.9</td>
+<td align="center">144219035</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/model_final_824ab5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_X_101_32x8d_FPN_1x -->
+ <tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml">X101-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.712</td>
+<td align="center">0.166</td>
+<td align="center">10.2</td>
+<td align="center">26.7</td>
+<td align="center">27.1</td>
+<td align="center">144219108</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/model_final_5e3439.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+
+### Cityscapes & Pascal VOC Baselines
+
+Simple baselines for:
+* Mask R-CNN on Cityscapes instance segmentation (initialized from COCO pre-training, then trained on Cityscapes fine annotations only)
+* Faster R-CNN on PASCAL VOC object detection (trained on VOC 2007 train+val + VOC 2012 train+val, tested on VOC 2007 using 11-point interpolated AP)
+
+<!--
+./gen_html_table.py --config 'Cityscapes/*' 'PascalVOC-Detection/*' --name "R50-FPN, Cityscapes" "R50-C4, VOC" --fields train_speed inference_speed mem box_AP box_AP50 mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">box<br/>AP50</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN -->
+ <tr><td align="left"><a href="configs/Cityscapes/mask_rcnn_R_50_FPN.yaml">R50-FPN, Cityscapes</a></td>
+<td align="center">0.240</td>
+<td align="center">0.092</td>
+<td align="center">4.4</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">36.5</td>
+<td align="center">142423278</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/model_final_af9cf5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: faster_rcnn_R_50_C4 -->
+ <tr><td align="left"><a href="configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml">R50-C4, VOC</a></td>
+<td align="center">0.537</td>
+<td align="center">0.086</td>
+<td align="center">4.8</td>
+<td align="center">51.9</td>
+<td align="center">80.3</td>
+<td align="center"></td>
+<td align="center">142202221</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/model_final_b1acc2.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+
+### Other Settings
+
+Ablations for Deformable Conv and Cascade R-CNN:
+
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml' 'Misc/*R_50_FPN_1x_dconv*' 'Misc/cascade*1x.yaml' 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/*R_50_FPN_3x_dconv*' 'Misc/cascade*3x.yaml' --name "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN" "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN"  --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">Baseline R50-FPN</a></td>
+<td align="center">1x</td>
+<td align="center">0.261</td>
+<td align="center">0.053</td>
+<td align="center">3.4</td>
+<td align="center">38.6</td>
+<td align="center">35.2</td>
+<td align="center">137260431</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_1x_dconv_c3-c5 -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml">Deformable Conv</a></td>
+<td align="center">1x</td>
+<td align="center">0.342</td>
+<td align="center">0.061</td>
+<td align="center">3.5</td>
+<td align="center">41.5</td>
+<td align="center">37.5</td>
+<td align="center">138602867</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/model_final_65c703.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml">Cascade R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.317</td>
+<td align="center">0.066</td>
+<td align="center">4.0</td>
+<td align="center">42.1</td>
+<td align="center">36.4</td>
+<td align="center">138602847</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/model_final_e9d89b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.055</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_dconv_c3-c5 -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml">Deformable Conv</a></td>
+<td align="center">3x</td>
+<td align="center">0.349</td>
+<td align="center">0.066</td>
+<td align="center">3.5</td>
+<td align="center">42.7</td>
+<td align="center">38.5</td>
+<td align="center">144998336</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/model_final_821d0b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml">Cascade R-CNN</a></td>
+<td align="center">3x</td>
+<td align="center">0.328</td>
+<td align="center">0.075</td>
+<td align="center">4.0</td>
+<td align="center">44.3</td>
+<td align="center">38.5</td>
+<td align="center">144998488</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/model_final_480dd8.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+Ablations for normalization methods:
+(Note: The baseline uses the `2fc` head while the others use the `4conv1fc` head. According to the
+[GroupNorm paper](https://arxiv.org/abs/1803.08494), the change in head does not improve the baseline by much.)
+<!--
+./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/mask*50_FPN_3x_syncbn.yaml' 'Misc/mask*50_FPN_3x_gn.yaml' 'Misc/scratch*' --name "Baseline R50-FPN" "SyncBN" "GN" "GN (scratch)" --fields lr_sched train_speed inference_speed mem box_AP mask_AP
+   -->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: mask_rcnn_R_50_FPN_3x -->
+ <tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
+<td align="center">3x</td>
+<td align="center">0.261</td>
+<td align="center">0.055</td>
+<td align="center">3.4</td>
+<td align="center">41.0</td>
+<td align="center">37.2</td>
+<td align="center">137849600</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_syncbn -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml">SyncBN</a></td>
+<td align="center">3x</td>
+<td align="center">0.464</td>
+<td align="center">0.063</td>
+<td align="center">5.6</td>
+<td align="center">42.0</td>
+<td align="center">37.8</td>
+<td align="center">143915318</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/143915318/model_final_220cfb.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/143915318/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_3x_gn -->
+ <tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml">GN</a></td>
+<td align="center">3x</td>
+<td align="center">0.356</td>
+<td align="center">0.077</td>
+<td align="center">7.3</td>
+<td align="center">42.6</td>
+<td align="center">38.6</td>
+<td align="center">138602888</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/model_final_dc5d9e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: scratch_mask_rcnn_R_50_FPN_3x_gn -->
+ <tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml">GN (scratch)</a></td>
+<td align="center">3x</td>
+<td align="center">0.400</td>
+<td align="center">0.077</td>
+<td align="center">9.8</td>
+<td align="center">39.9</td>
+<td align="center">36.6</td>
+<td align="center">138602908</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/model_final_01ca85.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+
+
+A few very large models trained for a long time, for demo purposes:
+
+<!--
+./gen_html_table.py --config 'Misc/panoptic_*dconv*' 'Misc/cascade_*152*' --name "Panoptic FPN R101" "Mask R-CNN X152" --fields inference_speed mem box_AP mask_AP PQ
+# manually add TTA results
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">PQ</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: panoptic_fpn_R_101_dconv_cascade_gn_3x -->
+ <tr><td align="left"><a href="configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml">Panoptic FPN R101</a></td>
+<td align="center">0.123</td>
+<td align="center">11.4</td>
+<td align="center">47.4</td>
+<td align="center">41.3</td>
+<td align="center">46.1</td>
+<td align="center">139797668</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/model_final_be35db.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
+ <tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml">Mask R-CNN X152</a></td>
+<td align="center">0.281</td>
+<td align="center">15.1</td>
+<td align="center">50.2</td>
+<td align="center">44.0</td>
+<td align="center"></td>
+<td align="center">18131413</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/model_0039999_e76410.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: TTA cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
+ <tr><td align="left">above + test-time aug.</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">51.9</td>
+<td align="center">45.9</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center"></td>
+</tr>
+</tbody></table>
diff --git a/README.md b/README.md
index 9077f2e..e79bc03 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,104 @@
-# detectron
+### Requirements
+- Python >= 3.6 (Conda)
+- PyTorch 1.3
+- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
+	You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
+- OpenCV, needed by demo and visualization
+- [fvcore](https://github.com/facebookresearch/fvcore/): `pip install git+https://github.com/facebookresearch/fvcore`
+- pycocotools: `pip install cython; pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI`
+- VS2019 (older versions not tested) / CUDA 10.1 (older versions not tested)
 
+### Several files must be changed manually
+```
+file1: 
+  {your env path}\Lib\site-packages\torch\include\torch\csrc\jit\argument_spec.h
+  example:
+  {C:\Miniconda3\envs\py36}\Lib\site-packages\torch\include\torch\csrc\jit\argument_spec.h(190)
+    static constexpr size_t DEPTH_LIMIT = 128;
+      change to -->
+    static const size_t DEPTH_LIMIT = 128;
+file2: 
+  {your env path}\Lib\site-packages\torch\include\pybind11\cast.h
+  example:
+  {C:\Miniconda3\envs\py36}\Lib\site-packages\torch\include\pybind11\cast.h(1449)
+    explicit operator type&() { return *(this->value); }
+      change to -->
+    explicit operator type&() { return *((type*)this->value); }
+```
+
+### Build detectron2
+
+After having the above dependencies, run:
+```
+conda activate {your env}
+
+"C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+
+git clone https://github.com/conansherry/detectron2
+
+cd detectron2
+
+python setup.py build develop
+```
+Note: you may need to rebuild detectron2 after reinstalling a different build of PyTorch.
+
+<div align="center">
+  <img src="docs/windows_build.png"/>
+</div>
+
+<img src=".github/Detectron2-Logo-Horz.svg" width="300" >
+
+Detectron2 is Facebook AI Research's next generation software system
+that implements state-of-the-art object detection algorithms.
+It is a ground-up rewrite of the previous version,
+[Detectron](https://github.com/facebookresearch/Detectron/),
+and it originates from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/).
+
+<div align="center">
+  <img src="https://user-images.githubusercontent.com/1381301/66535560-d3422200-eace-11e9-9123-5535d469db19.png"/>
+</div>
+
+### What's New
+* It is powered by the [PyTorch](https://pytorch.org) deep learning framework.
+* Includes more features such as panoptic segmentation, densepose, Cascade R-CNN, rotated bounding boxes, etc.
+* Can be used as a library to support [different projects](projects/) on top of it.
+  We'll open source more research projects in this way.
+* It [trains much faster](https://detectron2.readthedocs.io/notes/benchmarks.html).
+
+See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-/)
+to see more demos and learn about detectron2.
+
+## Installation
+
+See [INSTALL.md](INSTALL.md).
+
+## Quick Start
+
+See [GETTING_STARTED.md](GETTING_STARTED.md),
+or the [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5).
+
+Learn more at our [documentation](https://detectron2.readthedocs.org).
+And see [projects/](projects/) for some projects that are built on top of detectron2.
+
+## Model Zoo and Baselines
+
+We provide a large set of baseline results and trained models available for download in the [Detectron2 Model Zoo](MODEL_ZOO.md).
+
+
+## License
+
+Detectron2 is released under the [Apache 2.0 license](LICENSE).
+
+## Citing Detectron2
+
+If you use Detectron2 in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry.
+
+```BibTeX
+@misc{wu2019detectron2,
+  author =       {Yuxin Wu and Alexander Kirillov and Francisco Massa and
+                  Wan-Yen Lo and Ross Girshick},
+  title =        {Detectron2},
+  howpublished = {\url{https://github.com/facebookresearch/detectron2}},
+  year =         {2019}
+}
+```
diff --git a/configs/Base-RCNN-C4.yaml b/configs/Base-RCNN-C4.yaml
new file mode 100644
index 0000000..f44bc6a
--- /dev/null
+++ b/configs/Base-RCNN-C4.yaml
@@ -0,0 +1,18 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RPN:
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "Res5ROIHeads"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+
diff --git a/configs/Base-RCNN-DilatedC5.yaml b/configs/Base-RCNN-DilatedC5.yaml
new file mode 100644
index 0000000..458a778
--- /dev/null
+++ b/configs/Base-RCNN-DilatedC5.yaml
@@ -0,0 +1,30 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  RESNETS:
+    OUT_FEATURES: ["res5"]
+    RES5_DILATION: 2
+  RPN:
+    IN_FEATURES: ["res5"]
+    PRE_NMS_TOPK_TEST: 6000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["res5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/Base-RCNN-FPN.yaml b/configs/Base-RCNN-FPN.yaml
new file mode 100644
index 0000000..f61493d
--- /dev/null
+++ b/configs/Base-RCNN-FPN.yaml
@@ -0,0 +1,41 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+  ROI_HEADS:
+    NAME: "StandardROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+  ROI_MASK_HEAD:
+    NAME: "MaskRCNNConvUpsampleHead"
+    NUM_CONV: 4
+    POOLER_RESOLUTION: 14
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/Base-RetinaNet.yaml b/configs/Base-RetinaNet.yaml
new file mode 100644
index 0000000..3f47c64
--- /dev/null
+++ b/configs/Base-RetinaNet.yaml
@@ -0,0 +1,23 @@
+MODEL:
+  META_ARCHITECTURE: "RetinaNet"
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
+  FPN:
+    IN_FEATURES: ["res3", "res4", "res5"]
+  RETINANET:
+    IOU_THRESHOLDS: [0.4, 0.5]
+    IOU_LABELS: [0, -1, 1]
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01  # Note that RetinaNet uses a different default learning rate
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..773ac10
--- /dev/null
+++ b/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,17 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  LOAD_PROPOSALS: True
+  RESNETS:
+    DEPTH: 50
+  PROPOSAL_GENERATOR:
+    NAME: "PrecomputedProposals"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
+  TEST: ("coco_2017_val",)
+  PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+DATALOADER:
+  # proposals are part of the dataset_dicts, and take a lot of RAM
+  NUM_WORKERS: 2
diff --git a/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
new file mode 100644
index 0000000..db142cd
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml
new file mode 100644
index 0000000..bceb6b3
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..57a098f
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
new file mode 100644
index 0000000..f961301
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml
new file mode 100644
index 0000000..bc51bce
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml
new file mode 100644
index 0000000..0fe96f5
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml
new file mode 100644
index 0000000..33fadeb
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..3262019
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000..4139518
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000..9c9b5ab
--- /dev/null
+++ b/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: False
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml b/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..4abb1b9
--- /dev/null
+++ b/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml b/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..4a24ce3
--- /dev/null
+++ b/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml b/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml
new file mode 100644
index 0000000..3b5412d
--- /dev/null
+++ b/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RetinaNet.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Detection/rpn_R_50_C4_1x.yaml b/configs/COCO-Detection/rpn_R_50_C4_1x.yaml
new file mode 100644
index 0000000..e048211
--- /dev/null
+++ b/configs/COCO-Detection/rpn_R_50_C4_1x.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  META_ARCHITECTURE: "ProposalNetwork"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  RPN:
+    PRE_NMS_TOPK_TEST: 12000
+    POST_NMS_TOPK_TEST: 2000
diff --git a/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml b/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..dc9c952
--- /dev/null
+++ b/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "ProposalNetwork"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  RPN:
+    POST_NMS_TOPK_TEST: 2000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
new file mode 100644
index 0000000..1a94cc4
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml
new file mode 100644
index 0000000..67b70cf
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..1935a30
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
new file mode 100644
index 0000000..a9aeb4e
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml
new file mode 100644
index 0000000..38ed867
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml
new file mode 100644
index 0000000..b13eefa
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml
new file mode 100644
index 0000000..d401016
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-DilatedC5.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..d50fb86
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000..be7d06b
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000..d14c63f
--- /dev/null
+++ b/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml b/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml
new file mode 100644
index 0000000..4e03944
--- /dev/null
+++ b/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  KEYPOINT_ON: True
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 0.5  # Keypoint AP degrades (though box AP improves) when using plain L1 loss
+  RPN:
+    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
+    # 1000 proposals per-image is found to hurt box AP.
+    # Therefore we increase it to 1500 per-image.
+    POST_NMS_TOPK_TRAIN: 1500
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_train",)
+  TEST: ("keypoints_coco_2017_val",)
diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml
new file mode 100644
index 0000000..9309535
--- /dev/null
+++ b/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..7bf85cf
--- /dev/null
+++ b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000..a07f243
--- /dev/null
+++ b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
new file mode 100644
index 0000000..d4bfa20
--- /dev/null
+++ b/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml b/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml
new file mode 100644
index 0000000..755c120
--- /dev/null
+++ b/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  MASK_ON: True
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_separated",)
+  TEST: ("coco_2017_val_panoptic_separated",)
diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml
new file mode 100644
index 0000000..0e01f6f
--- /dev/null
+++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml
new file mode 100644
index 0000000..6afa2c1
--- /dev/null
+++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml
new file mode 100644
index 0000000..b956b3f
--- /dev/null
+++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml
@@ -0,0 +1,8 @@
+_BASE_: "Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml b/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml
new file mode 100644
index 0000000..d6e7014
--- /dev/null
+++ b/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml
@@ -0,0 +1,25 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  # WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  # For better, more stable performance initialize from COCO
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
+  MASK_ON: True
+  ROI_HEADS:
+    NUM_CLASSES: 8
+# This is the setting used in Mask R-CNN paper, Appendix A
+INPUT:
+  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 1024
+  MAX_SIZE_TRAIN: 2048
+  MAX_SIZE_TEST: 2048
+DATASETS:
+  TRAIN: ("cityscapes_fine_instance_seg_train",)
+  TEST: ("cityscapes_fine_instance_seg_val",)
+SOLVER:
+  BASE_LR: 0.01
+  STEPS: (18000,)
+  MAX_ITER: 24000
+  IMS_PER_BATCH: 8
+TEST:
+  EVAL_PERIOD: 8000
diff --git a/configs/Detectron1-Comparisons/README.md b/configs/Detectron1-Comparisons/README.md
new file mode 100644
index 0000000..947c0a9
--- /dev/null
+++ b/configs/Detectron1-Comparisons/README.md
@@ -0,0 +1,81 @@
+
+Detectron2's default settings and a few implementation details are different from Detectron.
+
+The differences in implementation details are shared in
+[Compatibility with Other Libraries](../../docs/notes/compatibility.md).
+
+The differences in default config include:
+* Use scale augmentation during training.
+* Use L1 loss instead of smooth L1 loss.
+* Use `POOLER_SAMPLING_RATIO=0` instead of 2.
+* Use `ROIAlignV2`.
+
+In this directory, we provide a few configs that mimic Detectron's behavior as closely as possible.
+This provides a fair comparison of accuracy and speed against Detectron.
+
+<!--
+./gen_html_table.py --config 'Detectron1-Comparisons/*.yaml' --name "Faster R-CNN" "Keypoint R-CNN" "Mask R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP keypoint_AP
+-->
+
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">mask<br/>AP</th>
+<th valign="bottom">kp.<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY -->
+<!-- ROW: faster_rcnn_R_50_FPN_noaug_1x -->
+ <tr><td align="left"><a href="faster_rcnn_R_50_FPN_noaug_1x.yaml">Faster R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.219</td>
+<td align="center">0.048</td>
+<td align="center">3.1</td>
+<td align="center">36.9</td>
+<td align="center"></td>
+<td align="center"></td>
+<td align="center">137781054</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/model_final_7ab50c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
+ <tr><td align="left"><a href="keypoint_rcnn_R_50_FPN_1x.yaml">Keypoint R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.313</td>
+<td align="center">0.082</td>
+<td align="center">5.0</td>
+<td align="center">53.1</td>
+<td align="center"></td>
+<td align="center">64.2</td>
+<td align="center">137781195</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/model_final_cce136.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: mask_rcnn_R_50_FPN_noaug_1x -->
+ <tr><td align="left"><a href="mask_rcnn_R_50_FPN_noaug_1x.yaml">Mask R-CNN</a></td>
+<td align="center">1x</td>
+<td align="center">0.273</td>
+<td align="center">0.052</td>
+<td align="center">3.4</td>
+<td align="center">37.8</td>
+<td align="center">34.9</td>
+<td align="center"></td>
+<td align="center">137781281</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/model_final_62ca52.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+## Comparisons:
+
+* Faster R-CNN: Detectron's AP is 36.7, similar to ours.
+* Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a Detectron
+  [bug](https://github.com/facebookresearch/Detectron/issues/459) leads to a drop in box AP, which can be
+  compensated for by some parameter tuning.
+* Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to a more correct implementation.
+
+For speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html).
diff --git a/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml b/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml
new file mode 100644
index 0000000..6ce77f1
--- /dev/null
+++ b/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml
@@ -0,0 +1,17 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml b/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..aacf868
--- /dev/null
+++ b/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,27 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2
+    # 1000 proposals per-image is found to hurt box AP.
+    # Therefore we increase it to 1500 per-image.
+    POST_NMS_TOPK_TRAIN: 1500
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_train",)
+  TEST: ("keypoints_coco_2017_val",)
diff --git a/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml b/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml
new file mode 100644
index 0000000..4ea86a8
--- /dev/null
+++ b/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  # Detectron1 uses smooth L1 loss with some magic beta values.
+  # The defaults are changed to L1 loss in Detectron2.
+  RPN:
+    SMOOTH_L1_BETA: 0.1111
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_MASK_HEAD:
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml
new file mode 100644
index 0000000..f0c3a1b
--- /dev/null
+++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 101
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..64b4caa
--- /dev/null
+++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml
new file mode 100644
index 0000000..c8b822c
--- /dev/null
+++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml
@@ -0,0 +1,23 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
+  PIXEL_STD: [57.375, 57.120, 58.395]
+  MASK_ON: True
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 101
+  ROI_HEADS:
+    NUM_CLASSES: 1230
+    SCORE_THRESH_TEST: 0.0001
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+DATASETS:
+  TRAIN: ("lvis_v0.5_train",)
+  TEST: ("lvis_v0.5_val",)
+TEST:
+  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
+DATALOADER:
+  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
+  REPEAT_THRESHOLD: 0.001
diff --git a/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml b/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..abb33b6
--- /dev/null
+++ b/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml
@@ -0,0 +1,12 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
diff --git a/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml b/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml
new file mode 100644
index 0000000..e2201ad
--- /dev/null
+++ b/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml b/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
new file mode 100644
index 0000000..fc117f6
--- /dev/null
+++ b/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml
@@ -0,0 +1,36 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  MASK_ON: True
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
+  RESNETS:
+    STRIDE_IN_1X1: False  # this is a C2 model
+    NUM_GROUPS: 32
+    WIDTH_PER_GROUP: 8
+    DEPTH: 152
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+  ROI_HEADS:
+    NAME: "CascadeROIHeads"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "GN"
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    NUM_CONV: 8
+    NORM: "GN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  IMS_PER_BATCH: 128
+  STEPS: (35000, 45000)
+  MAX_ITER: 50000
+  BASE_LR: 0.16
+INPUT:
+  MIN_SIZE_TRAIN: (640, 864)
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MAX_SIZE_TRAIN: 1440
+  CROP:
+    ENABLED: True
+TEST:
+  EVAL_PERIOD: 2500
diff --git a/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml b/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
new file mode 100644
index 0000000..4c3b767
--- /dev/null
+++ b/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    CLS_AGNOSTIC_MASK: True
diff --git a/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml b/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
new file mode 100644
index 0000000..04ff988
--- /dev/null
+++ b/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
+    DEFORM_MODULATED: False
diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
new file mode 100644
index 0000000..68c0ca5
--- /dev/null
+++ b/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
+    DEFORM_MODULATED: False
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml
new file mode 100644
index 0000000..74d274e
--- /dev/null
+++ b/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml
@@ -0,0 +1,21 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    NORM: "GN"
+    STRIDE_IN_1X1: False
+  FPN:
+    NORM: "GN"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "GN"
+  ROI_MASK_HEAD:
+    NORM: "GN"
+SOLVER:
+  # 3x schedule
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml
new file mode 100644
index 0000000..b4d7ac8
--- /dev/null
+++ b/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml
@@ -0,0 +1,24 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+    NORM: "SyncBN"
+    STRIDE_IN_1X1: False
+  FPN:
+    NORM: "SyncBN"
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_CONV: 4
+    NUM_FC: 1
+    NORM: "SyncBN"
+  ROI_MASK_HEAD:
+    NORM: "SyncBN"
+SOLVER:
+  # 3x schedule
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
+TEST:
+  PRECISE_BN:
+    ENABLED: True
diff --git a/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml b/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
new file mode 100644
index 0000000..34016ce
--- /dev/null
+++ b/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml
@@ -0,0 +1,26 @@
+# A large PanopticFPN for demo purposes.
+# Use GN on backbone to support semantic seg.
+# Use Cascade + Deform Conv to improve localization.
+_BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN"
+  RESNETS:
+    DEPTH: 101
+    NORM: "GN"
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+    STRIDE_IN_1X1: False
+  FPN:
+    NORM: "GN"
+  ROI_HEADS:
+    NAME: CascadeROIHeads
+  ROI_BOX_HEAD:
+    CLS_AGNOSTIC_BBOX_REG: True
+  ROI_MASK_HEAD:
+    NORM: "GN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 2000
+SOLVER:
+  STEPS: (105000, 125000)
+  MAX_ITER: 135000
+  IMS_PER_BATCH: 32
+  BASE_LR: 0.04
diff --git a/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml b/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
new file mode 100644
index 0000000..b3c136c
--- /dev/null
+++ b/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml
@@ -0,0 +1,11 @@
+_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml"
+# INPUT:
+  # It makes sense to divide by STD when training from scratch
+  # But it seems to make no difference on the results and C2's models didn't do this.
+  # So we keep things consistent with C2.
+  # PIXEL_STD: [57.375, 57.12, 58.395]
+MODEL:
+  WEIGHTS: ""
+  MASK_ON: True
+  BACKBONE:
+    FREEZE_AT: 0
diff --git a/configs/Misc/semantic_R_50_FPN_1x.yaml b/configs/Misc/semantic_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..ac256e1
--- /dev/null
+++ b/configs/Misc/semantic_R_50_FPN_1x.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_panoptic_stuffonly",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml b/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
new file mode 100644
index 0000000..ea2a6ba
--- /dev/null
+++ b/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (12000, 16000)
+  MAX_ITER: 18000  # 17.4 epochs
+  WARMUP_ITERS: 100
diff --git a/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml b/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml
new file mode 100644
index 0000000..e554cab
--- /dev/null
+++ b/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    NUM_CLASSES: 20
+INPUT:
+  MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
+  MIN_SIZE_TEST: 800
+DATASETS:
+  TRAIN: ('voc_2007_trainval', 'voc_2012_trainval')
+  TEST: ('voc_2007_test',)
+SOLVER:
+  STEPS: (12000, 16000)
+  MAX_ITER: 18000  # 17.4 epochs
+  WARMUP_ITERS: 100
diff --git a/configs/quick_schedules/README.md b/configs/quick_schedules/README.md
new file mode 100644
index 0000000..a278199
--- /dev/null
+++ b/configs/quick_schedules/README.md
@@ -0,0 +1 @@
+These are quick configs for performance or accuracy regression tracking purposes.
diff --git a/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..a2f37e5
--- /dev/null
+++ b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 45.70, 0.02]]
diff --git a/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..52fc0ec
--- /dev/null
+++ b/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,15 @@
+_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+  TEST: ("coco_2017_val_100",)
+  PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..14cf2aa
--- /dev/null
+++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl"
+DATASETS:
+  TEST: ("keypoints_coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 52.47, 0.02], ["keypoints", "AP", 67.36, 0.02]]
diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..dc09034
--- /dev/null
+++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val_100",)
+  TEST: ("keypoints_coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml
new file mode 100644
index 0000000..4b92392
--- /dev/null
+++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml
@@ -0,0 +1,30 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+    NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: False
+    LOSS_WEIGHT: 4.0
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0  # Keypoint AP degrades when using plain L1 loss
+  RPN:
+    SMOOTH_L1_BETA: 0.2  # Keypoint AP degrades when using plain L1 loss
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val",)
+  TEST: ("keypoints_coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  WARMUP_FACTOR: 0.33333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 55.35, 1.0], ["keypoints", "AP", 76.91, 1.0]]
diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000..9bd9628
--- /dev/null
+++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,28 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  KEYPOINT_ON: True
+  RESNETS:
+    DEPTH: 50
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+    NUM_CLASSES: 1
+  ROI_KEYPOINT_HEAD:
+    POOLER_RESOLUTION: 14
+    POOLER_SAMPLING_RATIO: 2
+  ROI_BOX_HEAD:
+    SMOOTH_L1_BETA: 1.0  # Keypoint AP degrades when using plain L1 loss
+  RPN:
+    SMOOTH_L1_BETA: 0.2  # Keypoint AP degrades when using plain L1 loss
+DATASETS:
+  TRAIN: ("keypoints_coco_2017_val",)
+  TEST: ("keypoints_coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  WARMUP_FACTOR: 0.33333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 53.5, 1.0], ["keypoints", "AP", 72.4, 1.0]]
diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
new file mode 100644
index 0000000..b2d5b7f
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.37, 0.02], ["segm", "AP", 40.99, 0.02]]
diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml
new file mode 100644
index 0000000..6c4f121
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.001
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml
new file mode 100644
index 0000000..f68dd8f
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml
@@ -0,0 +1,22 @@
+_BASE_: "../Base-RCNN-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val",)
+  TEST: ("coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (600,)
+  MAX_SIZE_TRAIN: 1000
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1000
+SOLVER:
+  IMS_PER_BATCH: 8  # base uses 16
+  WARMUP_FACTOR: 0.33333
+  WARMUP_ITERS: 100
+  STEPS: (11000, 11600)
+  MAX_ITER: 12000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 41.88, 0.7], ["segm", "AP", 33.79, 0.5]]
diff --git a/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
new file mode 100644
index 0000000..e3ce6cf
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.44, 0.02], ["segm", "AP", 42.94, 0.02]]
diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..a41dc53
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,11 @@
+_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 47.34, 0.02], ["segm", "AP",  42.67, 0.02]]
+  # expected results do not use test-time augmentation. TTA results are not verified.
+  AUG:
+    ENABLED: True
+    MIN_SIZES: (400, 500)  # to save some time
diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..6dbfcde
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000..4992104
--- /dev/null
+++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,21 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  ROI_HEADS:
+    BATCH_SIZE_PER_IMAGE: 256
+  MASK_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_val",)
+  TEST: ("coco_2017_val",)
+INPUT:
+  MIN_SIZE_TRAIN: (600,)
+  MAX_SIZE_TRAIN: 1000
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1000
+SOLVER:
+  WARMUP_FACTOR: 0.3333333
+  WARMUP_ITERS: 100
+  STEPS: (5500, 5800)
+  MAX_ITER: 6000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 42.8, 0.8], ["segm", "AP", 35.7, 0.8]]
diff --git a/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
new file mode 100644
index 0000000..70874e3
--- /dev/null
+++ b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100_panoptic_separated",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 46.47, 0.02], ["segm", "AP", 43.39, 0.02], ["sem_seg", "mIoU", 42.55, 0.02], ["panoptic_seg", "PQ", 38.99, 0.02]]
diff --git a/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml
new file mode 100644
index 0000000..26f3b14
--- /dev/null
+++ b/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml
@@ -0,0 +1,19 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_val_100_panoptic_separated",)
+  TEST: ("coco_2017_val_100_panoptic_separated",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml
new file mode 100644
index 0000000..8aeba40
--- /dev/null
+++ b/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "PanopticFPN"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: True
+  RESNETS:
+    DEPTH: 50
+  SEM_SEG_HEAD:
+    LOSS_WEIGHT: 0.5
+DATASETS:
+  TRAIN: ("coco_2017_val_panoptic_separated",)
+  TEST: ("coco_2017_val_panoptic_separated",)
+SOLVER:
+  BASE_LR: 0.01
+  WARMUP_FACTOR: 0.001
+  WARMUP_ITERS: 500
+  STEPS: (5500,)
+  MAX_ITER: 7000
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 46.80, 1.1], ["segm", "AP", 38.93, 0.7], ["sem_seg", "mIoU", 63.99, 0.9], ["panoptic_seg", "PQ", 48.23, 0.8]]
diff --git a/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..36b9988
--- /dev/null
+++ b/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/retinanet_R_50_FPN_3x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 44.36, 0.02]]
diff --git a/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml b/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..8d95c1f
--- /dev/null
+++ b/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../COCO-Detection/retinanet_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..c7c3f90
--- /dev/null
+++ b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,7 @@
+_BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl"
+DATASETS:
+  TEST: ("coco_2017_val_100",)
+TEST:
+  EXPECTED_RESULTS: [["box_proposals", "AR@1000", 58.16, 0.02]]
diff --git a/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..402d432
--- /dev/null
+++ b/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,13 @@
+_BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+DATASETS:
+  TRAIN: ("coco_2017_val_100",)
+  TEST: ("coco_2017_val_100",)
+SOLVER:
+  STEPS: (30,)
+  MAX_ITER: 40
+  BASE_LR: 0.005
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..bca7498
--- /dev/null
+++ b/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,10 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://semantic_R_50_FPN_1x/111802073/model_final_c18079783c55a94968edc28b7101c5f0.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TEST: ("coco_2017_val_100_panoptic_stuffonly",)
+TEST:
+  EXPECTED_RESULTS: [["sem_seg", "mIoU", 39.53, 0.02], ["sem_seg", "mACC", 51.50, 0.02]]
diff --git a/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..14ab606
--- /dev/null
+++ b/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml
@@ -0,0 +1,18 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_val_100_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_100_panoptic_stuffonly",)
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+SOLVER:
+  BASE_LR: 0.005
+  STEPS: (30,)
+  MAX_ITER: 40
+  IMS_PER_BATCH: 4
+DATALOADER:
+  NUM_WORKERS: 2
diff --git a/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000..1f78d77
--- /dev/null
+++ b/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,20 @@
+_BASE_: "../Base-RCNN-FPN.yaml"
+MODEL:
+  META_ARCHITECTURE: "SemanticSegmentor"
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+DATASETS:
+  TRAIN: ("coco_2017_val_panoptic_stuffonly",)
+  TEST: ("coco_2017_val_panoptic_stuffonly",)
+SOLVER:
+  BASE_LR: 0.01
+  WARMUP_FACTOR: 0.001
+  WARMUP_ITERS: 300
+  STEPS: (5500,)
+  MAX_ITER: 7000
+TEST:
+  EXPECTED_RESULTS: [["sem_seg", "mIoU", 76.51, 1.0], ["sem_seg", "mACC", 83.25, 1.0]]
+INPUT:
+  # no scale augmentation
+  MIN_SIZE_TRAIN: (800, )
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 0000000..dad1837
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,85 @@
+
+For a few datasets that detectron2 natively supports,
+the datasets are assumed to exist in a directory called
+"datasets/", under the directory where you launch the program.
+They need to have the following directory structure:
+
+## Expected dataset structure for COCO instance/keypoint detection:
+
+```
+coco/
+  annotations/
+    instances_{train,val}2017.json
+    person_keypoints_{train,val}2017.json
+  {train,val}2017/
+    # image files that are mentioned in the corresponding json
+```
+
+You can use the 2014 version of the dataset as well.
+
+Some of the builtin tests (`dev/run_*_tests.sh`) use a tiny version of the COCO dataset,
+which you can download with `./prepare_for_tests.sh`.
+
+## Expected dataset structure for PanopticFPN:
+
+```
+coco/
+  annotations/
+    panoptic_{train,val}2017.json
+  panoptic_{train,val}2017/
+    # png annotations
+  panoptic_stuff_{train,val}2017/  # generated by the script mentioned below
+```
+
+Install panopticapi by:
+```
+pip install git+https://github.com/cocodataset/panopticapi.git
+```
+Then run `python prepare_panoptic_fpn.py` to extract semantic annotations from panoptic annotations.
+
+## Expected dataset structure for LVIS instance segmentation:
+```
+coco/
+  {train,val,test}2017/
+lvis/
+  lvis_v0.5_{train,val}.json
+  lvis_v0.5_image_info_test.json
+```
+
+Install lvis-api by:
+```
+pip install git+https://github.com/lvis-dataset/lvis-api.git
+```
+
+## Expected dataset structure for cityscapes:
+```
+cityscapes/
+  gtFine/
+    train/
+      aachen/
+        color.png, instanceIds.png, labelIds.png, polygons.json,
+        labelTrainIds.png
+      ...
+    val/
+    test/
+  leftImg8bit/
+    train/
+    val/
+    test/
+```
+Install cityscapes scripts by:
+```
+pip install git+https://github.com/mcordts/cityscapesScripts.git
+```
+
+Note:
+labelTrainIds.png are created by `cityscapesscripts/preparation/createTrainIdLabelImgs.py`.
+They are not needed for instance segmentation.
+
+## Expected dataset structure for Pascal VOC:
+```
+VOC20{07,12}/
+  Annotations/
+  ImageSets/
+  JPEGImages/
+```
diff --git a/datasets/prepare_for_tests.sh b/datasets/prepare_for_tests.sh
new file mode 100644
index 0000000..d59b564
--- /dev/null
+++ b/datasets/prepare_for_tests.sh
@@ -0,0 +1,22 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Download some files needed for running tests.
+
+cd "$(dirname "$0")"  # unlike "${0%/*}", also works for `bash prepare_for_tests.sh`
+
+BASE=https://dl.fbaipublicfiles.com/detectron2
+mkdir -p coco/annotations
+
+for anno in instances_val2017_100 \
+  person_keypoints_val2017_100 \
+  instances_minival2014_100 \
+  person_keypoints_minival2014_100; do
+
+  dest=coco/annotations/$anno.json
+  if [[ -s $dest ]]; then  # a non-empty file counts as already downloaded
+    echo "$dest exists. Skipping ..."
+  else
+    wget "$BASE/annotations/coco/$anno.json" -O "$dest"
+  fi
+done
diff --git a/datasets/prepare_panoptic_fpn.py b/datasets/prepare_panoptic_fpn.py
new file mode 100644
index 0000000..c2c383d
--- /dev/null
+++ b/datasets/prepare_panoptic_fpn.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import time
+import functools
+import json
+import multiprocessing as mp
+import numpy as np
+import os
+from PIL import Image
+
+from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+from fvcore.common.download import download
+
+from panopticapi.utils import rgb2id
+
+
+def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
+    """Write the semantic label image at `output_semantic` for one panoptic annotation PNG."""
+    # rgb2id comes from panopticapi; presumably it folds the RGB channels into one id per pixel.
+    segment_ids = rgb2id(np.asarray(Image.open(input_panoptic), dtype=np.uint32))
+    # Every pixel starts as 255 and keeps it unless one of `segments` covers it.
+    semantic = np.zeros_like(segment_ids, dtype=np.uint8) + 255
+    for segment in segments:
+        semantic[segment_ids == segment["id"]] = id_map[segment["category_id"]]
+    Image.fromarray(semantic).save(output_semantic)
+
+
+def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
+    """
+    Create semantic segmentation annotations from panoptic segmentation
+    annotations, to be used by PanopticFPN.
+
+    It maps all thing categories to class 0, and maps all unlabeled pixels to class 255.
+    It maps all stuff categories to contiguous ids starting from 1.
+
+    Args:
+        panoptic_json (str): path to the panoptic json file, in COCO's format.
+        panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
+        sem_seg_root (str): a directory to output semantic annotation files (created if missing)
+        categories (list[dict]): category metadata. Each dict needs to have:
+            "id": corresponds to the "category_id" in the json annotations
+            "isthing": 0 or 1
+    """
+    os.makedirs(sem_seg_root, exist_ok=True)
+
+    stuff_ids = [k["id"] for k in categories if k["isthing"] == 0]
+    thing_ids = [k["id"] for k in categories if k["isthing"] == 1]
+    id_map = {}  # map from category id to id in the output semantic annotation
+    assert len(stuff_ids) <= 254  # stuff ids use 1..254; 0 is "thing" and 255 is "unlabeled"
+    for i, stuff_id in enumerate(stuff_ids):
+        id_map[stuff_id] = i + 1
+    for thing_id in thing_ids:
+        id_map[thing_id] = 0
+    id_map[0] = 255
+
+    with open(panoptic_json) as f:
+        obj = json.load(f)
+
+    def iter_annotations():
+        for anno in obj["annotations"]:
+            file_name = anno["file_name"]
+            segments = anno["segments_info"]
+            input_path = os.path.join(panoptic_root, file_name)
+            output_path = os.path.join(sem_seg_root, file_name)
+            yield input_path, output_path, segments
+
+    print("Start writing to {} ...".format(sem_seg_root))
+    start = time.time()
+    # starmap blocks until every image is written; leaving the `with` then stops the workers.
+    with mp.Pool(processes=max(mp.cpu_count() // 2, 4)) as pool:
+        pool.starmap(
+            functools.partial(_process_panoptic_to_semantic, id_map=id_map),
+            iter_annotations(),
+            chunksize=100,
+        )
+    print("Finished. time: {:.2f}s".format(time.time() - start))
+
+
+if __name__ == "__main__":
+    dataset_dir = os.path.join(os.path.dirname(__file__), "coco")
+    for s in ["val2017", "train2017"]:
+        separate_coco_semantic_from_panoptic(
+            os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
+            os.path.join(dataset_dir, "panoptic_{}".format(s)),
+            os.path.join(dataset_dir, "panoptic_stuff_{}".format(s)),
+            COCO_CATEGORIES,
+        )
+
+    # Prepare val2017_100 for quick testing:
+    dest_dir = os.path.join(dataset_dir, "annotations/")
+    URL_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+    download(URL_PREFIX + "annotations/coco/panoptic_val2017_100.json", dest_dir)
+    with open(os.path.join(dest_dir, "panoptic_val2017_100.json")) as f:
+        obj = json.load(f)
+
+    def link_val100(dir_full, dir_100):
+        print("Creating " + dir_100 + " ...")
+        os.makedirs(dir_100, exist_ok=True)
+        for img in obj["images"]:
+            png_name = os.path.splitext(img["file_name"])[0] + ".png"
+            dst = os.path.join(dir_100, png_name)
+            # The link target is stored relative to dir_100, not as an absolute path.
+            src = os.path.relpath(os.path.join(dir_full, png_name), start=dir_100)
+            if not os.path.lexists(dst):  # keep re-runs from failing on existing links
+                os.symlink(src, dst)
+
+    link_val100(
+        os.path.join(dataset_dir, "panoptic_val2017"),
+        os.path.join(dataset_dir, "panoptic_val2017_100"),
+    )
+
+    link_val100(
+        os.path.join(dataset_dir, "panoptic_stuff_val2017"),
+        os.path.join(dataset_dir, "panoptic_stuff_val2017_100"),
+    )
diff --git a/demo/README.md b/demo/README.md
new file mode 100644
index 0000000..c51890f
--- /dev/null
+++ b/demo/README.md
@@ -0,0 +1,8 @@
+
+## Detectron2 Demo
+
+We provide a command line tool for running a simple demo.
+The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md).
+
+See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-)
+for a high-quality demo generated with this tool.
diff --git a/demo/demo.py b/demo/demo.py
new file mode 100644
index 0000000..11b2f8d
--- /dev/null
+++ b/demo/demo.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import multiprocessing as mp
+import os
+import time
+import cv2
+import tqdm
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+
+from predictor import VisualizationDemo
+
+# constants
+WINDOW_NAME = "COCO detections"
+
+
+def setup_cfg(args):
+    """Return a frozen config built from the config file, CLI overrides and score threshold."""
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    threshold = args.confidence_threshold  # one test-time threshold for all builtin models
+    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = threshold
+    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold
+    cfg.freeze()
+    return cfg
+
+
+def get_parser():
+    """Build the argparse parser for the demo (config, input source, output, threshold)."""
+    parser = argparse.ArgumentParser(description="Detectron2 Demo")
+    parser.add_argument(
+        "--config-file",  # NOTE(review): verify the default below exists in configs/quick_schedules
+        default="configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
+    parser.add_argument("--video-input", help="Path to video file.")
+    parser.add_argument("--input", nargs="+", help="A list of space separated input images")
+    parser.add_argument(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+    parser.add_argument(
+        "--confidence-threshold",
+        type=float,
+        default=0.5,
+        help="Minimum score for instance predictions to be shown",
+    )
+    parser.add_argument(
+        "--opts",
+        help="Modify config options using the command-line 'KEY VALUE' pairs",
+        default=[],
+        nargs=argparse.REMAINDER,  # everything after --opts is collected as-is
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+
+    cfg = setup_cfg(args)
+
+    demo = VisualizationDemo(cfg)
+
+    if args.input:
+        if len(args.input) == 1:
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            predictions, visualized_output = demo.run_on_image(img)
+            detected = (  # models without an "instances" output must not crash the log line
+                "detected {} instances".format(len(predictions["instances"]))
+                if "instances" in predictions else "finished"
+            )
+            logger.info("{}: {} in {:.2f}s".format(path, detected, time.time() - start_time))
+
+            if args.output:
+                if os.path.isdir(args.output):
+                    # directory target: keep the input's base name
+                    out_filename = os.path.join(args.output, os.path.basename(path))
+                else:
+                    assert len(args.input) == 1, "Please specify a directory with args.output"
+                    out_filename = args.output
+                visualized_output.save(out_filename)
+            else:
+                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+                cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
+                if cv2.waitKey(0) == 27:
+                    break  # esc to quit
+    elif args.webcam:
+        assert args.input is None, "Cannot have both --input and --webcam!"
+        cam = cv2.VideoCapture(0)
+        for vis in tqdm.tqdm(demo.run_on_video(cam)):
+            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+            cv2.imshow(WINDOW_NAME, vis)
+            if cv2.waitKey(1) == 27:
+                break  # esc to quit
+        cv2.destroyAllWindows()
+    elif args.video_input:
+        assert os.path.isfile(args.video_input)  # fail before any capture/writer is created
+        video = cv2.VideoCapture(args.video_input)
+        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frames_per_second = video.get(cv2.CAP_PROP_FPS)
+        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        basename = os.path.basename(args.video_input)
+
+        if args.output:
+            if os.path.isdir(args.output):
+                output_fname = os.path.join(args.output, basename)
+                output_fname = os.path.splitext(output_fname)[0] + ".mkv"
+            else:
+                output_fname = args.output
+            assert not os.path.isfile(output_fname), output_fname
+            output_file = cv2.VideoWriter(
+                filename=output_fname,
+                # some installation of opencv may not support x264 (due to its license),
+                # you can try other format (e.g. MPEG)
+                fourcc=cv2.VideoWriter_fourcc(*"x264"),
+                fps=float(frames_per_second),
+                frameSize=(width, height),
+                isColor=True,
+            )
+        for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
+            if args.output:
+                output_file.write(vis_frame)
+            else:
+                cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
+                cv2.imshow(basename, vis_frame)
+                if cv2.waitKey(1) == 27:
+                    break  # esc to quit
+        video.release()
+        if args.output:
+            output_file.release()
+        else:
+            cv2.destroyAllWindows()
diff --git a/demo/predictor.py b/demo/predictor.py
new file mode 100644
index 0000000..689fa85
--- /dev/null
+++ b/demo/predictor.py
@@ -0,0 +1,220 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import atexit
+import bisect
+import multiprocessing as mp
+from collections import deque
+import cv2
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.utils.video_visualizer import VideoVisualizer
+from detectron2.utils.visualizer import ColorMode, Visualizer
+
+
+class VisualizationDemo(object):
+    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
+        """
+        Args:
+            cfg (CfgNode): config for the predictor; DATASETS.TEST[0] selects the metadata.
+            instance_mode (ColorMode): passed on to the image and video visualizers.
+            parallel (bool): whether to run the model in different processes from visualization.
+                Useful since the visualization logic can be slow.
+        """
+        self.metadata = MetadataCatalog.get(
+            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
+        )
+        self.cpu_device = torch.device("cpu")
+        self.instance_mode = instance_mode
+
+        self.parallel = parallel
+        if parallel:
+            num_gpu = torch.cuda.device_count()
+            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
+        else:
+            self.predictor = DefaultPredictor(cfg)
+
+    def run_on_image(self, image):
+        """
+        Args:
+            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+                This is the format used by OpenCV.
+
+        Returns:
+            predictions (dict): the output of the model.
+            vis_output (VisImage): the visualized image output.
+        """
+        vis_output = None
+        predictions = self.predictor(image)
+        # Convert image from OpenCV BGR format to Matplotlib RGB format.
+        image = image[:, :, ::-1]
+        visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
+        if "panoptic_seg" in predictions:
+            panoptic_seg, segments_info = predictions["panoptic_seg"]
+            vis_output = visualizer.draw_panoptic_seg_predictions(
+                panoptic_seg.to(self.cpu_device), segments_info
+            )
+        else:
+            if "sem_seg" in predictions:
+                vis_output = visualizer.draw_sem_seg(
+                    predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+            if "instances" in predictions:
+                instances = predictions["instances"].to(self.cpu_device)
+                vis_output = visualizer.draw_instance_predictions(predictions=instances)
+
+        return predictions, vis_output
+
+    def _frame_from_video(self, video):
+        """Yield frames from `video` until it is no longer open or a read fails."""
+        while video.isOpened():
+            success, frame = video.read()
+            if not success:
+                break
+            yield frame
+
+    def run_on_video(self, video):
+        """
+        Visualizes predictions on frames of the input video.
+
+        Args:
+            video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
+                either a webcam or a video file.
+
+        Yields:
+            ndarray: BGR visualizations of each video frame.
+        """
+        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
+
+        def process_predictions(frame, predictions):
+            # Converts the OpenCV BGR frame to RGB before drawing on it
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            if "panoptic_seg" in predictions:
+                panoptic_seg, segments_info = predictions["panoptic_seg"]
+                vis_frame = video_visualizer.draw_panoptic_seg_predictions(
+                    frame, panoptic_seg.to(self.cpu_device), segments_info
+                )
+            elif "instances" in predictions:
+                predictions = predictions["instances"].to(self.cpu_device)
+                vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
+            elif "sem_seg" in predictions:
+                vis_frame = video_visualizer.draw_sem_seg(
+                    frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
+                )
+            # Converts Matplotlib RGB format to OpenCV BGR format
+            vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
+            return vis_frame
+
+        frame_gen = self._frame_from_video(video)
+        if self.parallel:
+            buffer_size = self.predictor.default_buffer_size
+
+            frame_data = deque()
+
+            for cnt, frame in enumerate(frame_gen):
+                frame_data.append(frame)
+                self.predictor.put(frame)
+
+                if cnt >= buffer_size:
+                    frame = frame_data.popleft()
+                    predictions = self.predictor.get()
+                    yield process_predictions(frame, predictions)
+
+            while len(frame_data):
+                frame = frame_data.popleft()
+                predictions = self.predictor.get()
+                yield process_predictions(frame, predictions)
+        else:
+            for frame in frame_gen:
+                yield process_predictions(frame, self.predictor(frame))
+
+
+class AsyncPredictor:
+    """
+    A predictor that runs the model asynchronously, possibly on >1 GPUs.
+    Because rendering the visualization takes a considerable amount of time,
+    this helps improve throughput when rendering videos.
+    """
+
+    class _StopToken:  # sentinel put on the task queue to make a worker leave its loop
+        pass
+
+    class _PredictWorker(mp.Process):  # worker process that serves tasks from the shared queue
+        def __init__(self, cfg, task_queue, result_queue):
+            self.cfg = cfg
+            self.task_queue = task_queue
+            self.result_queue = result_queue
+            super().__init__()
+
+        def run(self):
+            predictor = DefaultPredictor(self.cfg)  # built here, i.e. inside the worker process
+
+            while True:
+                task = self.task_queue.get()
+                if isinstance(task, AsyncPredictor._StopToken):
+                    break
+                idx, data = task
+                result = predictor(data)
+                self.result_queue.put((idx, result))  # idx lets get() restore submission order
+
+    def __init__(self, cfg, num_gpus: int = 1):
+        """
+        Args:
+            cfg (CfgNode): cloned once per worker, with MODEL.DEVICE overridden for that worker.
+            num_gpus (int): if 0, will run on CPU
+        """
+        num_workers = max(num_gpus, 1)
+        self.task_queue = mp.Queue(maxsize=num_workers * 3)
+        self.result_queue = mp.Queue(maxsize=num_workers * 3)
+        self.procs = []
+        for gpuid in range(max(num_gpus, 1)):
+            cfg = cfg.clone()
+            cfg.defrost()
+            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
+            self.procs.append(
+                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
+            )
+
+        self.put_idx = 0  # index of the most recently submitted task (first task gets 1)
+        self.get_idx = 0  # index of the most recently requested result
+        self.result_rank = []  # sorted indices of results that arrived before their turn
+        self.result_data = []  # the results themselves, parallel to result_rank
+
+        for p in self.procs:
+            p.start()
+        atexit.register(self.shutdown)
+
+    def put(self, image):
+        self.put_idx += 1
+        self.task_queue.put((self.put_idx, image))
+
+    def get(self):
+        self.get_idx += 1  # the index needed for this request
+        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
+            res = self.result_data[0]
+            del self.result_data[0], self.result_rank[0]
+            return res
+
+        while True:
+            # make sure the results are returned in the correct order
+            idx, res = self.result_queue.get()
+            if idx == self.get_idx:
+                return res
+            insert = bisect.bisect(self.result_rank, idx)  # keep the early results sorted by index
+            self.result_rank.insert(insert, idx)
+            self.result_data.insert(insert, res)
+
+    def __len__(self):
+        return self.put_idx - self.get_idx  # tasks submitted but not yet fetched
+
+    def __call__(self, image):
+        self.put(image)
+        return self.get()
+
+    def shutdown(self):
+        for _ in self.procs:  # one stop token per worker
+            self.task_queue.put(AsyncPredictor._StopToken())
+
+    @property
+    def default_buffer_size(self):
+        return len(self.procs) * 5  # read by VisualizationDemo.run_on_video as its look-ahead
diff --git a/detectron2/__init__.py b/detectron2/__init__.py
new file mode 100644
index 0000000..d1f75b3
--- /dev/null
+++ b/detectron2/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .utils.env import setup_environment
+
+setup_environment()  # executed as a side effect of importing the package
+
+
+__version__ = "0.1"
diff --git a/detectron2/checkpoint/__init__.py b/detectron2/checkpoint/__init__.py
new file mode 100644
index 0000000..e17a9df
--- /dev/null
+++ b/detectron2/checkpoint/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File:
+
+
+from . import catalog as _UNUSED  # register the handler
+from .detection_checkpoint import DetectionCheckpointer
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"]
diff --git a/detectron2/checkpoint/c2_model_loading.py b/detectron2/checkpoint/c2_model_loading.py
new file mode 100644
index 0000000..1070335
--- /dev/null
+++ b/detectron2/checkpoint/c2_model_loading.py
@@ -0,0 +1,313 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import re
+import torch
+from fvcore.common.checkpoint import (
+    get_missing_parameters_message,
+    get_unexpected_parameters_message,
+)
+
+
+def convert_basic_c2_names(original_keys):
+    """
+    Apply some basic name conversion to names in C2 weights.
+    It only deals with typical backbone models.
+
+    Args:
+        original_keys (list[str]):
+    Returns:
+        list[str]: The same number of strings matching those in original_keys.
+    """
+    layer_keys = copy.deepcopy(original_keys)
+    layer_keys = [
+        {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys
+    ]  # some hard-coded mappings
+
+    layer_keys = [k.replace("_", ".") for k in layer_keys]
+    layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys]
+    layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys]
+    # Uniform both bn and gn names to "norm"
+    layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys]
+    layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys]
+
+    # stem
+    layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys]
+    # to avoid mis-matching with "conv1" in other components (e.g. detection head)
+    layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys]
+
+    # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5)
+    # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys]
+    # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys]
+
+    # blocks
+    layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys]
+    layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys]
+
+    # DensePose substitutions
+    layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys]
+    layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys]
+    layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys]
+    layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys]
+    return layer_keys
+
+
+def convert_c2_detectron_names(weights):
+    """
+    Map Caffe2 Detectron weight names to Detectron2 names.
+
+    Args:
+        weights (dict): name -> tensor
+
+    Returns:
+        dict: detectron2 names -> tensor (background-class entries are dropped or moved last)
+        dict: detectron2 names -> C2 names
+    """
+    logger = logging.getLogger(__name__)
+    logger.info("Remapping C2 weights ......")
+    original_keys = sorted(weights.keys())
+    layer_keys = copy.deepcopy(original_keys)
+
+    layer_keys = convert_basic_c2_names(layer_keys)
+
+    # --------------------------------------------------------------------------
+    # RPN hidden representation conv
+    # --------------------------------------------------------------------------
+    # FPN case
+    # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then
+    # shared for all other levels, hence the appearance of "fpn2"
+    layer_keys = [
+        k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # RPN box transformation conv
+    # --------------------------------------------------------------------------
+    # FPN case (see note above about "fpn2")
+    layer_keys = [
+        k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas")
+        for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+    # Non-FPN case
+    layer_keys = [
+        k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys
+    ]
+    layer_keys = [
+        k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits")
+        for k in layer_keys
+    ]
+
+    # --------------------------------------------------------------------------
+    # Fast R-CNN box head
+    # --------------------------------------------------------------------------
+    layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys]
+    layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys]
+    layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys]
+    # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s
+    layer_keys = [re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # FPN lateral and output convolutions
+    # --------------------------------------------------------------------------
+    def fpn_map(name):
+        """
+        Look for keys with the following patterns:
+        1) Starts with "fpn.inner."
+           Example: "fpn.inner.res2.2.sum.lateral.weight"
+           Meaning: These are lateral pathway convolutions
+        2) Starts with "fpn.res"
+           Example: "fpn.res2.2.sum.weight"
+           Meaning: These are FPN output convolutions
+        """
+        splits = name.split(".")
+        norm = ".norm" if "norm" in splits else ""
+        if name.startswith("fpn.inner."):
+            # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight']
+            stage = int(splits[2][len("res") :])
+            return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1])
+        elif name.startswith("fpn.res"):
+            # splits example: ['fpn', 'res2', '2', 'sum', 'weight']
+            stage = int(splits[1][len("res") :])
+            return "fpn_output{}{}.{}".format(stage, norm, splits[-1])
+        return name
+
+    layer_keys = [fpn_map(k) for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Mask R-CNN mask head
+    # --------------------------------------------------------------------------
+    # roi_heads.StandardROIHeads case
+    layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys]
+    layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys]
+    layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys]
+    # roi_heads.Res5ROIHeads case
+    layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Keypoint R-CNN head
+    # --------------------------------------------------------------------------
+    # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX"
+    layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys]
+    layer_keys = [
+        k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys
+    ]
+    layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys]
+
+    # --------------------------------------------------------------------------
+    # Done with replacements
+    # --------------------------------------------------------------------------
+    assert len(set(layer_keys)) == len(layer_keys)  # renamed keys must still be unique
+    assert len(original_keys) == len(layer_keys)  # exactly one renamed key per original key
+
+    new_weights = {}
+    new_keys_to_original_keys = {}
+    for orig, renamed in zip(original_keys, layer_keys):
+        new_keys_to_original_keys[renamed] = orig
+        if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."):
+            # remove the meaningless prediction weight for background class
+            new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1
+            new_weights[renamed] = weights[orig][new_start_idx:]
+            logger.info(
+                "Remove prediction weight for background class in {}. The shape changes from "
+                "{} to {}.".format(
+                    renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape)
+                )
+            )
+        elif renamed.startswith("cls_score."):
+            # move weights of bg class from original index 0 to last index
+            logger.info(
+                "Move classification weights for background class in {} from index 0 to "
+                "index {}.".format(renamed, weights[orig].shape[0] - 1)
+            )
+            new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]])
+        else:
+            new_weights[renamed] = weights[orig]
+
+    return new_weights, new_keys_to_original_keys
+
+
+# Note the current matching is not symmetric.
+# it assumes model_state_dict will have longer names.
+def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True):
+    """
+    Match names between the two state-dict, and update the values of model_state_dict in-place with
+    copies of the matched tensor in ckpt_state_dict.
+    If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2
+    model and will be renamed at first.
+
+    Strategy: suppose that the models that we will create will have prefixes appended
+    to each of its keys, for example due to an extra level of nesting that the original
+    pre-trained weights from ImageNet won't contain. For example, model.state_dict()
+    might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains
+    res2.conv1.weight. We thus want to match both parameters together.
+    For that, we look for each model weight, look among all loaded keys if there is one
+    that is a suffix of the current weight name, and use it if that's the case.
+    If multiple matches exist, take the one with longest size
+    of the corresponding name. For example, for the same model as before, the pretrained
+    weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case,
+    we want to match backbone[0].body.conv1.weight to conv1.weight, and
+    backbone[0].body.res2.conv1.weight to res2.conv1.weight.
+    """
+    model_keys = sorted(list(model_state_dict.keys()))
+    if c2_conversion:
+        ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict)
+        # original_keys: the name in the original dict (before renaming)
+    else:
+        original_keys = {x: x for x in ckpt_state_dict.keys()}
+    ckpt_keys = sorted(list(ckpt_state_dict.keys()))
+
+    def match(a, b):
+        # Matched ckpt_key should be a complete (starts with '.') suffix.
+        # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1,
+        # but matches whatever_conv1 or mesh_head.whatever_conv1.
+        return a == b or a.endswith("." + b)
+
+    # get a matrix of string matches, where each (i, j) entry correspond to the size of the
+    # ckpt_key string, if it matches
+    match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys]
+    match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys))
+    # use the matched one with longest size in case of multiple matches
+    max_match_size, idxs = match_matrix.max(1)
+    # remove indices that correspond to no-match
+    idxs[max_match_size == 0] = -1
+
+    # used for logging
+    max_len_model = max(len(key) for key in model_keys) if model_keys else 1
+    max_len_ckpt = max(len(key) for key in ckpt_keys) if ckpt_keys else 1
+    log_str_template = "{: <{}} loaded from {: <{}} of shape {}"
+    logger = logging.getLogger(__name__)
+    # matched_pairs (matched checkpoint key --> matched model key)
+    matched_keys = {}
+    for idx_model, idx_ckpt in enumerate(idxs.tolist()):
+        if idx_ckpt == -1:
+            continue
+        key_model = model_keys[idx_model]
+        key_ckpt = ckpt_keys[idx_ckpt]
+        value_ckpt = ckpt_state_dict[key_ckpt]
+        shape_in_model = model_state_dict[key_model].shape
+
+        if shape_in_model != value_ckpt.shape:
+            logger.warning(
+                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+                    key_ckpt, value_ckpt.shape, key_model, shape_in_model
+                )
+            )
+            logger.warning(
+                "{} will not be loaded. Please double check and see if this is desired.".format(
+                    key_ckpt
+                )
+            )
+            continue
+
+        model_state_dict[key_model] = value_ckpt.clone()
+        if key_ckpt in matched_keys:  # already added to matched_keys
+            logger.error(
+                "Ambiguity found for {} in checkpoint!"
+                "It matches at least two keys in the model ({} and {}).".format(
+                    key_ckpt, key_model, matched_keys[key_ckpt]
+                )
+            )
+            raise ValueError("Cannot match one checkpoint key to multiple keys in the model.")
+
+        matched_keys[key_ckpt] = key_model
+        logger.info(
+            log_str_template.format(
+                key_model,
+                max_len_model,
+                original_keys[key_ckpt],
+                max_len_ckpt,
+                tuple(shape_in_model),
+            )
+        )
+    matched_model_keys = matched_keys.values()
+    matched_ckpt_keys = matched_keys.keys()
+    # print warnings about unmatched keys on both side
+    unmatched_model_keys = [k for k in model_keys if k not in matched_model_keys]
+    if len(unmatched_model_keys):
+        logger.info(get_missing_parameters_message(unmatched_model_keys))
+
+    unmatched_ckpt_keys = [k for k in ckpt_keys if k not in matched_ckpt_keys]
+    if len(unmatched_ckpt_keys):
+        logger.info(
+            get_unexpected_parameters_message(original_keys[x] for x in unmatched_ckpt_keys)
+        )
diff --git a/detectron2/checkpoint/catalog.py b/detectron2/checkpoint/catalog.py
new file mode 100644
index 0000000..56a656e
--- /dev/null
+++ b/detectron2/checkpoint/catalog.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+from fvcore.common.file_io import PathHandler, PathManager
+
+
+class ModelCatalog(object):
+    """
+    Store mappings from names to third-party models.
+    """
+
+    S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron"
+
+    # MSRA models have STRIDE_IN_1X1=True. False otherwise.
+    # NOTE: all BN models here have fused BN into an affine layer.
+    # As a result, you should only load them to a model with "FrozenBN".
+    # Loading them to a model with regular BN or SyncBN is wrong.
+    # Even when loaded to FrozenBN, it is still different from affine by an epsilon,
+    # which should be negligible for training.
+    # NOTE: all models here uses PIXEL_STD=[1,1,1]
+    C2_IMAGENET_MODELS = {
+        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+        "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+        "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+        "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+        "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+        "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl",
+    }
+
+    C2_DETECTRON_PATH_FORMAT = (
+        "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl"
+    )  # noqa B950
+
+    C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival"
+    C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival"
+
+    # format: {model_name} -> part of the url
+    C2_DETECTRON_MODELS = {
+        "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW",  # noqa B950
+        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I",  # noqa B950
+        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7",  # noqa B950
+        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ",  # noqa B950
+        "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB",  # noqa B950
+        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC",  # noqa B950
+        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT",  # noqa B950
+        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI",  # noqa B950
+        "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q",  # noqa B950
+        "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao",  # noqa B950
+        "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L",  # noqa B950
+        "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179",  # noqa B950
+        "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2",  # noqa B950
+    }
+
+    @staticmethod
+    def get(name):
+        if name.startswith("Caffe2Detectron/COCO"):
+            return ModelCatalog._get_c2_detectron_baseline(name)
+        if name.startswith("ImageNetPretrained/"):
+            return ModelCatalog._get_c2_imagenet_pretrained(name)
+        raise RuntimeError("model not present in the catalog: {}".format(name))
+
+    @staticmethod
+    def _get_c2_imagenet_pretrained(name):
+        prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX
+        name = name[len("ImageNetPretrained/") :]
+        name = ModelCatalog.C2_IMAGENET_MODELS[name]
+        url = "/".join([prefix, name])
+        return url
+
+    @staticmethod
+    def _get_c2_detectron_baseline(name):
+        name = name[len("Caffe2Detectron/COCO/") :]
+        url = ModelCatalog.C2_DETECTRON_MODELS[name]
+        if "keypoint_rcnn" in name:
+            dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS
+        else:
+            dataset = ModelCatalog.C2_DATASET_COCO
+
+        if "35998355/rpn_R-50-C4_1x" in name:
+            # this one model is somehow different from others ..
+            type = "rpn"
+        else:
+            type = "generalized_rcnn"
+
+        # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`.
+        url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format(
+            prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset
+        )
+        return url
+
+
+class ModelCatalogHandler(PathHandler):
+    """
+    Resolve "catalog://<name>" paths: ModelCatalog.get(<name>) gives the path PathManager loads.
+    """
+
+    PREFIX = "catalog://"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path):
+        logger = logging.getLogger(__name__)
+        catalog_path = ModelCatalog.get(path[len(self.PREFIX) :])  # strip PREFIX, then look up
+        logger.info("Catalog entry {} points to {}".format(path, catalog_path))
+        return PathManager.get_local_path(catalog_path)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+class Detectron2Handler(PathHandler):
+    """
+    Resolve "detectron2://<name>" to S3_DETECTRON2_PREFIX + <name>, loaded through PathManager.
+    """
+
+    PREFIX = "detectron2://"
+    S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+
+    def _get_local_path(self, path):
+        name = path[len(self.PREFIX) :]  # everything after "detectron2://"
+        return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name)
+
+    def _open(self, path, mode="r", **kwargs):
+        return PathManager.open(self._get_local_path(path), mode, **kwargs)
+
+
+PathManager.register_handler(ModelCatalogHandler())  # side effect of importing this module
+PathManager.register_handler(Detectron2Handler())  # likewise registered at import time
diff --git a/detectron2/checkpoint/detection_checkpoint.py b/detectron2/checkpoint/detection_checkpoint.py
new file mode 100644
index 0000000..4e0a35e
--- /dev/null
+++ b/detectron2/checkpoint/detection_checkpoint.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import pickle
+from fvcore.common.checkpoint import Checkpointer
+from fvcore.common.file_io import PathManager
+
+import detectron2.utils.comm as comm
+
+from .c2_model_loading import align_and_update_state_dicts
+
+
+class DetectionCheckpointer(Checkpointer):
+    """
+    Same as :class:`Checkpointer`, but also handles detectron & detectron2 model zoo files and
+    converts legacy models. NOTE: ".pkl" files are unpickled, so only load files you trust.
+    """
+
+    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+        is_main_process = comm.is_main_process()
+        super().__init__(
+            model,
+            save_dir,
+            save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
+            **checkpointables,
+        )
+
+    def _load_file(self, filename):
+        if filename.endswith(".pkl"):
+            with PathManager.open(filename, "rb") as f:
+                data = pickle.load(f, encoding="latin1")
+            if "model" in data and "__author__" in data:
+                # file is in Detectron2 model zoo format
+                self.logger.info("Reading a file from '{}'".format(data["__author__"]))
+                return data
+            else:
+                # assume file is from Caffe2 / Detectron1 model zoo
+                if "blobs" in data:
+                    # Detection models have "blobs", but ImageNet models don't
+                    data = data["blobs"]
+                data = {k: v for k, v in data.items() if not k.endswith("_momentum")}
+                return {"model": data, "__author__": "Caffe2", "matching_heuristics": True}
+
+        loaded = super()._load_file(filename)  # load native pth checkpoint
+        if "model" not in loaded:
+            loaded = {"model": loaded}
+        return loaded
+
+    def _load_model(self, checkpoint):
+        if checkpoint.get("matching_heuristics", False):
+            self._convert_ndarray_to_tensor(checkpoint["model"])
+            # convert weights by name-matching heuristics
+            model_state_dict = self.model.state_dict()
+            align_and_update_state_dicts(
+                model_state_dict,
+                checkpoint["model"],
+                c2_conversion=checkpoint.get("__author__", None) == "Caffe2",
+            )
+            checkpoint["model"] = model_state_dict
+        # the standard loading path runs in every case, on the (possibly re-matched) checkpoint
+        super()._load_model(checkpoint)
diff --git a/detectron2/config/__init__.py b/detectron2/config/__init__.py
new file mode 100644
index 0000000..e6f3f5e
--- /dev/null
+++ b/detectron2/config/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .compat import downgrade_config, upgrade_config
+from .config import CfgNode, get_cfg, global_cfg, set_global_cfg
+
+__all__ = [
+    "CfgNode",
+    "get_cfg",
+    "global_cfg",
+    "set_global_cfg",
+    "downgrade_config",
+    "upgrade_config",
+]
diff --git a/detectron2/config/compat.py b/detectron2/config/compat.py
new file mode 100644
index 0000000..41fe3a0
--- /dev/null
+++ b/detectron2/config/compat.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Backward compatibility of configs.
+
+Instructions to bump version:
++ It's not needed to bump version if new keys are added.
+  It's only needed when backward-incompatible changes happen
+  (i.e., some existing keys disappear, or the meaning of a key changes)
++ To bump version, do the following:
+    1. Increment _C.VERSION in defaults.py
+    2. Add a converter in this file.
+
+      Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X,
+      and a function "downgrade" which in-place downgrades config from X to X-1
+
+      In each function, VERSION is left unchanged.
+
+      Each converter assumes that its input has the relevant keys
+      (i.e., the input is not a partial config).
+    3. Run the tests (test_config.py) to make sure the upgrade & downgrade
+       functions are consistent.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+from .config import CfgNode as CN
+from .defaults import _C
+
+__all__ = ["upgrade_config", "downgrade_config"]
+
+
+def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN:
+    """
+    Upgrade a config from its current version to a newer version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int): defaults to the latest version.
+    """
+    cfg = cfg.clone()
+    if to_version is None:
+        to_version = _C.VERSION
+
+    assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version):
+        converter = globals()["ConverterV" + str(k + 1)]
+        converter.upgrade(cfg)
+        cfg.VERSION = k + 1
+    return cfg
+
+
+def downgrade_config(cfg: CN, to_version: int) -> CN:
+    """
+    Downgrade a config from its current version to an older version.
+
+    Args:
+        cfg (CfgNode):
+        to_version (int):
+
+    Note:
+        A general downgrade of arbitrary configs is not always possible due to the
+        different functionalities in different versions.
+        The purpose of downgrade is only to recover the defaults in old versions,
+        allowing it to load an old partial yaml config.
+        Therefore, the implementation only needs to fill in the default values
+        in the old version when a general downgrade is not possible.
+    """
+    cfg = cfg.clone()
+    assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format(
+        cfg.VERSION, to_version
+    )
+    for k in range(cfg.VERSION, to_version, -1):
+        converter = globals()["ConverterV" + str(k)]
+        converter.downgrade(cfg)
+        cfg.VERSION = k - 1
+    return cfg
+
+
+def guess_version(cfg: CN, filename: str) -> int:
+    """
+    Guess the version of a partial config where the VERSION field is not specified.
+    Returns the version, or the latest if cannot make a guess.
+
+    This makes it easier for users to migrate.
+    """
+    logger = logging.getLogger(__name__)
+
+    def _has(name: str) -> bool:
+        cur = cfg
+        for n in name.split("."):
+            if n not in cur:
+                return False
+            cur = cur[n]
+        return True
+
+    # Most users' partial configs have "MODEL.WEIGHT", so guess on it
+    ret = None
+    if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"):
+        ret = 1
+
+    if ret is not None:
+        logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret))
+    else:
+        ret = _C.VERSION
+        logger.warning(
+            "Config '{}' has no VERSION. Assuming it to be compatible with latest v{}.".format(
+                filename, ret
+            )
+        )
+    return ret
+
+
+def _rename(cfg: CN, old: str, new: str) -> None:
+    old_keys = old.split(".")
+    new_keys = new.split(".")
+
+    def _set(key_seq: List[str], val: str) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            if k not in cur:
+                cur[k] = CN()
+            cur = cur[k]
+        cur[key_seq[-1]] = val
+
+    def _get(key_seq: List[str]) -> CN:
+        cur = cfg
+        for k in key_seq:
+            cur = cur[k]
+        return cur
+
+    def _del(key_seq: List[str]) -> None:
+        cur = cfg
+        for k in key_seq[:-1]:
+            cur = cur[k]
+        del cur[key_seq[-1]]
+        if len(cur) == 0 and len(key_seq) > 1:
+            _del(key_seq[:-1])
+
+    _set(new_keys, _get(old_keys))
+    _del(old_keys)
+
+
+class _RenameConverter:
+    """
+    A converter that handles simple rename.
+    """
+
+    RENAME: List[Tuple[str, str]] = []  # list of tuples of (old name, new name)
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME:
+            _rename(cfg, old, new)
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        for old, new in cls.RENAME[::-1]:
+            _rename(cfg, new, old)
+
+
+class ConverterV1(_RenameConverter):
+    RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")]  # sole v0 <-> v1 change
+
+
+class ConverterV2(_RenameConverter):
+    """
+    v1 <-> v2: a large bulk of rename, before public release, plus the anchor-setting moves below.
+    """
+
+    RENAME = [
+        ("MODEL.WEIGHT", "MODEL.WEIGHTS"),
+        ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"),
+        ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT",
+            "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT",
+        ),
+        (
+            "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD",
+            "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH",
+        ),
+        ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"),
+        ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"),
+        ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"),
+        ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"),
+        ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"),
+        ("TEST.AUG_ON", "TEST.AUG.ENABLED"),
+        ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"),
+        ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"),
+        ("TEST.AUG_FLIP", "TEST.AUG.FLIP"),
+    ]
+
+    @classmethod
+    def upgrade(cls, cfg: CN) -> None:
+        super().upgrade(cfg)
+
+        if cfg.MODEL.META_ARCHITECTURE == "RetinaNet":  # keep RetinaNet's anchors, drop RPN's
+            _rename(
+                cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS"
+            )
+            _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"]
+        else:  # keep RPN's anchors, drop RetinaNet's
+            _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS")
+            _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES")
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"]
+            del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"]
+        del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"]  # dropped in both cases
+
+    @classmethod
+    def downgrade(cls, cfg: CN) -> None:
+        super().downgrade(cfg)
+        # the single ANCHOR_GENERATOR setting is restored under both RPN and RETINANET
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS")
+        _rename(cfg, "MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES")
+        cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS
+        cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES
+        cfg.MODEL.RETINANET.ANCHOR_STRIDES = []  # this is not used anywhere in any version
diff --git a/detectron2/config/config.py b/detectron2/config/config.py
new file mode 100644
index 0000000..f9d01cb
--- /dev/null
+++ b/detectron2/config/config.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+from fvcore.common.config import CfgNode as _CfgNode
+
+
+class CfgNode(_CfgNode):
+    """
+    A `fvcore.common.config.CfgNode` with two differences:
+
+    1. Unsafe yaml loading is the default.
+      This may lead to arbitrary code execution: never load a config
+      file from an untrusted source without first manually inspecting
+      the content of the file.
+    2. Config versioning is supported.
+      Merging a config file of an older version converts it automatically.
+
+    """
+
+    def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None:
+        """
+        Merge the yaml file `cfg_filename` into this config, which must be at the latest
+        version; older files are converted first. `allow_unsafe` defaults to True here.
+        """
+        loaded = type(self)(_CfgNode.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe))
+
+        # imported here because defaults.py needs to import CfgNode
+        from .defaults import _C
+
+        assert (
+            _C.VERSION == self.VERSION
+        ), "CfgNode.merge_from_file is only allowed on a config of latest version!"
+
+        logger = logging.getLogger(__name__)
+        file_ver = loaded.get("VERSION", None)
+        if file_ver is None:
+            from .compat import guess_version
+
+            file_ver = guess_version(loaded, cfg_filename)
+        assert file_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format(
+            file_ver, self.VERSION
+        )
+
+        if file_ver == self.VERSION:
+            self.merge_from_other_cfg(loaded)
+            return
+
+        # imported here because compat.py needs to import CfgNode
+        from .compat import upgrade_config, downgrade_config
+
+        template = (
+            "Loading an old v{} config file '{}' by automatically upgrading to v{}. "
+            "See docs/CHANGELOG.md for instructions to update your files."
+        )
+        logger.warning(template.format(file_ver, cfg_filename, self.VERSION))
+        # To convert, first obtain a full config at an old version
+        old_self = downgrade_config(self, to_version=file_ver)
+        old_self.merge_from_other_cfg(loaded)
+        new_config = upgrade_config(old_self)
+        self.clear()
+        self.update(new_config)
+
+
+global_cfg = CfgNode()  # module-level instance; its contents are replaced by set_global_cfg()
+
+
+def get_cfg() -> CfgNode:
+    """
+    Create a new config holding the default values.
+
+    Returns:
+        CfgNode: a clone of the defaults defined in `defaults.py`.
+    """
+    # imported lazily: defaults.py itself imports CfgNode from this module
+    from .defaults import _C as default_cfg
+    return default_cfg.clone()
+
+
+def set_global_cfg(cfg: CfgNode) -> None:
+    """
+    Replace the contents of the global config with those of `cfg`.
+
+    If `cfg` has the key "KEY", then after calling
+    `set_global_cfg(cfg)` the key can be read with:
+
+    .. code-block:: python
+
+        from detectron2.config import global_cfg
+        print(global_cfg.KEY)
+
+    The global config is a hacky feature meant for quick prototyping / research
+    exploration: it lets code anywhere read config values without the config
+    object or the values being passed down to it.
+    """
+    # Mutated in place (never rebound), so existing importers of `global_cfg` see the update.
+    global_cfg.clear()
+    global_cfg.update(cfg)
diff --git a/detectron2/config/defaults.py b/detectron2/config/defaults.py
new file mode 100644
index 0000000..f0daaaf
--- /dev/null
+++ b/detectron2/config/defaults.py
@@ -0,0 +1,567 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import CfgNode as CN
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+_C.VERSION = 2
+
+_C.MODEL = CN()
+_C.MODEL.LOAD_PROPOSALS = False
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+# Path (possibly with schema like catalog:// or detectron2://) to a checkpoint file
+# to be loaded to the model. You can find available models in the model zoo.
+_C.MODEL.WEIGHTS = ""
+
+# Values to be used for image normalization (BGR order).
+# To train on images of different number of channels, just set different mean & std.
+# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
+_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
+# When using pre-trained models in Detectron1 or any MSRA models,
+# std has been absorbed into its conv1 weights, so the std needs to be set 1.
+# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
+_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
+
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = (800,)
+# Sample size of smallest side by choice or random selection from range given by
+# INPUT.MIN_SIZE_TRAIN
+_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+
+# `True` if cropping is used for data augmentation during training
+_C.INPUT.CROP = CN({"ENABLED": False})
+# Cropping type:
+# - "relative" crop (H * CROP.SIZE[0], W * CROP.SIZE[1]) part of an input of size (H, W)
+# - "relative_range" uniformly sample relative crop size from between [CROP.SIZE[0], CROP.SIZE[1]]
+#   and [1, 1] and use it as in "relative" scenario.
+# - "absolute" crop part of an input with absolute size: (CROP.SIZE[0], CROP.SIZE[1]).
+_C.INPUT.CROP.TYPE = "relative_range"
+# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
+# pixels if CROP.TYPE is "absolute"
+_C.INPUT.CROP.SIZE = [0.9, 0.9]
+
+
+# Whether the model needs RGB, YUV, HSV etc.
+# Should be one of the modes defined here, as we use PIL to read the image:
+# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
+# with BGR being the one exception. One can set image format to BGR, we will
+# internally use RGB for conversion and flip the channels over
+_C.INPUT.FORMAT = "BGR"
+# The ground truth mask format that the model will use.
+# Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
+_C.INPUT.MASK_FORMAT = "polygon"  # alternative: "bitmask"
+
+
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training. Must be registered in DatasetCatalog
+_C.DATASETS.TRAIN = ()
+# List of the pre-computed proposal files for training, which must be consistent
+# with datasets listed in DATASETS.TRAIN.
+_C.DATASETS.PROPOSAL_FILES_TRAIN = ()
+# Number of top scoring precomputed proposals to keep for training
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
+# List of the dataset names for testing. Must be registered in DatasetCatalog
+_C.DATASETS.TEST = ()
+# List of the pre-computed proposal files for test, which must be consistent
+# with datasets listed in DATASETS.TEST.
+_C.DATASETS.PROPOSAL_FILES_TEST = ()
+# Number of top scoring precomputed proposals to keep for test
+_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
+
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Options: TrainingSampler, RepeatFactorTrainingSampler
+_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
+# Repeat threshold for RepeatFactorTrainingSampler
+_C.DATALOADER.REPEAT_THRESHOLD = 0.0
+# if True, the dataloader will filter out images that have no associated
+# annotations at train time.
+_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
+
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+_C.MODEL.BACKBONE.NAME = "build_resnet_backbone"
+# Add StopGrad at a specified stage so the bottom layers are frozen
+_C.MODEL.BACKBONE.FREEZE_AT = 2
+
+
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+# Names of the input feature maps to be used by FPN
+# They must have contiguous power of 2 strides
+# e.g., ["res2", "res3", "res4", "res5"]
+_C.MODEL.FPN.IN_FEATURES = []
+_C.MODEL.FPN.OUT_CHANNELS = 256
+
+# Options: "" (no norm), "GN"
+_C.MODEL.FPN.NORM = ""
+
+# Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
+_C.MODEL.FPN.FUSE_TYPE = "sum"
+
+
+# ---------------------------------------------------------------------------- #
+# Proposal generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.PROPOSAL_GENERATOR = CN()
+# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
+_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+# Proposal height and width both need to be greater than MIN_SIZE
+# (at the scale used during training or inference)
+_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
+
+
+# ---------------------------------------------------------------------------- #
+# Anchor generator options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ANCHOR_GENERATOR = CN()
+# The generator can be any name in the ANCHOR_GENERATOR registry
+_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+# anchor sizes given in absolute pixels w.r.t. the scaled network input.
+# Format: list of lists of sizes. SIZES[i] specifies the list of sizes
+# to use for IN_FEATURES[i]; len(SIZES) == len(IN_FEATURES) must be true,
+# or len(SIZES) == 1 is true and size list SIZES[0] is used for all
+# IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
+# Anchor aspect ratios.
+# Format is list of lists of sizes. ASPECT_RATIOS[i] specifies the list of aspect ratios
+# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
+# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
+# for all IN_FEATURES.
+_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
+# Anchor angles.
+# list[float], the angle in degrees, for each input feature map.
+# ANGLES[i] specifies the list of angles for IN_FEATURES[i].
+_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
+
+
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead"  # used by RPN_HEAD_REGISTRY
+
+# Names of the input feature maps to be used by RPN
+# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
+_C.MODEL.RPN.IN_FEATURES = ["res4"]
+# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.BOUNDARY_THRESH = -1
+# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example: 1)
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example: 0)
+# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
+# are ignored (-1)
+_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
+_C.MODEL.RPN.IOU_LABELS = [0, -1, 1]
+# Total number of RPN examples per image
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
+_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0
+_C.MODEL.RPN.LOSS_WEIGHT = 1.0
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+# When FPN is used, this limit is applied per level and then again to the union
+# of proposals from all levels
+# NOTE: When FPN is used, the meaning of this config is different from Detectron1.
+# It means per-batch topk in Detectron1, but per-image topk here.
+# See "modeling/rpn/rpn_outputs.py" for details.
+_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
+# Number of foreground classes
+_C.MODEL.ROI_HEADS.NUM_CLASSES = 80
+# Names of the input feature maps to be used by ROI heads
+# Currently all heads (box, mask, ...) use the same input feature map list
+# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
+_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
+# IOU overlap ratios [IOU_THRESHOLD]
+# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
+# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
+_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
+# RoI minibatch size *per image* (number of regions of interest [ROIs])
+# Total number of RoIs per training minibatch =
+#   ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
+# E.g., a common configuration is: 512 * 16 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used on test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
+# inference.
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
+# If True, augment proposals with ground-truth boxes before sampling proposals to
+# train ROI heads.
+_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
+
+
+# ---------------------------------------------------------------------------- #
+# Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_HEAD = CN()
+# C4 models don't use the head name option
+# Options for non-C4 models: FastRCNNConvFCHead
+_C.MODEL.ROI_BOX_HEAD.NAME = ""
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
+_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0
+# Hidden layer dimension for FC layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
+_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
+# Channel dimension for Conv layers in the RoI box head
+_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_BOX_HEAD.NORM = ""
+# Whether to use class agnostic for bbox regression
+_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
+
+# ---------------------------------------------------------------------------- #
+# Cascaded Box Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_BOX_CASCADE_HEAD = CN()
+# The number of cascade stages is implicitly defined by the length of the following two configs.
+_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
+    (10.0, 10.0, 5.0, 5.0),
+    (20.0, 20.0, 10.0, 10.0),
+    (30.0, 30.0, 15.0, 15.0),
+)
+_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
+
+
+# ---------------------------------------------------------------------------- #
+# Mask Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0  # The number of convs in the mask head
+_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
+# Normalization method for the convolution layers.
+# Options: "" (no norm), "GN", "SyncBN".
+_C.MODEL.ROI_MASK_HEAD.NORM = ""
+# Whether to use class agnostic for mask prediction
+_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+
+# ---------------------------------------------------------------------------- #
+# Keypoint Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17  # 17 is the number of keypoints in COCO.
+
+# Images with too few (or no) keypoints are excluded from training.
+_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
+# Normalize by the total number of visible keypoints in the minibatch if True.
+# Otherwise, normalize by the total number of keypoints that could ever exist
+# in the minibatch.
+# The keypoint softmax loss is only calculated on visible keypoints.
+# Since the number of visible keypoints can vary significantly between
+# minibatches, this has the effect of up-weighting the importance of
+# minibatches with few visible keypoints. (Imagine the extreme case of
+# only one visible keypoint versus N: in the case of N, each one
+# contributes 1/N to the gradient compared to the single keypoint
+# determining the gradient direction). Instead, we can normalize the
+# loss by the total number of keypoints, if it were the case that all
+# keypoints were visible in a full minibatch. (Returning to the example,
+# this means that the one visible keypoint contributes as much as each
+# of the N keypoints.)
+_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
+# Multi-task loss weight to use for keypoints
+# Recommended values:
+#   - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
+#   - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
+_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
+# Type of pooling operation applied to the incoming feature map for each RoI
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
+
+# ---------------------------------------------------------------------------- #
+# Semantic Segmentation Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SEM_SEG_HEAD = CN()
+_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
+_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
+# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
+# the corresponding pixel.
+_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
+# Number of classes in the semantic segmentation head
+_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
+# Number of channels in the 3x3 convs inside semantic-FPN heads.
+_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
+# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
+_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
+# Normalization method for the convolution layers. Options: "" (no norm), "GN".
+_C.MODEL.SEM_SEG_HEAD.NORM = "GN"
+_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
+
+_C.MODEL.PANOPTIC_FPN = CN()
+# Scaling of all losses from instance detection / segmentation head.
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
+
+# options when combining instance & semantic segmentation outputs
+_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True})
+_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
+_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
+_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
+
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Head
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+
+# This is the number of foreground classes.
+_C.MODEL.RETINANET.NUM_CLASSES = 80
+
+_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
+
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+
+# IoU overlap ratio [bg, fg] for labeling anchors.
+# Anchors with < bg are labeled negative (0)
+# Anchors with >= bg and < fg are ignored (-1)
+# Anchors with >= fg are labeled positive (1)
+_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
+_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
+
+# Prior prob for rare case (i.e. foreground) at the beginning of training.
+# This is used to set the bias for the logits layer of the classifier subnet.
+# This improves training stability in the case of heavy class imbalance.
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+
+# Inference cls score threshold, only anchors with score > SCORE_THRESH_TEST are
+# considered for inference (to improve speed)
+_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
+_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
+_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
+
+# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
+_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Loss parameters
+_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
+_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
+_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
+
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.DEPTH = 50
+_C.MODEL.RESNETS.OUT_FEATURES = ["res4"]  # res4 for C4 backbone, res2..5 for FPN backbone
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Options: "FrozenBN", "GN", "SyncBN", "BN"
+_C.MODEL.RESNETS.NORM = "FrozenBN"
+
+# Baseline width of each group.
+# Scaling this parameter will scale the width of all bottleneck layers.
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+# Output width of res2. Scaling this parameter will scale the width of all 1x1 convs in ResNet
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+# Apply Deformable Convolution in stages
+# Specify if apply deform_conv on Res2, Res3, Res4, Res5
+_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
+# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
+# Use False for DeformableV1.
+_C.MODEL.RESNETS.DEFORM_MODULATED = False
+# Number of groups in deformable conv.
+_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
+
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+
+# See detectron2/solver/build.py for LR scheduler options
+_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
+
+_C.SOLVER.MAX_ITER = 40000
+
+_C.SOLVER.BASE_LR = 0.001
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.WEIGHT_DECAY = 0.0001
+# The weight decay that's applied to parameters of normalization layers
+# (typically the affine transformation)
+_C.SOLVER.WEIGHT_DECAY_NORM = 0.0
+
+_C.SOLVER.GAMMA = 0.1
+_C.SOLVER.STEPS = (30000,)
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000
+_C.SOLVER.WARMUP_ITERS = 1000
+_C.SOLVER.WARMUP_METHOD = "linear"
+
+_C.SOLVER.CHECKPOINT_PERIOD = 5000
+
+# Number of images per batch across all machines.
+# If we have 16 GPUs and IMS_PER_BATCH = 32,
+# each GPU will see 2 images per batch.
+_C.SOLVER.IMS_PER_BATCH = 16
+
+# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
+# biases. This is not useful (at least for recent models). You should avoid
+# changing these and they exist only to reproduce Detectron v1 training if
+# desired.
+_C.SOLVER.BIAS_LR_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_BIAS = _C.SOLVER.WEIGHT_DECAY
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+# For end-to-end tests to verify the expected accuracy.
+# Each item is [task, metric, value, tolerance]
+# e.g.: [['bbox', 'AP', 38.5, 0.2]]
+_C.TEST.EXPECTED_RESULTS = []
+# The period (in terms of steps) to evaluate the model during training.
+# Set to 0 to disable.
+_C.TEST.EVAL_PERIOD = 0
+# The sigmas used to calculate keypoint OKS.
+# When empty it will use the defaults in COCO.
+# Otherwise it should have the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
+_C.TEST.KEYPOINT_OKS_SIGMAS = []
+# Maximum number of detections to return per image during inference (100 is
+# based on the limit established for the COCO dataset).
+_C.TEST.DETECTIONS_PER_IMAGE = 100
+
+_C.TEST.AUG = CN({"ENABLED": False})
+_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
+_C.TEST.AUG.MAX_SIZE = 4000
+_C.TEST.AUG.FLIP = True
+
+_C.TEST.PRECISE_BN = CN({"ENABLED": False})
+_C.TEST.PRECISE_BN.NUM_ITER = 200
+
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+# Directory where output files are written
+_C.OUTPUT_DIR = "./output"
+# Set seed to negative to fully randomize everything.
+# Set seed to positive to use a fixed seed. Note that a fixed seed does not
+# guarantee fully deterministic behavior.
+_C.SEED = -1
+# Benchmark different cudnn algorithms.
+# If input images have very different sizes, this option will have large overhead
+# for about 10k iterations. It usually hurts total time, but can benefit for certain models.
+# If input images have the same or similar sizes, benchmark is often helpful.
+_C.CUDNN_BENCHMARK = False
+
+# global config is for quick hack purposes.
+# You can set them in command line or config files,
+# and access it with:
+#
+# from detectron2.config import global_cfg
+# print(global_cfg.HACK)
+#
+# Do not commit any configs into it.
+_C.GLOBAL = CN()
+_C.GLOBAL.HACK = 1.0
diff --git a/detectron2/data/__init__.py b/detectron2/data/__init__.py
new file mode 100644
index 0000000..b04d1dc
--- /dev/null
+++ b/detectron2/data/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from . import transforms  # isort:skip
+
+from .build import (
+    build_detection_test_loader,
+    build_detection_train_loader,
+    get_detection_dataset_dicts,
+    load_proposals_into_dataset,
+    print_instances_class_histogram,
+)
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import DatasetFromList, MapDataset
+from .dataset_mapper import DatasetMapper
+
+# ensure the builtin datasets are registered
+from . import datasets, samplers  # isort:skip
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/data/build.py b/detectron2/data/build.py
new file mode 100644
index 0000000..7095cf7
--- /dev/null
+++ b/detectron2/data/build.py
@@ -0,0 +1,424 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import bisect
+import copy
+import itertools
+import logging
+import numpy as np
+import pickle
+import torch.utils.data
+from fvcore.common.file_io import PathManager
+from tabulate import tabulate
+from termcolor import colored
+
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import get_world_size
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.logger import log_first_n
+
+from . import samplers
+from .catalog import DatasetCatalog, MetadataCatalog
+from .common import DatasetFromList, MapDataset
+from .dataset_mapper import DatasetMapper
+from .detection_utils import check_metadata_consistency
+
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+
+__all__ = [
+    "build_detection_train_loader",
+    "build_detection_test_loader",
+    "get_detection_dataset_dicts",
+    "load_proposals_into_dataset",
+    "print_instances_class_histogram",
+]
+
+
+def filter_images_with_only_crowd_annotations(dataset_dicts):
+    """
+    Drop every image that has no annotations at all, or whose annotations are
+    all marked as crowd (i.e. keep only images with a non-crowd annotation).
+    This is a common training-time preprocessing step on the COCO dataset.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+            Every dict must carry an "annotations" list.
+
+    Returns:
+        list[dict]: a new list holding only the dicts that passed the filter.
+    """
+
+    def has_non_crowd(annotations):
+        # A missing "iscrowd" key counts as a regular (non-crowd) instance.
+        return any(ann.get("iscrowd", 0) == 0 for ann in annotations)
+
+    total = len(dataset_dicts)
+    kept = [record for record in dataset_dicts if has_non_crowd(record["annotations"])]
+    remaining = len(kept)
+
+    # Report how many images the filter discarded.
+    message = "Removed {} images with no usable annotations. {} images left.".format(
+        total - remaining, remaining
+    )
+    logging.getLogger(__name__).info(message)
+
+    return kept
+
+
+def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image):
+    """
+    Drop images whose total number of visible keypoints is too small.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        min_keypoints_per_image (int): an image is kept only if the visible
+            keypoints summed over all its annotations reach this number.
+
+    Returns:
+        list[dict]: the same format as dataset_dicts, but filtered.
+    """
+
+    def count_visible(record):
+        # "keypoints" is a flat [x1, y1, v1, x2, y2, v2, ...] list; every third
+        # entry is a visibility flag, and a flag > 0 counts as visible.
+        total = 0
+        for ann in record["annotations"]:
+            if "keypoints" in ann:
+                total += (np.array(ann["keypoints"][2::3]) > 0).sum()
+        return total
+
+    kept = [
+        record for record in dataset_dicts if count_visible(record) >= min_keypoints_per_image
+    ]
+    num_removed = len(dataset_dicts) - len(kept)
+    logging.getLogger(__name__).info(
+        "Removed {} images with fewer than {} keypoints.".format(
+            num_removed, min_keypoints_per_image
+        )
+    )
+    return kept
+
+
+def load_proposals_into_dataset(dataset_dicts, proposal_file):
+    """
+    Load precomputed object proposals into the dataset.
+
+    The proposal file should be a pickled dict with the following keys:
+    - "ids": list[int] or list[str], the image ids
+    - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id
+    - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores
+        corresponding to the boxes.
+    - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``.
+
+    Args:
+        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
+        proposal_file (str): file path of pre-computed proposals, in pkl format.
+
+    Returns:
+        list[dict]: the same dicts, updated in place with the proposal fields.
+
+    Raises:
+        KeyError: if an image in `dataset_dicts` has no entry in the proposal file.
+    """
+    logging.getLogger(__name__).info("Loading proposals from: {}".format(proposal_file))
+
+    # NOTE(security): pickle can run arbitrary code; only load trusted proposal files.
+    with PathManager.open(proposal_file, "rb") as f:
+        proposals = pickle.load(f, encoding="latin1")
+
+    # Rename the key names in D1 proposal files
+    for old_key, new_key in (("indexes", "ids"), ("scores", "objectness_logits")):
+        if old_key in proposals:
+            proposals[new_key] = proposals.pop(old_key)
+
+    # Index the proposals by image id; ids are compared as str since they could be int.
+    wanted = {str(record["image_id"]) for record in dataset_dicts}
+    id_to_index = {str(k): i for i, k in enumerate(proposals["ids"]) if str(k) in wanted}
+
+    # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS'
+    bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS
+
+    for record in dataset_dicts:
+        image_id = str(record["image_id"])
+        if image_id not in id_to_index:
+            raise KeyError("No proposals for image id '{}' in {}".format(image_id, proposal_file))
+        i = id_to_index[image_id]
+        boxes, objectness_logits = proposals["boxes"][i], proposals["objectness_logits"][i]
+        # Sort the proposals in descending order of the scores
+        inds = objectness_logits.argsort()[::-1]
+        record["proposal_boxes"] = boxes[inds]
+        record["proposal_objectness_logits"] = objectness_logits[inds]
+        record["proposal_bbox_mode"] = bbox_mode
+    return dataset_dicts
+
+
+def _quantize(x, bin_edges):
+    """Return, for each value in `x`, its bisect_right index into the sorted bin edges."""
+    # sorted() already builds a new list, so the caller's edges are never mutated.
+    edges = sorted(bin_edges)
+    return [bisect.bisect_right(edges, value) for value in x]
+
+
+def print_instances_class_histogram(dataset_dicts, class_names):
+    """
+    Log a table with the number of non-crowd instances of each class.
+
+    Args:
+        dataset_dicts (list[dict]): list of dataset dicts.
+        class_names (list[str]): list of class names (zero-indexed).
+    """
+    num_classes = len(class_names)
+    hist_bins = np.arange(num_classes + 1)
+    # `np.int` was removed in NumPy 1.24; the builtin `int` selects the same dtype.
+    histogram = np.zeros((num_classes,), dtype=int)
+    for entry in dataset_dicts:
+        classes = [x["category_id"] for x in entry["annotations"] if not x.get("iscrowd", 0)]
+        histogram += np.histogram(classes, bins=hist_bins)[0]
+
+    N_COLS = min(6, len(class_names) * 2)
+
+    def short_name(x):
+        # make long class names shorter. useful for lvis
+        return x[:11] + ".." if len(x) > 13 else x
+
+    data = list(
+        itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)])
+    )
+    total_num_instances = sum(data[1::2])
+    data.extend([None] * (-len(data) % N_COLS))  # pad to a whole number of table rows
+    if num_classes > 1:
+        data.extend(["total", total_num_instances])
+    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
+    table = tabulate(
+        data,
+        headers=["category", "#instances"] * (N_COLS // 2),
+        tablefmt="pipe",
+        numalign="left",
+        stralign="center",
+    )
+    log_first_n(
+        logging.INFO,
+        "Distribution of training instances among all {} categories:\n".format(num_classes)
+        + colored(table, "cyan"),
+        key="message",
+    )
+
+
+def build_batch_data_sampler(
+    sampler, images_per_batch, group_bin_edges=None, grouping_features=None
+):
+    """
+    Wrap an index sampler so that it yields batches of dataset indices,
+    optionally keeping each batch inside one group to improve training efficiency.
+
+    Grouping is used only when both `group_bin_edges` and `grouping_features`
+    are non-empty; otherwise a plain ``BatchSampler`` is built.
+
+    Args:
+        sampler (torch.utils.data.sampler.Sampler): any subclass of
+            :class:`torch.utils.data.sampler.Sampler`.
+        images_per_batch (int): the batch size. Without grouping, trailing
+            indices that cannot fill a whole batch are dropped (``drop_last=True``).
+        group_bin_edges (None, list[number], tuple[number]): bin edges that
+            define len(group_bin_edges) + 1 groups. None or empty disables grouping.
+        grouping_features (None, list[number], tuple[number]): for each index in
+            the underlying dataset, the value that is quantized against the bin
+            edges to pick that index's group. None or empty disables grouping.
+
+    Returns:
+        A batch sampler: ``torch.utils.data.sampler.BatchSampler`` without grouping,
+        ``samplers.GroupedBatchSampler`` with grouping.
+    """
+    use_grouping = bool(group_bin_edges) and bool(grouping_features)
+    if not use_grouping:
+        # Drop the last partial batch so that every batch has the same size.
+        # NOTE when we add batch inference support, make sure not to use this.
+        return torch.utils.data.sampler.BatchSampler(
+            sampler, images_per_batch, drop_last=True
+        )
+
+    for arg in (group_bin_edges, grouping_features):
+        assert isinstance(arg, (list, tuple))
+    group_ids = _quantize(grouping_features, group_bin_edges)
+    return samplers.GroupedBatchSampler(sampler, group_ids, images_per_batch)
+
+
+def get_detection_dataset_dicts(
+    dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None
+):
+    """
+    Fetch the named datasets from :class:`DatasetCatalog`, merge them into one list
+    and apply the optional proposal loading and filtering steps.
+
+    Args:
+        dataset_names (list[str]): a list of dataset names
+        filter_empty (bool): whether to filter out images without instance annotations
+        min_keypoints (int): filter out images with fewer keypoints than
+            `min_keypoints`. Set to 0 to do nothing.
+        proposal_files (list[str]): if given, a list of object proposal files
+            that match each dataset in `dataset_names`.
+
+    Returns:
+        list[dict]: the dicts of all requested datasets, concatenated in order.
+    """
+    assert len(dataset_names)
+    dataset_dicts = [DatasetCatalog.get(name) for name in dataset_names]
+    for name, dicts in zip(dataset_names, dataset_dicts):
+        assert len(dicts), "Dataset '{}' is empty!".format(name)
+
+    if proposal_files is not None:
+        assert len(dataset_names) == len(proposal_files)
+        # load precomputed proposals from proposal files
+        dataset_dicts = list(map(load_proposals_into_dataset, dataset_dicts, proposal_files))
+
+    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
+    first = dataset_dicts[0]
+    if "annotations" not in first:
+        return dataset_dicts
+    # Keep images without instance-level GT if the dataset has semantic labels.
+    if filter_empty and "sem_seg_file_name" not in first:
+        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
+    if min_keypoints > 0:
+        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)
+
+    try:
+        class_names = MetadataCatalog.get(dataset_names[0]).thing_classes
+        check_metadata_consistency("thing_classes", dataset_names)
+        print_instances_class_histogram(dataset_dicts, class_names)
+    except AttributeError:  # class names are not available for this dataset
+        pass
+    return dataset_dicts
+
+
+def build_detection_train_loader(cfg, mapper=None):
+    """
+    Build the training data loader described by `cfg`:
+
+    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
+    2. Start workers to work on the dicts. Each worker will:
+      * Map each metadata dict into another format to be consumed by the model.
+      * Batch them by simply putting dicts into a list.
+    The batched ``list[mapped_dict]`` is what this dataloader will return.
+
+    Args:
+        cfg (CfgNode): the config. `SOLVER.IMS_PER_BATCH` must be a multiple of (and at
+            least) `get_world_size()`; batches built here hold the quotient of the two.
+        mapper (callable): a callable which takes a sample (dict) from dataset and returns
+            the format to be consumed by the model. Defaults to `DatasetMapper(cfg, True)`.
+
+    Returns:
+        a torch DataLoader object
+    """
+    num_workers = get_world_size()  # NOTE: not the same as cfg.DATALOADER.NUM_WORKERS
+    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
+    assert (
+        images_per_batch % num_workers == 0
+    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    assert (
+        images_per_batch >= num_workers
+    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
+        images_per_batch, num_workers
+    )
+    images_per_worker = images_per_batch // num_workers  # batch size given to the sampler
+
+    dataset_dicts = get_detection_dataset_dicts(
+        cfg.DATASETS.TRAIN,
+        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
+        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+        if cfg.MODEL.KEYPOINT_ON
+        else 0,
+        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+    )
+    dataset = DatasetFromList(dataset_dicts, copy=False)  # elements are not deep-copied
+
+    # Bin edges for batching images with similar aspect ratios. If ASPECT_RATIO_GROUPING
+    # is enabled, we define two bins with an edge at height / width = 1.
+    group_bin_edges = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []
+    aspect_ratios = [float(img["height"]) / float(img["width"]) for img in dataset]
+
+    if mapper is None:
+        mapper = DatasetMapper(cfg, True)
+    dataset = MapDataset(dataset, mapper)  # aspect ratios above come from the unmapped dicts
+
+    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
+    logger = logging.getLogger(__name__)
+    logger.info("Using training sampler {}".format(sampler_name))
+    if sampler_name == "TrainingSampler":
+        sampler = samplers.TrainingSampler(len(dataset))
+    elif sampler_name == "RepeatFactorTrainingSampler":
+        sampler = samplers.RepeatFactorTrainingSampler(
+            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
+        )
+    else:
+        raise ValueError("Unknown training sampler: {}".format(sampler_name))
+    batch_sampler = build_batch_data_sampler(
+        sampler, images_per_worker, group_bin_edges, aspect_ratios
+    )
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.DATALOADER.NUM_WORKERS,
+        batch_sampler=batch_sampler,
+        collate_fn=trivial_batch_collator,  # batches stay plain lists of mapped dicts
+        worker_init_fn=worker_init_reset_seed,
+    )
+    return data_loader
+
+
+def build_detection_test_loader(cfg, dataset_name, mapper=None):
+    """
+    Build an inference data loader for a single dataset.
+    It mirrors `build_detection_train_loader`, except that it reads the dataset
+    named by `dataset_name` (not the names in cfg) and always uses batch size 1.
+
+    Args:
+        cfg: a detectron2 CfgNode
+        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
+        mapper (callable): a callable which takes a sample (dict) from dataset
+           and returns the format to be consumed by the model.
+           By default it will be `DatasetMapper(cfg, False)`.
+
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+        dataset, with test-time transformation and batching.
+    """
+    proposal_files = None
+    if cfg.MODEL.LOAD_PROPOSALS:
+        # Pick the proposal file listed at the same position as the dataset in DATASETS.TEST.
+        position = list(cfg.DATASETS.TEST).index(dataset_name)
+        proposal_files = [cfg.DATASETS.PROPOSAL_FILES_TEST[position]]
+
+    dataset_dicts = get_detection_dataset_dicts(
+        [dataset_name], filter_empty=False, proposal_files=proposal_files
+    )
+
+    dataset = DatasetFromList(dataset_dicts)
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    dataset = MapDataset(dataset, mapper)
+
+    # Always use 1 image per worker during inference since this is the
+    # standard when reporting inference time in papers.
+    batch_sampler = torch.utils.data.sampler.BatchSampler(
+        samplers.InferenceSampler(len(dataset)), 1, drop_last=False
+    )
+
+    return torch.utils.data.DataLoader(
+        dataset,
+        num_workers=cfg.DATALOADER.NUM_WORKERS,
+        batch_sampler=batch_sampler,
+        collate_fn=trivial_batch_collator,
+    )
+
+
+def trivial_batch_collator(batch):
+    """
+    A batch collator that does nothing: it returns `batch` unchanged.
+    """
+    return batch
+
+
+def worker_init_reset_seed(worker_id):  # passed as `worker_init_fn` to the train DataLoader
+    seed_all_rng(np.random.randint(2 ** 31) + worker_id)  # random base offset by worker_id
diff --git a/detectron2/data/catalog.py b/detectron2/data/catalog.py
new file mode 100644
index 0000000..4bababb
--- /dev/null
+++ b/detectron2/data/catalog.py
@@ -0,0 +1,211 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import types
+from typing import List
+
+from detectron2.utils.logger import log_first_n
+
+__all__ = ["DatasetCatalog", "MetadataCatalog"]
+
+
+class DatasetCatalog(object):
+    """
+    A global registry of datasets and of the functions that load them.
+
+    Every entry maps a string (a name that identifies a dataset,
+    e.g. "coco_2014_train") to a function which parses the dataset and
+    returns the samples in the format of `list[dict]`.
+
+    The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details)
+    if used with the data loader functionalities in `data/build.py,data/detection_transform.py`.
+
+    Having this catalog makes it easy to choose different datasets,
+    by just using the strings in the config.
+    """
+
+    _REGISTERED = {}
+
+    @staticmethod
+    def register(name, func):
+        """
+        Add a dataset to the catalog.
+
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+            func (callable): a callable which takes no arguments and returns a list of dicts.
+        """
+        registry = DatasetCatalog._REGISTERED
+        assert callable(func), "You must register a function with `DatasetCatalog.register`!"
+        assert name not in registry, "Dataset '{}' is already registered!".format(name)
+        registry[name] = func
+
+    @staticmethod
+    def get(name):
+        """
+        Look up the function registered under `name`, call it and return its result.
+
+        Args:
+            name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+
+        Returns:
+            list[dict]: dataset annotations.
+
+        Raises:
+            KeyError: if no dataset was registered under `name`.
+        """
+        try:
+            loader = DatasetCatalog._REGISTERED[name]
+        except KeyError:
+            message = "Dataset '{}' is not registered! Available datasets are: {}"
+            raise KeyError(message.format(name, ", ".join(DatasetCatalog._REGISTERED.keys())))
+        return loader()
+
+    @staticmethod
+    def list() -> List[str]:
+        """
+        List all registered datasets.
+
+        Returns:
+            list[str]: the names of all registered datasets.
+        """
+        return [*DatasetCatalog._REGISTERED]
+
+    @staticmethod
+    def clear():
+        """
+        Remove all registered datasets.
+        """
+        DatasetCatalog._REGISTERED.clear()
+
+
+class Metadata(types.SimpleNamespace):
+    """
+    A class that supports simple attribute setter/getter (a set attribute cannot change value).
+    It is intended for storing metadata of a dataset and making it accessible globally.
+
+    Examples:
+
+    .. code-block:: python
+
+        # somewhere when you load the data:
+        MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"]
+
+        # somewhere when you print statistics or visualize:
+        classes = MetadataCatalog.get("mydataset").thing_classes
+    """
+
+    # the name of the dataset
+    # set default to N/A so that `self.name` in the errors will not trigger getattr again
+    name: str = "N/A"
+
+    _RENAMED = {  # old attribute name -> current attribute name
+        "class_names": "thing_classes",
+        "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id",
+        "stuff_class_names": "stuff_classes",
+    }
+
+    def __getattr__(self, key):  # Python calls this only when normal lookup fails
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            return getattr(self, self._RENAMED[key])  # forward old name to the new one
+
+        raise AttributeError(
+            "Attribute '{}' does not exist in the metadata of '{}'. Available keys are {}.".format(
+                key, self.name, str(self.__dict__.keys())
+            )
+        )
+
+    def __setattr__(self, key, val):
+        if key in self._RENAMED:
+            log_first_n(
+                logging.WARNING,
+                "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]),
+                n=10,
+            )
+            setattr(self, self._RENAMED[key], val)  # store the value under the new name
+
+        # Ensure that metadata of the same name stays consistent
+        try:
+            oldval = getattr(self, key)  # raises AttributeError when `key` is not set yet
+            assert oldval == val, (
+                "Attribute '{}' in the metadata of '{}' cannot be set "
+                "to a different value!\n{} != {}".format(key, self.name, oldval, val)
+            )
+        except AttributeError:
+            super().__setattr__(key, val)  # first assignment of this key
+
+    def as_dict(self):
+        """
+        Returns all the metadata as a dict (a shallow copy of the instance `__dict__`).
+        Adding or removing keys in the returned dict will not affect the Metadata object.
+        """
+        return copy.copy(self.__dict__)
+
+    def set(self, **kwargs):
+        """
+        Set multiple metadata with kwargs and return `self`, so that calls can be chained.
+        """
+        for k, v in kwargs.items():
+            setattr(self, k, v)
+        return self
+
+    def get(self, key, default=None):
+        """
+        Access an attribute and return its value if exists.
+        Otherwise return default.
+        """
+        try:
+            return getattr(self, key)
+        except AttributeError:
+            return default
+
+
+class MetadataCatalog:
+    """
+    A global name -> :class:`Metadata` lookup table.
+
+    The metadata associated with a certain name is a singleton: once created,
+    the metadata will stay alive and will be returned by future calls to `get(name)`.
+
+    It's like global variables, so don't abuse it.
+    It's meant for storing knowledge that's constant and shared across the execution
+    of the program, e.g.: the class names in COCO.
+    """
+
+    _NAME_TO_META = {}
+
+    @staticmethod
+    def get(name):
+        """
+        Args:
+            name (str): name of a dataset (e.g. coco_2014_train).
+
+        Returns:
+            Metadata: the :class:`Metadata` instance registered under this name;
+            an empty one is created and stored first if none exists yet.
+        """
+        assert len(name)
+        registry = MetadataCatalog._NAME_TO_META
+        if name not in registry:
+            registry[name] = Metadata(name=name)
+            return registry[name]
+
+        meta = registry[name]
+        # TODO this is for the BC breaking change in D15247032.
+        # Remove this in the future.
+        if hasattr(meta, "dataset_name"):
+            logging.getLogger().warning(
+                """
+The 'dataset_name' key in metadata is no longer used for
+sharing metadata among splits after D15247032! Add
+metadata to each split (now called dataset) separately!
+                    """
+            )
+            parent_meta = MetadataCatalog.get(meta.dataset_name).as_dict()
+            meta.set(**parent_meta)
+        return meta
diff --git a/detectron2/data/common.py b/detectron2/data/common.py
new file mode 100644
index 0000000..6512c3d
--- /dev/null
+++ b/detectron2/data/common.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import random
+import torch.utils.data as data
+
+from detectron2.utils.serialize import PicklableWrapper
+
+__all__ = ["MapDataset", "DatasetFromList"]
+
+
+class MapDataset(data.Dataset):
+    """
+    Map a function over the elements in a dataset.
+
+    Args:
+        dataset: a dataset where map function is applied.
+        map_func: a callable which maps the element in dataset. map_func is
+            responsible for error handling, when error happens, it needs to
+            return None so the MapDataset will randomly use other
+            elements from the dataset.
+    """
+
+    def __init__(self, dataset, map_func):
+        self._dataset = dataset
+        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work
+
+        self._rng = random.Random(42)
+        self._fallback_candidates = set(range(len(dataset)))  # indices not known to fail
+
+    def __len__(self):
+        return len(self._dataset)
+
+    def __getitem__(self, idx):
+        """Map the element at `idx`; if that fails, retry with random other indices."""
+        retry_count = 0
+        cur_idx = int(idx)
+        while True:
+            mapped = self._map_func(self._dataset[cur_idx])  # avoid shadowing module `data`
+            if mapped is not None:
+                self._fallback_candidates.add(cur_idx)
+                return mapped
+
+            # _map_func fails for this idx, use a random new index from the pool
+            retry_count += 1
+            self._fallback_candidates.discard(cur_idx)
+            # random.sample() rejects sets since Python 3.11, so draw from a sorted list.
+            cur_idx = self._rng.sample(sorted(self._fallback_candidates), k=1)[0]
+
+            if retry_count >= 3:
+                logging.getLogger(__name__).warning(
+                    "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
+                        idx, retry_count
+                    )
+                )
+
+
+class DatasetFromList(data.Dataset):
+    """
+    Expose a plain Python list as a torch Dataset whose items are the list elements.
+    """
+
+    def __init__(self, lst: list, copy: bool = True):
+        """
+        Args:
+            lst (list): a list which contains elements to produce.
+            copy (bool): if True, every produced element is a deepcopy,
+                so that callers may modify it in place without touching
+                the element stored in `lst`.
+        """
+        self._copy = copy
+        self._lst = lst
+
+    def __len__(self):
+        return len(self._lst)
+
+    def __getitem__(self, idx):
+        element = self._lst[idx]
+        # `copy` here is the module; the `copy` argument of __init__ shadows it
+        # only inside __init__.
+        return copy.deepcopy(element) if self._copy else element
diff --git a/detectron2/data/dataset_mapper.py b/detectron2/data/dataset_mapper.py
new file mode 100644
index 0000000..300901f
--- /dev/null
+++ b/detectron2/data/dataset_mapper.py
@@ -0,0 +1,146 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import numpy as np
+import torch
+from fvcore.common.file_io import PathManager
+from PIL import Image
+
+from . import detection_utils as utils
+from . import transforms as T
+
+"""
+This file contains the default mapping that's applied to "dataset dicts".
+"""
+
+__all__ = ["DatasetMapper"]
+
+
+class DatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and maps it into a format used by the model.
+
+    This is the default callable to be used to map your dataset dict into training data.
+    You may need to follow it to implement your own one for customized logic.
+
+    The callable currently does the following:
+    1. Reads the image from "file_name"
+    2. Applies cropping/geometric transforms to the image and annotations
+    3. Converts the image and annotations to Tensor and :class:`Instances`
+    """
+
+    def __init__(self, cfg, is_train=True):
+        if cfg.INPUT.CROP.ENABLED and is_train:
+            self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
+            logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen))
+        else:
+            self.crop_gen = None  # cropping disabled in cfg, or not training
+
+        self.tfm_gens = utils.build_transform_gen(cfg, is_train)
+
+        # fmt: off
+        self.img_format     = cfg.INPUT.FORMAT
+        self.mask_on        = cfg.MODEL.MASK_ON
+        self.mask_format    = cfg.INPUT.MASK_FORMAT
+        self.keypoint_on    = cfg.MODEL.KEYPOINT_ON
+        self.load_proposals = cfg.MODEL.LOAD_PROPOSALS
+        # fmt: on
+        if self.keypoint_on and is_train:
+            # Flip only makes sense in training
+            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        else:
+            self.keypoint_hflip_indices = None
+
+        if self.load_proposals:  # the two attributes below exist only in this case
+            self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE
+            self.proposal_topk = (
+                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
+                if is_train
+                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
+            )
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        # USER: Write your own image loading if it's not from a file
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+
+        if "annotations" not in dataset_dict:  # no instances: crop_gen runs like any transform
+            image, transforms = T.apply_transform_gens(
+                ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image
+            )
+        else:
+            # Crop around an instance if there are instances in the image.
+            # USER: Remove if you don't use cropping
+            if self.crop_gen:
+                crop_tfm = utils.gen_crop_transform_with_instance(
+                    self.crop_gen.get_crop_size(image.shape[:2]),
+                    image.shape[:2],
+                    np.random.choice(dataset_dict["annotations"]),
+                )
+                image = crop_tfm.apply_image(image)
+            image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+            if self.crop_gen:
+                transforms = crop_tfm + transforms  # crop first, then the other transforms
+
+        image_shape = image.shape[:2]  # h, w
+
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+        # Can use uint8 if it turns out to be slow some day
+
+        # USER: Remove if you don't use pre-computed proposals.
+        if self.load_proposals:
+            utils.transform_proposals(
+                dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk
+            )
+
+        if not self.is_train:  # inference: drop the ground-truth keys and return early
+            dataset_dict.pop("annotations", None)
+            dataset_dict.pop("sem_seg_file_name", None)
+            return dataset_dict
+
+        if "annotations" in dataset_dict:
+            # USER: Modify this if you want to keep them for some reason.
+            for anno in dataset_dict["annotations"]:
+                if not self.mask_on:
+                    anno.pop("segmentation", None)
+                if not self.keypoint_on:
+                    anno.pop("keypoints", None)
+
+            # USER: Implement additional transformations if you have other types of data
+            annos = [
+                utils.transform_instance_annotations(
+                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+                )
+                for obj in dataset_dict.pop("annotations")
+                if obj.get("iscrowd", 0) == 0  # crowd annotations are skipped
+            ]
+            instances = utils.annotations_to_instances(
+                annos, image_shape, mask_format=self.mask_format
+            )
+            # Create a tight bounding box from masks, useful when image is cropped
+            if self.crop_gen and instances.has("gt_masks"):
+                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+            dataset_dict["instances"] = utils.filter_empty_instances(instances)
+
+        # USER: Remove if you don't do semantic/panoptic segmentation.
+        if "sem_seg_file_name" in dataset_dict:
+            with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f:
+                sem_seg_gt = Image.open(f)
+                sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8")
+            sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
+            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
+            dataset_dict["sem_seg"] = sem_seg_gt
+        return dataset_dict
diff --git a/detectron2/data/datasets/README.md b/detectron2/data/datasets/README.md
new file mode 100644
index 0000000..9fb3e4f
--- /dev/null
+++ b/detectron2/data/datasets/README.md
@@ -0,0 +1,9 @@
+
+
+### Common Datasets
+
+The datasets implemented here do not need to load the data into the final format.
+Each one should provide only the minimal data structure needed to use the dataset, so that it can be very efficient.
+
+For example, for an image dataset, just provide the file names and labels, but don't read the images.
+Let the downstream code decide how to read them.
diff --git a/detectron2/data/datasets/__init__.py b/detectron2/data/datasets/__init__.py
new file mode 100644
index 0000000..a2bfbea
--- /dev/null
+++ b/detectron2/data/datasets/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .cityscapes import load_cityscapes_instances
+from .coco import load_coco_json, load_sem_seg
+from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta
+from .register_coco import register_coco_instances, register_coco_panoptic_separated
+from . import builtin  # ensure the builtin datasets are registered
+
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/detectron2/data/datasets/builtin.py b/detectron2/data/datasets/builtin.py
new file mode 100644
index 0000000..28ec435
--- /dev/null
+++ b/detectron2/data/datasets/builtin.py
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+"""
+This file registers pre-defined datasets at hard-coded paths, and their metadata.
+
+We hard-code metadata for common datasets. This will enable:
+1. Consistency check when loading the datasets
+2. Use models on these standard datasets directly and run demos,
+   without having to download the dataset annotations
+
+We hard-code some paths to the datasets, which are assumed to
+exist in "./datasets/".
+
+Users SHOULD NOT use this file to create new datasets or metadata for new datasets.
+To add a new dataset, refer to the tutorial "docs/DATASETS.md".
+"""
+
+import os
+
+from detectron2.data import MetadataCatalog, DatasetCatalog
+from .register_coco import register_coco_instances, register_coco_panoptic_separated
+from .lvis import register_lvis_instances, get_lvis_instances_meta
+from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic
+from .pascal_voc import register_pascal_voc
+from .builtin_meta import _get_builtin_metadata
+
+
+# ==== Predefined datasets and splits for COCO ==========
+
+_PREDEFINED_SPLITS_COCO = {}
+_PREDEFINED_SPLITS_COCO["coco"] = {
+    "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"),
+    "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"),
+    "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"),
+    "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"),
+    "coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/instances_valminusminival2014.json",
+    ),
+    "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"),
+    "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"),
+    "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"),
+    "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"),
+    "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"),
+}
+
+_PREDEFINED_SPLITS_COCO["coco_person"] = {
+    "keypoints_coco_2014_train": (
+        "coco/train2014",
+        "coco/annotations/person_keypoints_train2014.json",
+    ),
+    "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"),
+    "keypoints_coco_2014_minival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_minival2014.json",
+    ),
+    "keypoints_coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_valminusminival2014.json",
+    ),
+    "keypoints_coco_2014_minival_100": (
+        "coco/val2014",
+        "coco/annotations/person_keypoints_minival2014_100.json",
+    ),
+    "keypoints_coco_2017_train": (
+        "coco/train2017",
+        "coco/annotations/person_keypoints_train2017.json",
+    ),
+    "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"),
+    "keypoints_coco_2017_val_100": (
+        "coco/val2017",
+        "coco/annotations/person_keypoints_val2017_100.json",
+    ),
+}
+
+
+_PREDEFINED_SPLITS_COCO_PANOPTIC = {
+    "coco_2017_train_panoptic": (
+        # This is the original panoptic annotation directory
+        "coco/panoptic_train2017",
+        "coco/annotations/panoptic_train2017.json",
+        # This directory contains semantic annotations that are
+        # converted from panoptic annotations.
+        # It is used by PanopticFPN.
+        # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py
+        # to create these directories.
+        "coco/panoptic_stuff_train2017",
+    ),
+    "coco_2017_val_panoptic": (
+        "coco/panoptic_val2017",
+        "coco/annotations/panoptic_val2017.json",
+        "coco/panoptic_stuff_val2017",
+    ),
+    "coco_2017_val_100_panoptic": (
+        "coco/panoptic_val2017_100",
+        "coco/annotations/panoptic_val2017_100.json",
+        "coco/panoptic_stuff_val2017_100",
+    ),
+}
+
+
+def register_all_coco(root="datasets"):
+    """
+    Register every predefined COCO instance/keypoint split, then the panoptic splits.
+
+    Args:
+        root (str): directory that the relative image and annotation paths are joined to.
+    """
+    for meta_name, splits in _PREDEFINED_SPLITS_COCO.items():
+        for split, (img_dir, ann_file) in splits.items():
+            # Annotation locations that already contain "://" are used verbatim.
+            ann_path = ann_file if "://" in ann_file else os.path.join(root, ann_file)
+            img_path = os.path.join(root, img_dir)
+            register_coco_instances(split, _get_builtin_metadata(meta_name), ann_path, img_path)
+
+    for name, (pan_root, pan_json, sem_root) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
+        # The image root and instance json are read from the metadata stored in
+        # MetadataCatalog under the same name without its "_panoptic" suffix.
+        inst_meta = MetadataCatalog.get(name[: -len("_panoptic")])
+        register_coco_panoptic_separated(
+            name,
+            _get_builtin_metadata("coco_panoptic_separated"),
+            inst_meta.image_root,
+            os.path.join(root, pan_root),
+            os.path.join(root, pan_json),
+            os.path.join(root, sem_root),
+            inst_meta.json_file,
+        )
+
+
+# ==== Predefined datasets and splits for LVIS ==========
+
+
+_PREDEFINED_SPLITS_LVIS = {
+    "lvis_v0.5": {
+        "lvis_v0.5_train": ("coco/train2017", "lvis/lvis_v0.5_train.json"),
+        "lvis_v0.5_val": ("coco/val2017", "lvis/lvis_v0.5_val.json"),
+        "lvis_v0.5_val_rand_100": ("coco/val2017", "lvis/lvis_v0.5_val_rand_100.json"),
+        "lvis_v0.5_test": ("coco/test2017", "lvis/lvis_v0.5_image_info_test.json"),
+    }
+}
+
+
+def register_all_lvis(root="datasets"):
+    """Register every predefined LVIS split, joining its relative paths to `root`."""
+    for version, splits in _PREDEFINED_SPLITS_LVIS.items():
+        for split, (img_dir, ann_file) in splits.items():
+            # Annotation locations that already contain "://" are used verbatim.
+            ann_path = ann_file if "://" in ann_file else os.path.join(root, ann_file)
+            img_path = os.path.join(root, img_dir)
+            register_lvis_instances(
+                split, get_lvis_instances_meta(version), ann_path, img_path
+            )
+
+
+# ==== Predefined splits for raw cityscapes images ===========
+
+
+_RAW_CITYSCAPES_SPLITS = {
+    "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train", "cityscapes/gtFine/train"),
+    "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val", "cityscapes/gtFine/val"),
+    "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test", "cityscapes/gtFine/test"),
+}
+
+
+def register_all_cityscapes(root="datasets"):
+    """
+    Register instance- and semantic-segmentation variants of each raw Cityscapes split.
+    """
+    for name_template, (img_subdir, gt_subdir) in _RAW_CITYSCAPES_SPLITS.items():
+        meta = _get_builtin_metadata("cityscapes")
+        image_dir, gt_dir = os.path.join(root, img_subdir), os.path.join(root, gt_subdir)
+        dirs = {"image_dir": image_dir, "gt_dir": gt_dir}
+
+        # Bind the directories as lambda defaults so each loader keeps its own split's paths.
+        inst_key = name_template.format(task="instance_seg")
+        DatasetCatalog.register(
+            inst_key,
+            lambda x=image_dir, y=gt_dir: load_cityscapes_instances(
+                x, y, from_json=True, to_polygons=True
+            ),
+        )
+        MetadataCatalog.get(inst_key).set(**dirs, evaluator_type="cityscapes", **meta)
+
+        sem_key = name_template.format(task="sem_seg")
+        DatasetCatalog.register(
+            sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y)
+        )
+        MetadataCatalog.get(sem_key).set(**dirs, evaluator_type="sem_seg", **meta)
+
+
+# ==== Predefined splits for PASCAL VOC ===========
+def register_all_pascal_voc(root="datasets"):
+    """Register the predefined PASCAL VOC 2007/2012 splits located under `root`."""
+    voc_splits = (
+        ("voc_2007_trainval", "VOC2007", "trainval", 2007),
+        ("voc_2007_train", "VOC2007", "train", 2007),
+        ("voc_2007_val", "VOC2007", "val", 2007),
+        ("voc_2007_test", "VOC2007", "test", 2007),
+        ("voc_2012_trainval", "VOC2012", "trainval", 2012),
+        ("voc_2012_train", "VOC2012", "train", 2012),
+        ("voc_2012_val", "VOC2012", "val", 2012),
+    )
+    for name, dirname, split, year in voc_splits:
+        register_pascal_voc(name, os.path.join(root, dirname), split, year)
+        MetadataCatalog.get(name).evaluator_type = "pascal_voc"
+
+
+# Register them all under "./datasets"
+register_all_coco()
+register_all_lvis()
+register_all_cityscapes()
+register_all_pascal_voc()
diff --git a/detectron2/data/datasets/builtin_meta.py b/detectron2/data/datasets/builtin_meta.py
new file mode 100644
index 0000000..74c7986
--- /dev/null
+++ b/detectron2/data/datasets/builtin_meta.py
@@ -0,0 +1,267 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+# All coco categories, together with their nice-looking visualization colors
+# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
+COCO_CATEGORIES = [
+    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"},
+    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"},
+    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"},
+    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"},
+    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"},
+    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"},
+    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"},
+    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"},
+    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"},
+    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"},
+    {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"},
+    {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"},
+    {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"},
+    {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"},
+    {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"},
+    {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"},
+    {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"},
+    {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"},
+    {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"},
+    {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"},
+    {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"},
+    {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"},
+    {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"},
+    {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"},
+    {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"},
+    {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"},
+    {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"},
+    {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"},
+    {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"},
+    {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"},
+    {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"},
+    {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"},
+    {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"},
+    {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"},
+    {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"},
+    {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"},
+    {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"},
+    {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"},
+    {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"},
+    {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"},
+    {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"},
+    {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"},
+    {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"},
+    {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"},
+    {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"},
+    {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"},
+    {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"},
+    {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"},
+    {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"},
+    {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"},
+    {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"},
+    {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"},
+    {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"},
+    {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"},
+    {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"},
+    {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"},
+    {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"},
+    {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"},
+    {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"},
+    {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"},
+    {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"},
+    {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"},
+    {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"},
+    {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"},
+    {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"},
+    {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"},
+    {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"},
+    {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"},
+    {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"},
+    {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"},
+    {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"},
+    {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"},
+    {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"},
+    {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"},
+    {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"},
+    {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"},
+    {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"},
+    {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"},
+    {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"},
+    {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"},
+    {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"},
+    {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"},
+    {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"},
+    {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"},
+    {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"},
+    {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"},
+    {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"},
+    {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"},
+    {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"},
+    {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"},
+    {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"},
+    {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"},
+    {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"},
+    {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"},
+    {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"},
+    {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"},
+    {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"},
+    {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"},
+    {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"},
+    {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"},
+    {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"},
+    {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"},
+    {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"},
+    {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"},
+    {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"},
+    {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"},
+    {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"},
+    {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"},
+    {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"},
+    {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"},
+    {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"},
+    {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"},
+    {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"},
+    {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"},
+    {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"},
+    {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"},
+    {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"},
+    {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"},
+    {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"},
+    {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"},
+    {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"},
+    {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"},
+    {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"},
+    {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"},
+    {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"},
+    {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"},
+    {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"},
+    {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"},
+    {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"},
+    {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"},
+    {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"},
+    {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"},
+    {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
+]
+
+# fmt: off
+COCO_PERSON_KEYPOINT_NAMES = (
+    "nose",
+    "left_eye", "right_eye",
+    "left_ear", "right_ear",
+    "left_shoulder", "right_shoulder",
+    "left_elbow", "right_elbow",
+    "left_wrist", "right_wrist",
+    "left_hip", "right_hip",
+    "left_knee", "right_knee",
+    "left_ankle", "right_ankle",
+)
+# fmt: on
+
+# Pairs of keypoints that should be exchanged under horizontal flipping
+COCO_PERSON_KEYPOINT_FLIP_MAP = (
+    ("left_eye", "right_eye"),
+    ("left_ear", "right_ear"),
+    ("left_shoulder", "right_shoulder"),
+    ("left_elbow", "right_elbow"),
+    ("left_wrist", "right_wrist"),
+    ("left_hip", "right_hip"),
+    ("left_knee", "right_knee"),
+    ("left_ankle", "right_ankle"),
+)
+
+# rules for pairs of keypoints to draw a line between, and the line color to use.
+KEYPOINT_CONNECTION_RULES = [
+    # face
+    ("left_ear", "left_eye", (102, 204, 255)),
+    ("right_ear", "right_eye", (51, 153, 255)),
+    ("left_eye", "nose", (102, 0, 204)),
+    ("nose", "right_eye", (51, 102, 255)),
+    # upper-body
+    ("left_shoulder", "right_shoulder", (255, 128, 0)),
+    ("left_shoulder", "left_elbow", (153, 255, 204)),
+    ("right_shoulder", "right_elbow", (128, 229, 255)),
+    ("left_elbow", "left_wrist", (153, 255, 153)),
+    ("right_elbow", "right_wrist", (102, 255, 224)),
+    # lower-body
+    ("left_hip", "right_hip", (255, 102, 0)),
+    ("left_hip", "left_knee", (255, 255, 77)),
+    ("right_hip", "right_knee", (153, 255, 204)),
+    ("left_knee", "left_ankle", (191, 255, 128)),
+    ("right_knee", "right_ankle", (255, 195, 77)),
+]
+
+
+def _get_coco_instances_meta():
+    """
+    Returns metadata for the COCO "thing" (instance) categories.
+    """
+    things = [cat for cat in COCO_CATEGORIES if cat["isthing"] == 1]
+    assert len(things) == 80, len(things)
+
+    return {
+        # Mapping from the non-contiguous COCO category id to an id in [0, 79]
+        "thing_dataset_id_to_contiguous_id": {cat["id"]: i for i, cat in enumerate(things)},
+        "thing_classes": [cat["name"] for cat in things],
+        "thing_colors": [cat["color"] for cat in things],
+    }
+
+
+def _get_coco_panoptic_separated_meta():
+    """
+    Returns metadata for "separated" version of the panoptic segmentation dataset:
+    the stuff metadata built here merged with the COCO thing metadata.
+    """
+    stuff = [cat for cat in COCO_CATEGORIES if cat["isthing"] == 0]
+    assert len(stuff) == 53, len(stuff)
+
+    # For semantic segmentation, this maps the ids in the dataset (used for processing
+    # results) to contiguous stuff ids (in [0, 53], used in models).
+    # Contiguous id 0 is an extra category "things": when converting COCO panoptic
+    # annotations to semantic annotations, the "thing" category is labeled 0.
+    id_map = {cat["id"]: idx for idx, cat in enumerate(stuff, start=1)}
+    id_map[0] = 0
+
+    # 54 names for COCO stuff categories (including "things")
+    names = ["things"]
+    names.extend(cat["name"].replace("-other", "").replace("-merged", "") for cat in stuff)
+
+    # NOTE: the color for "things" was picked arbitrarily
+    colors = [[82, 18, 128]]
+    colors.extend(cat["color"] for cat in stuff)
+
+    meta = {
+        "stuff_dataset_id_to_contiguous_id": id_map,
+        "stuff_classes": names,
+        "stuff_colors": colors,
+    }
+    # Add the COCO thing metadata on top of the stuff metadata.
+    meta.update(_get_coco_instances_meta())
+    return meta
+
+
+def _get_builtin_metadata(dataset_name):
+    """
+    Return the hard-coded metadata dict of a builtin dataset; unknown names raise KeyError.
+    """
+    if dataset_name == "coco":
+        return _get_coco_instances_meta()
+    if dataset_name == "coco_panoptic_separated":
+        return _get_coco_panoptic_separated_meta()
+    if dataset_name == "coco_person":
+        return {
+            "thing_classes": ["person"],
+            "keypoint_names": COCO_PERSON_KEYPOINT_NAMES,
+            "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP,
+            "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES,
+        }
+    if dataset_name != "cityscapes":
+        raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
+    # fmt: off
+    thing_classes = [
+        "person", "rider", "car", "truck",
+        "bus", "train", "motorcycle", "bicycle",
+    ]
+    stuff_classes = [
+        "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light",
+        "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car",
+        "truck", "bus", "train", "motorcycle", "bicycle", "license plate",
+    ]
+    # fmt: on
+    return {"thing_classes": thing_classes, "stuff_classes": stuff_classes}
diff --git a/detectron2/data/datasets/cityscapes.py b/detectron2/data/datasets/cityscapes.py
new file mode 100644
index 0000000..261282c
--- /dev/null
+++ b/detectron2/data/datasets/cityscapes.py
@@ -0,0 +1,318 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import functools
+import glob
+import json
+import logging
+import multiprocessing as mp
+import numpy as np
+import os
+from itertools import chain
+import pycocotools.mask as mask_util
+from PIL import Image
+
+from detectron2.structures import BoxMode
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.comm import get_world_size
+from fvcore.common.file_io import PathManager
+
+try:
+    import cv2  # noqa
+except ImportError:
+    # OpenCV is an optional dependency at the moment
+    pass
+
+
+def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True):
+    """
+    Load Cityscapes instance annotations, converting the per-image files in a process pool.
+
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+    if from_json:
+        assert to_polygons, (
+            "Cityscapes's json annotations are in polygon format. "
+            "Converting to mask format is not supported now."
+        )
+    files = []
+    suffix = "leftImg8bit.png"
+    for image_file in glob.glob(os.path.join(image_dir, "**/*.png")):
+        assert image_file.endswith(suffix)
+        # Part of the image path after `image_dir`, without the "leftImg8bit.png" suffix
+        stem = image_file[len(image_dir) : -len(suffix)]
+        instance_file = gt_dir + stem + "gtFine_instanceIds.png"
+        assert os.path.isfile(instance_file), instance_file
+        label_file = gt_dir + stem + "gtFine_labelIds.png"
+        assert os.path.isfile(label_file), label_file
+        json_file = gt_dir + stem + "gtFine_polygons.json"
+        files.append((image_file, instance_file, label_file, json_file))
+    assert len(files), "No images found in {}".format(image_dir)
+
+    logger = logging.getLogger(__name__)
+    logger.info("Preprocessing cityscapes annotations ...")
+    # This is still not fast: all workers will do duplicate work, and it will
+    # take up to 10m on an 8-GPU server.
+    worker = functools.partial(
+        cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons
+    )
+    # The context manager shuts the worker processes down once the blocking map is done.
+    with mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4)) as pool:
+        ret = pool.map(worker, files)
+    logger.info("Loaded {} images from {}".format(len(ret), image_dir))
+
+    # Map cityscape ids to contiguous ids
+    from cityscapesscripts.helpers.labels import labels
+    labels = [label for label in labels if label.hasInstances and not label.ignoreInEval]
+    dataset_id_to_contiguous_id = {label.id: idx for idx, label in enumerate(labels)}
+    for dict_per_image in ret:
+        for anno in dict_per_image["annotations"]:
+            anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]]
+    return ret
+
+
+def load_cityscapes_semantic(image_dir, gt_dir):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train".
+
+    Returns:
+        list[dict]: one dict per image with "file_name", "sem_seg_file_name",
+            "height" and "width" (the image size is read from the polygon json file).
+    """
+    image_suffix = "leftImg8bit.png"
+    records = []
+    for image_path in glob.glob(os.path.join(image_dir, "**/*.png")):
+        assert image_path.endswith(image_suffix)
+        # Ground-truth files reuse the part of the image path that follows `image_dir`.
+        gt_stem = gt_dir + image_path[len(image_dir) : -len(image_suffix)]
+
+        sem_seg_path = gt_stem + "gtFine_labelTrainIds.png"
+        assert os.path.isfile(
+            sem_seg_path
+        ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py"  # noqa
+
+        polygon_json = gt_stem + "gtFine_polygons.json"
+        with PathManager.open(polygon_json, "r") as f:
+            polygon_info = json.load(f)
+        records.append(
+            {
+                "file_name": image_path,
+                "sem_seg_file_name": sem_seg_path,
+                "height": polygon_info["imgHeight"],
+                "width": polygon_info["imgWidth"],
+            }
+        )
+    return records
+
+
+def cityscapes_files_to_dict(files, from_json, to_polygons):
+    """
+    Parse cityscapes annotation files to a dict.
+
+    Args:
+        files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file)
+        from_json (bool): whether to read annotations from the raw json file or the png files.
+        to_polygons (bool): whether to represent the segmentation as polygons
+            (COCO's format) instead of masks (cityscapes's format).
+
+    Returns:
+        A dict in Detectron2 Dataset format.
+    """
+    from cityscapesscripts.helpers.labels import id2label, name2label
+
+    image_file, instance_id_file, _, json_file = files
+
+    annos = []
+
+    if from_json:
+        from shapely.geometry import MultiPolygon, Polygon
+
+        with PathManager.open(json_file, "r") as f:
+            jsonobj = json.load(f)
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": jsonobj["imgHeight"],
+            "width": jsonobj["imgWidth"],
+        }
+
+        # `polygons_union` contains the union of all valid polygons.
+        polygons_union = Polygon()
+
+        # CityscapesScripts draw the polygons in sequential order
+        # and each polygon *overwrites* existing ones. See
+        # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa
+        # We use reverse order, and each polygon *avoids* early ones.
+        # This will resolve the polygon overlaps in the same way as CityscapesScripts.
+        for obj in jsonobj["objects"][::-1]:
+            if "deleted" in obj:  # cityscapes data format specific
+                continue
+            label_name = obj["label"]
+
+            try:
+                label = name2label[label_name]
+            except KeyError:
+                if label_name.endswith("group"):  # crowd area
+                    label = name2label[label_name[: -len("group")]]
+                else:
+                    raise
+            if label.id < 0:  # cityscapes data format
+                continue
+
+            # Cityscapes's raw annotations uses integer coordinates
+            # Therefore +0.5 here
+            poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5
+            # CityscapesScript uses PIL.ImageDraw.polygon to rasterize
+            # polygons for evaluation. This function operates in integer space
+            # and draws each pixel whose center falls into the polygon.
+            # Therefore it draws a polygon which is 0.5 "fatter" in expectation.
+            # We therefore dilate the input polygon by 0.5 as our input.
+            poly = Polygon(poly_coord).buffer(0.5, resolution=4)
+
+            if not label.hasInstances or label.ignoreInEval:
+                # even if we won't store the polygon it still contributes to overlaps resolution
+                polygons_union = polygons_union.union(poly)
+                continue
+
+            # Take non-overlapping part of the polygon
+            poly_wo_overlaps = poly.difference(polygons_union)
+            if poly_wo_overlaps.is_empty:
+                continue
+            polygons_union = polygons_union.union(poly)
+
+            anno = {}
+            anno["iscrowd"] = label_name.endswith("group")
+            anno["category_id"] = label.id
+
+            if isinstance(poly_wo_overlaps, Polygon):
+                poly_list = [poly_wo_overlaps]
+            elif isinstance(poly_wo_overlaps, MultiPolygon):
+                poly_list = poly_wo_overlaps.geoms
+            else:
+                raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps))
+
+            poly_coord = []
+            for poly_el in poly_list:
+                # COCO API can work only with exterior boundaries now, hence we store only them.
+                # TODO: store both exterior and interior boundaries once other parts of the
+                # codebase support holes in polygons.
+                poly_coord.append(list(chain(*poly_el.exterior.coords)))
+            anno["segmentation"] = poly_coord
+            (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds
+
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+
+            annos.append(anno)
+    else:
+        # See also the official annotation parsing scripts at
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py  # noqa
+        with PathManager.open(instance_id_file, "rb") as f:
+            inst_image = np.asarray(Image.open(f), order="F")
+        # ids < 24 are stuff labels (filtering them first is about 5% faster)
+        flattened_ids = np.unique(inst_image[inst_image >= 24])
+
+        ret = {
+            "file_name": image_file,
+            "image_id": os.path.basename(image_file),
+            "height": inst_image.shape[0],
+            "width": inst_image.shape[1],
+        }
+
+        for instance_id in flattened_ids:
+            # For non-crowd annotations, instance_id // 1000 is the label_id
+            # Crowd annotations have <1000 instance ids
+            label_id = instance_id // 1000 if instance_id >= 1000 else instance_id
+            label = id2label[label_id]
+            if not label.hasInstances or label.ignoreInEval:
+                continue
+
+            anno = {}
+            anno["iscrowd"] = instance_id < 1000
+            anno["category_id"] = label.id
+
+            mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F")
+
+            inds = np.nonzero(mask)
+            ymin, ymax = inds[0].min(), inds[0].max()
+            xmin, xmax = inds[1].min(), inds[1].max()
+            anno["bbox"] = (xmin, ymin, xmax, ymax)
+            if xmax <= xmin or ymax <= ymin:
+                continue
+            anno["bbox_mode"] = BoxMode.XYXY_ABS
+            if to_polygons:
+                # This conversion comes from D4809743 and D5171122,
+                # when Mask-RCNN was first developed.
+                contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[
+                    -2
+                ]
+                polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3]
+                # opencv's can produce invalid polygons
+                if len(polygons) == 0:
+                    continue
+                anno["segmentation"] = polygons
+            else:
+                anno["segmentation"] = mask_util.encode(mask[:, :, None])[0]
+            annos.append(anno)
+    ret["annotations"] = annos
+    return ret
+
+
+if __name__ == "__main__":
+    """
+    Test the cityscapes dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.cityscapes \
+            cityscapes/leftImg8bit/train cityscapes/gtFine/train
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("image_dir")
+    parser.add_argument("gt_dir")
+    parser.add_argument("--type", choices=["instance", "semantic"], default="instance")
+    args = parser.parse_args()
+    from detectron2.data.catalog import Metadata
+    from detectron2.utils.visualizer import Visualizer
+    from cityscapesscripts.helpers.labels import labels
+
+    logger = setup_logger(name=__name__)
+
+    dirname = "cityscapes-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+
+    if args.type == "instance":
+        dicts = load_cityscapes_instances(
+            args.image_dir, args.gt_dir, from_json=True, to_polygons=True
+        )
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval]
+        meta = Metadata().set(thing_classes=thing_classes)
+
+    else:
+        dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir)
+        logger.info("Done loading {} samples.".format(len(dicts)))
+
+        stuff_names = [k.name for k in labels if k.trainId != 255]
+        stuff_colors = [k.color for k in labels if k.trainId != 255]
+        meta = Metadata().set(stuff_names=stuff_names, stuff_colors=stuff_colors)
+
+    for d in dicts:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        # Save each visualization into `dirname`, keeping the basename of the
+        # source image as the output file name.
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/detectron2/data/datasets/coco.py b/detectron2/data/datasets/coco.py
new file mode 100644
index 0000000..6d4a7cc
--- /dev/null
+++ b/detectron2/data/datasets/coco.py
@@ -0,0 +1,452 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import io
+import logging
+import contextlib
+import os
+import datetime
+import json
+import numpy as np
+import imagesize
+
+from PIL import Image
+
+from fvcore.common.timer import Timer
+from detectron2.structures import BoxMode, PolygonMasks, Boxes
+from fvcore.common.file_io import PathManager
+
+
+from .. import MetadataCatalog, DatasetCatalog
+
+"""
+This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format".
+"""
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_coco_json", "load_sem_seg"]
+
+
+def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None):
+    """
+    Load a json file with COCO's instances annotation format.
+    Currently supports instance detection, instance segmentation,
+    and person keypoints annotations.
+
+    Args:
+        json_file (str): full path to the json file in COCO instances annotation format.
+        image_root (str): the directory where the images in this json file exists.
+        dataset_name (str): the name of the dataset (e.g., coco_2017_train).
+            If provided, this function will also put "thing_classes" into
+            the metadata associated with this dataset.
+        extra_annotation_keys (list[str]): list of per-annotation keys that should also be
+            loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints",
+            "category_id", "segmentation"). The values for these keys will be returned as-is.
+            For example, the densepose annotations are loaded in this way.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from pycocotools.coco import COCO
+
+    timer = Timer()
+    json_file = PathManager.get_local_path(json_file)
+    with contextlib.redirect_stdout(io.StringIO()):
+        coco_api = COCO(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+    id_map = None
+    if dataset_name is not None:
+        meta = MetadataCatalog.get(dataset_name)
+        cat_ids = sorted(coco_api.getCatIds())
+        cats = coco_api.loadCats(cat_ids)
+        # The categories in a custom json file may not be sorted.
+        thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])]
+        meta.thing_classes = thing_classes
+
+        # In COCO, certain category ids are artificially removed,
+        # and by convention they are always ignored.
+        # We deal with COCO's id issue and translate
+        # the category ids to contiguous ids in [0, 80).
+
+        # It works by looking at the "categories" field in the json, therefore
+        # if users' own json also have incontiguous ids, we'll
+        # apply this mapping as well but print a warning.
+        if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)):
+            if "coco" not in dataset_name:
+                logger.warning(
+                    """
+Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
+"""
+                )
+        id_map = {v: i for i, v in enumerate(cat_ids)}
+        meta.thing_dataset_id_to_contiguous_id = id_map
+
+    # sort indices for reproducible results
+    img_ids = sorted(coco_api.imgs.keys())
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = coco_api.loadImgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'iscrowd': 0,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+
+    if "minival" not in json_file:
+        # The popular valminusminival & minival annotations for COCO2014 contain this bug.
+        # However the ratio of buggy annotations there is tiny and does not affect accuracy.
+        # Therefore we explicitly white-list them.
+        ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+        assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+            json_file
+        )
+
+    imgs_anns = list(zip(imgs, anns))
+
+    logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))
+
+    dataset_dicts = []
+
+    ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or [])
+
+    num_instances_without_valid_segmentation = 0
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+
+            # The original COCO valminusminival2014 & minival2014 annotation files
+            # actually contains bugs that, together with certain ways of using COCO API,
+            # can trigger this assertion.
+            assert anno["image_id"] == image_id
+
+            assert anno.get("ignore", 0) == 0
+
+            obj = {key: anno[key] for key in ann_keys if key in anno}
+
+            segm = anno.get("segmentation", None)
+            if segm:  # either list[list[float]] or dict(RLE)
+                if not isinstance(segm, dict):
+                    # filter out invalid polygons (< 3 points)
+                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+                    if len(segm) == 0:
+                        num_instances_without_valid_segmentation += 1
+                        continue  # ignore this instance
+                obj["segmentation"] = segm
+
+            keypts = anno.get("keypoints", None)
+            if keypts:  # list[int]
+                for idx, v in enumerate(keypts):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # Therefore we assume the coordinates are "pixel indices" and
+                        # add 0.5 to convert to floating point coordinates.
+                        keypts[idx] = v + 0.5
+                obj["keypoints"] = keypts
+
+            obj["bbox_mode"] = BoxMode.XYWH_ABS
+            if id_map:
+                obj["category_id"] = id_map[obj["category_id"]]
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    if num_instances_without_valid_segmentation > 0:
+        logger.warning(
+            "Filtered out {} instances without valid segmentation. "
+            "There might be issues in your dataset generation process.".format(
+                num_instances_without_valid_segmentation
+            )
+        )
+    return dataset_dicts
+
+
+def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"):
+    """
+    Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are
+    treated as ground truth annotations and all files under "image_root" with "image_ext" extension
+    as input images. Ground truth and input images are matched using file paths relative to
+    "gt_root" and "image_root" respectively without taking into account file extensions.
+    This works for COCO as well as some other datasets.
+
+    Args:
+        gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation
+            annotations are stored as images with integer values in pixels that represent
+            corresponding semantic labels.
+        image_root (str): the directory where the input images are.
+        gt_ext (str): file extension for ground truth annotations.
+        image_ext (str): file extension for input images.
+
+    Returns:
+        list[dict]:
+            a list of dicts in detectron2 standard format without instance-level
+            annotation.
+
+    Notes:
+        1. This function does not read the image and ground truth files.
+           The results do not have the "image" and "sem_seg" fields.
+    """
+
+    # We match input images with ground truth based on their relative filepaths (without file
+    # extensions) starting from 'image_root' and 'gt_root' respectively.
+    def file2id(folder_path, file_path):
+        # extract relative path starting from `folder_path`
+        image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path))
+        # remove file extension
+        image_id = os.path.splitext(image_id)[0]
+        return image_id
+
+    input_files = sorted(
+        (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)),
+        key=lambda file_path: file2id(image_root, file_path),
+    )
+    gt_files = sorted(
+        (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)),
+        key=lambda file_path: file2id(gt_root, file_path),
+    )
+
+    assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root)
+
+    # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images
+    if len(input_files) != len(gt_files):
+        logger.warning(
+            "Directory {} and {} has {} and {} files, respectively.".format(
+                image_root, gt_root, len(input_files), len(gt_files)
+            )
+        )
+        input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files]
+        gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files]
+        intersect = list(set(input_basenames) & set(gt_basenames))
+        # sort, otherwise each worker may obtain a list[dict] in different order
+        intersect = sorted(intersect)
+        logger.warning("Will use their intersection of {} files.".format(len(intersect)))
+        input_files = [os.path.join(image_root, f + image_ext) for f in intersect]
+        gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect]
+
+    logger.info(
+        "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root)
+    )
+
+    dataset_dicts = []
+    for (img_path, gt_path) in zip(input_files, gt_files):
+        local_path = PathManager.get_local_path(gt_path)
+        w, h = imagesize.get(local_path)
+        record = {}
+        record["file_name"] = img_path
+        record["sem_seg_file_name"] = gt_path
+        record["height"] = h
+        record["width"] = w
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def convert_to_coco_dict(dataset_name):
+    """
+    Convert a dataset in detectron2's standard format into COCO json format
+
+    Generic dataset description can be found here:
+    https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset
+
+    COCO data format description can be found here:
+    http://cocodataset.org/#format-data
+
+    Args:
+        dataset_name:
+            name of the source dataset
+            must be registered in DatasetCatalog and in detectron2's standard format
+    Returns:
+        coco_dict: serializable dict in COCO json format
+    """
+
+    dataset_dicts = DatasetCatalog.get(dataset_name)
+    categories = [
+        {"id": cat_id, "name": name}
+        for cat_id, name in enumerate(MetadataCatalog.get(dataset_name).thing_classes)
+    ]
+
+    logger.info("Converting dataset dicts into COCO format")
+    coco_images = []
+    coco_annotations = []
+
+    for image_id, image_dict in enumerate(dataset_dicts):
+        coco_image = {
+            "id": image_dict.get("image_id", image_id),
+            "width": image_dict["width"],
+            "height": image_dict["height"],
+            "file_name": image_dict["file_name"],
+        }
+        coco_images.append(coco_image)
+
+        anns_per_image = image_dict["annotations"]
+        for annotation in anns_per_image:
+            # create a new dict with only COCO fields
+            coco_annotation = {}
+
+            # COCO requirement: XYWH box format
+            bbox = annotation["bbox"]
+            bbox_mode = annotation["bbox_mode"]
+            bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS)
+
+            # COCO requirement: instance area
+            if "segmentation" in annotation:
+                # Computing areas for instances by counting the pixels
+                segmentation = annotation["segmentation"]
+                # TODO: check segmentation type: RLE, BinaryMask or Polygon
+                polygons = PolygonMasks([segmentation])
+                area = polygons.area()[0].item()
+            else:
+                # Computing areas using bounding boxes
+                bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+                area = Boxes([bbox_xy]).area()[0].item()
+
+            if "keypoints" in annotation:
+                keypoints = list(annotation["keypoints"])  # copy: never mutate the source dict
+                for idx, v in enumerate(keypoints):
+                    if idx % 3 != 2:
+                        # COCO's segmentation coordinates are floating points in [0, H or W],
+                        # but keypoint coordinates are integers in [0, H-1 or W-1]
+                        # For COCO format consistency we subtract 0.5
+                        # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163
+                        keypoints[idx] = v - 0.5
+                if "num_keypoints" in annotation:
+                    num_keypoints = annotation["num_keypoints"]
+                else:
+                    num_keypoints = sum(kp > 0 for kp in keypoints[2::3])
+
+            # COCO requirement:
+            #   linking annotations to images
+            #   "id" field must start with 1
+            coco_annotation["id"] = len(coco_annotations) + 1
+            coco_annotation["image_id"] = coco_image["id"]
+            coco_annotation["bbox"] = [round(float(x), 3) for x in bbox]
+            coco_annotation["area"] = area
+            coco_annotation["category_id"] = annotation["category_id"]
+            coco_annotation["iscrowd"] = annotation.get("iscrowd", 0)
+
+            # Add optional fields
+            if "keypoints" in annotation:
+                coco_annotation["keypoints"] = keypoints
+                coco_annotation["num_keypoints"] = num_keypoints
+
+            if "segmentation" in annotation:
+                coco_annotation["segmentation"] = annotation["segmentation"]
+
+            coco_annotations.append(coco_annotation)
+
+    logger.info(
+        "Conversion finished, "
+        f"num images: {len(coco_images)}, num annotations: {len(coco_annotations)}"
+    )
+
+    info = {
+        "date_created": str(datetime.datetime.now()),
+        "description": "Automatically generated COCO json file for Detectron2.",
+    }
+    coco_dict = {
+        "info": info,
+        "images": coco_images,
+        "annotations": coco_annotations,
+        "categories": categories,
+        "licenses": None,
+    }
+    return coco_dict
+
+
+def convert_to_coco_json(dataset_name, output_folder="", allow_cached=True):
+    """
+    Converts dataset into COCO format and saves it to a json file.
+    dataset_name must be registered in DatasetCatalog and in detectron2's standard format.
+
+    Args:
+        dataset_name:
+            reference from the config file to the catalogs
+            must be registered in DatasetCatalog and in detectron2's standard format
+        output_folder: where json file will be saved and loaded from
+        allow_cached: if json file is already present then skip conversion
+    Returns:
+        cache_path: path to the COCO-format json file
+    """
+
+    # TODO: The dataset or the conversion script *may* change,
+    # a checksum would be useful for validating the cached data
+    cache_path = os.path.join(output_folder, f"{dataset_name}_coco_format.json")
+    PathManager.mkdirs(output_folder or ".")  # "" (the default) means the current directory
+    if PathManager.exists(cache_path) and allow_cached:
+        logger.info(f"Reading cached annotations in COCO format from:{cache_path} ...")
+    else:
+        logger.info(f"Converting dataset annotations in '{dataset_name}' to COCO format ...")
+        coco_dict = convert_to_coco_dict(dataset_name)
+
+        with PathManager.open(cache_path, "w") as json_file:
+            logger.info(f"Caching annotations in COCO format: {cache_path}")
+            json.dump(coco_dict, json_file)
+
+    return cache_path
+
+
+if __name__ == "__main__":
+    """
+    Test the COCO json dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.coco \
+            path/to/json path/to/image_root dataset_name
+
+        "dataset_name" can be "coco_2014_minival_100", or other
+        pre-registered ones
+    """
+    from detectron2.utils.logger import setup_logger
+    from detectron2.utils.visualizer import Visualizer
+    import detectron2.data.datasets  # noqa # side-effect import: adds pre-defined metadata
+    import sys
+
+    logger = setup_logger(name=__name__)
+    assert sys.argv[3] in DatasetCatalog.list()
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "coco-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/detectron2/data/datasets/lvis.py b/detectron2/data/datasets/lvis.py
new file mode 100644
index 0000000..83d3d16
--- /dev/null
+++ b/detectron2/data/datasets/lvis.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import os
+
+from fvcore.common.timer import Timer
+from detectron2.structures import BoxMode
+from fvcore.common.file_io import PathManager
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from .lvis_v0_5_categories import LVIS_CATEGORIES
+
+"""
+This file contains functions to parse LVIS-format annotations into dicts in the
+"Detectron2 format".
+"""
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"]
+
+
+def register_lvis_instances(name, metadata, json_file, image_root):
+    """
+    Register a dataset in LVIS's json annotation format for instance detection and segmentation.
+
+    Args:
+        name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train".
+        metadata (dict): extra metadata set on this dataset's MetadataCatalog entry. May be empty.
+        json_file (str): path to the json instance annotation file.
+        image_root (str): directory which contains all the images.
+    """
+    DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name))
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata
+    )
+
+
+def load_lvis_json(json_file, image_root, dataset_name=None):
+    """
+    Load a json file in LVIS's annotation format.
+
+    Args:
+        json_file (str): full path to the LVIS json annotation file.
+        image_root (str): the directory where the images in this json file exists.
+        dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train").
+            If provided, this function will put "thing_classes" into the metadata
+            associated with this dataset.
+
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+
+    Notes:
+        1. This function does not read the image files.
+           The results do not have the "image" field.
+    """
+    from lvis import LVIS
+
+    json_file = PathManager.get_local_path(json_file)
+
+    timer = Timer()
+    lvis_api = LVIS(json_file)
+    if timer.seconds() > 1:
+        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+
+    if dataset_name is not None:
+        meta = get_lvis_instances_meta(dataset_name)
+        MetadataCatalog.get(dataset_name).set(**meta)
+
+    # sort indices for reproducible results
+    img_ids = sorted(list(lvis_api.imgs.keys()))
+    # imgs is a list of dicts, each looks something like:
+    # {'license': 4,
+    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+    #  'file_name': 'COCO_val2014_000000001268.jpg',
+    #  'height': 427,
+    #  'width': 640,
+    #  'date_captured': '2013-11-17 05:57:24',
+    #  'id': 1268}
+    imgs = lvis_api.load_imgs(img_ids)
+    # anns is a list[list[dict]], where each dict is an annotation
+    # record for an object. The inner list enumerates the objects in an image
+    # and the outer list enumerates over images. Example of anns[0]:
+    # [{'segmentation': [[192.81,
+    #     247.09,
+    #     ...
+    #     219.03,
+    #     249.06]],
+    #   'area': 1035.749,
+    #   'image_id': 1268,
+    #   'bbox': [192.81, 224.8, 74.73, 33.43],
+    #   'category_id': 16,
+    #   'id': 42986},
+    #  ...]
+    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+    # Sanity check that each annotation has a unique id
+    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format(
+        json_file
+    )
+
+    imgs_anns = list(zip(imgs, anns))
+
+    logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file))
+
+    dataset_dicts = []
+
+    for (img_dict, anno_dict_list) in imgs_anns:
+        record = {}
+        file_name = img_dict["file_name"]
+        if img_dict["file_name"].startswith("COCO"):
+            # Convert from the COCO 2014 file naming convention of
+            # COCO_[train/val/test]2014_000000000000.jpg to the 2017 naming convention of
+            # 000000000000.jpg (LVIS v1 will fix this naming issue)
+            file_name = file_name[-16:]
+        record["file_name"] = os.path.join(image_root, file_name)
+        record["height"] = img_dict["height"]
+        record["width"] = img_dict["width"]
+        record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
+        record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+        image_id = record["image_id"] = img_dict["id"]
+
+        objs = []
+        for anno in anno_dict_list:
+            # Check that the image_id in this annotation is the same as
+            # the image_id we're looking at.
+            # This fails only when the data parsing logic or the annotation file is buggy.
+            assert anno["image_id"] == image_id
+            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
+            obj["category_id"] = anno["category_id"] - 1  # Convert 1-indexed to 0-indexed
+            segm = anno["segmentation"]  # list[list[float]]
+            # a valid polygon has an even number of coordinates and >= 3 points; require all valid
+            valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+            assert len(segm) == len(
+                valid_segm
+            ), "Annotation contains an invalid polygon with < 3 points"
+            assert len(segm) > 0
+            obj["segmentation"] = segm
+            objs.append(obj)
+        record["annotations"] = objs
+        dataset_dicts.append(record)
+
+    return dataset_dicts
+
+
+def get_lvis_instances_meta(dataset_name):
+    """
+    Load LVIS metadata.
+
+    Args:
+        dataset_name (str): LVIS dataset name; must contain "v0.5" (e.g., "lvis_v0.5_train").
+
+    Returns:
+        dict: LVIS metadata with keys: thing_classes
+    """
+    if "v0.5" in dataset_name:
+        return _get_lvis_instances_meta_v0_5()
+    # There will be a v1 in the future
+    # elif dataset_name == "lvis_v1":
+    #   return get_lvis_instances_meta_v1()
+    raise ValueError("No built-in metadata for dataset {}".format(dataset_name))
+
+
+def _get_lvis_instances_meta_v0_5():
+    """Build the LVIS v0.5 metadata dict: ``thing_classes`` ordered by category id."""
+    num_categories = len(LVIS_CATEGORIES)
+    assert num_categories == 1230
+    ids = [cat["id"] for cat in LVIS_CATEGORIES]
+    assert (
+        min(ids) == 1 and max(ids) == len(ids)
+    ), "Category ids are not in [1, #categories], as expected"
+    # Order the categories by id; the first synonym of each one is its class name
+    ordered = sorted(LVIS_CATEGORIES, key=lambda cat: cat["id"])
+    return {"thing_classes": [cat["synonyms"][0] for cat in ordered]}
+
+
+if __name__ == "__main__":
+    """
+    Test the LVIS json dataset loader.
+
+    Usage:
+        python -m detectron2.data.datasets.lvis \
+            path/to/json path/to/image_root dataset_name vis_limit
+    """
+    import sys
+    import numpy as np
+    from detectron2.utils.logger import setup_logger
+    from PIL import Image
+    import detectron2.data.datasets  # noqa # side-effect import: adds pre-defined metadata
+    from detectron2.utils.visualizer import Visualizer
+
+    logger = setup_logger(name=__name__)
+    meta = MetadataCatalog.get(sys.argv[3])
+
+    dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3])
+    logger.info("Done loading {} samples.".format(len(dicts)))
+
+    dirname = "lvis-data-vis"
+    os.makedirs(dirname, exist_ok=True)
+    for d in dicts[: int(sys.argv[4])]:
+        img = np.array(Image.open(d["file_name"]))
+        visualizer = Visualizer(img, metadata=meta)
+        vis = visualizer.draw_dataset_dict(d)
+        fpath = os.path.join(dirname, os.path.basename(d["file_name"]))
+        vis.save(fpath)
diff --git a/detectron2/data/datasets/lvis_v0_5_categories.py b/detectron2/data/datasets/lvis_v0_5_categories.py
new file mode 100644
index 0000000..8205e60
--- /dev/null
+++ b/detectron2/data/datasets/lvis_v0_5_categories.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Auto-generated by running the following snippet:
+# with open("lvis_v0.5_val.json", "r") as f:
+#     a = json.load(f)
+# c = a["categories"]
+# for x in c:
+#     del x["image_count"]
+#     del x["instance_count"]
+# LVIS_CATEGORIES = repr(c) + "  # noqa"
+
+# fmt: off
+LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 
'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 
'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': 
['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet 
flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 
'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 
'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that 
provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, 
{'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 
'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 
'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 
'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, {'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 
'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a 
container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, {'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the 
top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by 
rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually 
the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 
'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with 
yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. 
greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 
'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': 
['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 
'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 
'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or 
hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, {'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an 
outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped strainer; used to wash or drain 
foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be 
converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': 
['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': 
['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 
'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 
'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': 
['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is 
propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 
'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 
'dove.n.01', 'synonyms': ['dove'], 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 
'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and 
strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a 
chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 
'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, 
{'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 
'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 
'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, 
{'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': ['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 
'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 
'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 
'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 
'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the 
thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for 
wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to 
the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': ['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint 
that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable 
electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade 
fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 
'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, {'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access 
typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in 
pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, 
{'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic 
flesh', 'name': 'lemon'}, {'frequency': 'r', 'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 
660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 
'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 
'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 
'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers 
together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': 
['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, {'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 
'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 
'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, {'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. 
ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 
757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 
'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 
'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': 
['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, 
{'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 
'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy 
consisting of vanes of colored paper or plastic that is pinned to a stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. 
tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 
'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public 
place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 
'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened 
custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, {'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a 
cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified 
acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 
'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of 
bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 
'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, {'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': 
['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 
'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an 
oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass 
adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long 
pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': 
['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 
'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 
'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': 
['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) 
that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 
'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': 
['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 
'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 
'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 
'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 
'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator 
for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 
'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, {'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 
'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, 
{'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 
'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, 
{'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, 
{'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, 
{'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': 
['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 
'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can 
cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild 
carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}]  # noqa
+# fmt: on
diff --git a/detectron2/data/datasets/pascal_voc.py b/detectron2/data/datasets/pascal_voc.py
new file mode 100644
index 0000000..8053af6
--- /dev/null
+++ b/detectron2/data/datasets/pascal_voc.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from fvcore.common.file_io import PathManager
+import os
+import numpy as np
+import xml.etree.ElementTree as ET
+
+from detectron2.structures import BoxMode
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+
+__all__ = ["register_pascal_voc"]
+
+
+# fmt: off
+CLASS_NAMES = [  # 20 Pascal VOC class names; the list index is used as "category_id"
+    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
+    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
+    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
+]
+# fmt: on
+
+
+def load_voc_instances(dirname: str, split: str):
+    """
+    Load Pascal VOC detection annotations to Detectron2 format.
+
+    Args:
+        dirname (str): directory containing "Annotations", "ImageSets", "JPEGImages"
+        split (str): one of "train", "test", "val", "trainval"
+
+    Returns:
+        list[dict]: per-image dicts ("file_name", "image_id", "height", "width", "annotations")
+    """
+    with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
+        # ndmin=1 keeps a split that lists a single image iterable (not a 0-d array)
+        fileids = np.loadtxt(f, dtype=str, ndmin=1)
+
+    dicts = []
+    for fileid in fileids:
+        anno_file = os.path.join(dirname, "Annotations", fileid + ".xml")
+        jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
+
+        with PathManager.open(anno_file) as f:
+            tree = ET.parse(f)
+
+        r = {
+            "file_name": jpeg_file,
+            "image_id": fileid,
+            "height": int(tree.findall("./size/height")[0].text),
+            "width": int(tree.findall("./size/width")[0].text),
+        }
+        instances = []
+        for obj in tree.findall("object"):
+            cls = obj.find("name").text
+            # "difficult" objects are kept: in limited experiments they don't hurt accuracy.
+            bbox = obj.find("bndbox")
+            bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
+            # Original annotations are integers in the range [1, W or H]
+            # Assuming they mean 1-based pixel indices (inclusive),
+            # a box with annotation (xmin=1, xmax=W) covers the whole image.
+            # In coordinate space this is represented by (xmin=0, xmax=W)
+            bbox[0] -= 1.0
+            bbox[1] -= 1.0
+            instances.append(
+                {"category_id": CLASS_NAMES.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
+            )
+        r["annotations"] = instances
+        dicts.append(r)
+    return dicts
+
+
+def register_pascal_voc(name, dirname, split, year):
+    """Register the Pascal VOC split `split` under `name` and record its metadata."""
+    DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split))
+    voc_metadata = dict(thing_classes=CLASS_NAMES, dirname=dirname, year=year, split=split)
+    MetadataCatalog.get(name).set(**voc_metadata)
diff --git a/detectron2/data/datasets/register_coco.py b/detectron2/data/datasets/register_coco.py
new file mode 100644
index 0000000..2aa19e1
--- /dev/null
+++ b/detectron2/data/datasets/register_coco.py
@@ -0,0 +1,124 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from .coco import load_coco_json, load_sem_seg
+
+"""
+This file contains functions to register a COCO-format dataset to the DatasetCatalog.
+"""
+
+__all__ = ["register_coco_instances", "register_coco_panoptic_separated"]
+
+
+def register_coco_instances(name, metadata, json_file, image_root):
+    """
+    Register a COCO-json dataset for instance detection, instance
+    segmentation or keypoint detection, i.e. the `instances*.json` and
+    `person_keypoints*.json` files (Type 1 and 2 in http://cocodataset.org/#format-data).
+
+    This doubles as an example: write a similar function to register a new dataset.
+
+    Args:
+        name (str): the name that identifies a dataset, e.g. "coco_2014_train".
+        metadata (dict): extra metadata associated with this dataset; may be an empty dict.
+        json_file (str): path to the json instance annotation file.
+        image_root (str): directory which contains all the images.
+    """
+
+    def _load_dicts():
+        # Deferred: runs only when the catalog invokes the registered function.
+        return load_coco_json(json_file, image_root, name)
+
+    # 1. register the function which returns dicts
+    DatasetCatalog.register(name, _load_dicts)
+    # 2. attach metadata, which might be useful in evaluation, visualization or logging
+    MetadataCatalog.get(name).set(
+        json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata
+    )
+
+
+def register_coco_panoptic_separated(
+    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json
+):
+    """
+    Register a COCO panoptic segmentation dataset as ``name + '_separated'``.
+    Its dicts carry both instance annotations and semantic annotations, each
+    with its own contiguous ids, hence the name "separated".
+
+    This follows the setting used by the PanopticFPN paper:
+
+    1. The instance annotations come directly from polygons in the COCO
+       instances annotation task, not from the masks in the COCO panoptic
+       annotations.
+
+       The two formats differ slightly: polygons in the instance annotations
+       may overlap, while the mask annotations are produced by labeling the
+       overlapped polygons with depth ordering.
+
+    2. The semantic annotations are converted from panoptic annotations, where
+       all "things" are assigned a semantic id of 0. All semantic categories
+       therefore have ids in the contiguous range [1, #stuff_categories].
+
+    A pure semantic segmentation dataset named ``name + '_stuffonly'`` is
+    registered as well.
+
+    Args:
+        name (str): base name of the datasets, e.g. "coco_2017_train_panoptic"
+        metadata (dict): extra metadata, attached to both registered datasets.
+        image_root (str): directory which contains all the images
+        panoptic_root (str): directory which contains panoptic annotation images
+        panoptic_json (str): path to the json panoptic annotation file
+        sem_seg_root (str): directory which contains all the ground truth segmentation annotations.
+        instances_json (str): path to the json instance annotation file
+    """
+    panoptic_name = name + "_separated"
+    semantic_name = name + "_stuffonly"
+
+    def _load_sem_seg_dicts():
+        return load_sem_seg(sem_seg_root, image_root)
+
+    def _load_separated_dicts():
+        detection_dicts = load_coco_json(instances_json, image_root, panoptic_name)
+        return merge_to_panoptic(detection_dicts, _load_sem_seg_dicts())
+
+    DatasetCatalog.register(panoptic_name, _load_separated_dicts)
+    MetadataCatalog.get(panoptic_name).set(
+        panoptic_root=panoptic_root,
+        image_root=image_root,
+        panoptic_json=panoptic_json,
+        sem_seg_root=sem_seg_root,
+        json_file=instances_json,  # TODO rename
+        evaluator_type="coco_panoptic_seg",
+        **metadata
+    )
+
+    DatasetCatalog.register(semantic_name, _load_sem_seg_dicts)
+    MetadataCatalog.get(semantic_name).set(
+        sem_seg_root=sem_seg_root, image_root=image_root, evaluator_type="sem_seg", **metadata
+    )
+
+
+def merge_to_panoptic(detection_dicts, sem_seg_dicts):
+    """
+    Build panoptic segmentation dataset dicts by pairing every detection dict
+    with the semantic segmentation dict that has the same "file_name".
+
+    Args:
+        detection_dicts (list[dict]): dicts for object detection or instance segmentation.
+        sem_seg_dicts (list[dict]): dicts for semantic segmentation; must not be empty.
+
+    Returns:
+        list[dict]: one dict per entry of detection_dicts, holding all (key, value) pairs of
+            that entry and of its matching sem_seg entry. On a shared key the sem_seg
+            value is kept, so the same key is assumed to have the same value in both.
+    """
+    sem_seg_by_file = {entry["file_name"]: entry for entry in sem_seg_dicts}
+    assert len(sem_seg_by_file) > 0
+
+    def _merge_one(det_dict):
+        merged = copy.copy(det_dict)
+        merged.update(sem_seg_by_file[merged["file_name"]])
+        return merged
+
+    return [_merge_one(det_dict) for det_dict in detection_dicts]
diff --git a/detectron2/data/detection_utils.py b/detectron2/data/detection_utils.py
new file mode 100644
index 0000000..51edadc
--- /dev/null
+++ b/detectron2/data/detection_utils.py
@@ -0,0 +1,415 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+Common data processing utilities that are used in a
+typical object detection data pipeline.
+"""
+import logging
+import numpy as np
+import torch
+from fvcore.common.file_io import PathManager
+from PIL import Image, ImageOps
+
+from detectron2.structures import (
+    BitMasks,
+    Boxes,
+    BoxMode,
+    Instances,
+    Keypoints,
+    PolygonMasks,
+    RotatedBoxes,
+)
+
+from . import transforms as T
+from .catalog import MetadataCatalog
+
+
+class SizeMismatchError(ValueError):
+    """
+    Raised when a loaded image has a different width/height than its annotation specifies.
+    """
+
+
+def read_image(file_name, format=None):
+    """
+    Load an image file and return it as an array in the requested format.
+    Rotation and flipping recorded in the image's exif data are applied.
+
+    Args:
+        file_name (str): image file path
+        format (str): one of the supported image modes in PIL, or "BGR".
+            If None, the image is not converted.
+
+    Returns:
+        image (np.ndarray): an HWC image
+    """
+    want_bgr = format == "BGR"
+    with PathManager.open(file_name, "rb") as stream:
+        pil_img = Image.open(stream)
+
+        # Best effort only: exif_transpose can fail because of
+        # https://github.com/python-pillow/Pillow/issues/3973, in which case
+        # the image is used as stored.
+        try:
+            pil_img = ImageOps.exif_transpose(pil_img)
+        except Exception:
+            pass
+
+        if format is not None:
+            # PIL has no "BGR" mode: convert to RGB here and reverse the channels below.
+            pil_img = pil_img.convert("RGB" if want_bgr else format)
+        pixels = np.asarray(pil_img)
+        if want_bgr:
+            # RGB -> BGR
+            pixels = pixels[:, :, ::-1]
+        elif format == "L":
+            # PIL squeezes out the channel dimension for "L", so make it HWC
+            pixels = np.expand_dims(pixels, -1)
+        return pixels
+
+
+def check_image_size(dataset_dict, image):
+    """
+    Raise an error if the image does not match the size specified in the dict.
+
+    Only the dimensions actually present in `dataset_dict` are validated, so a dict that
+    records just "width" or just "height" is checked against that single value instead of
+    failing with a KeyError. Dimensions missing from the dict are then filled in from
+    the image.
+
+    Args:
+        dataset_dict (dict): may contain "width", "height" and "file_name".
+            Modified in-place: "width" and "height" are added when absent.
+        image (np.ndarray): image whose first two axes are (height, width).
+
+    Raises:
+        SizeMismatchError: if a recorded width/height differs from the image's.
+    """
+    image_wh = (image.shape[1], image.shape[0])
+    # A missing dimension falls back to the image's own value, so it can never mismatch.
+    expected_wh = (
+        dataset_dict.get("width", image_wh[0]),
+        dataset_dict.get("height", image_wh[1]),
+    )
+    if image_wh != expected_wh:
+        raise SizeMismatchError(
+            "Mismatched (W,H){}, got {}, expect {}".format(
+                " for image " + dataset_dict["file_name"]
+                if "file_name" in dataset_dict
+                else "",
+                image_wh,
+                expected_wh,
+            )
+        )
+
+    # To ensure bbox always remap to original image size
+    dataset_dict.setdefault("width", image_wh[0])
+    dataset_dict.setdefault("height", image_wh[1])
+
+
+def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk):
+    """
+    Transform the precomputed proposals stored in dataset_dict, if there are any.
+
+    Args:
+        dataset_dict (dict): a dict read from the dataset, possibly
+            contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode"
+        image_shape (tuple): height, width
+        transforms (TransformList):
+        min_box_side_len (int): keep proposals with at least this size
+        proposal_topk (int): only the first `proposal_topk` remaining proposals are kept
+
+    The input dict is modified in-place, with abovementioned keys removed. A new
+    key "proposals" will be added. Its value is an `Instances`
+    object which contains the transformed proposals in its field
+    "proposal_boxes" and "objectness_logits".
+    If "proposal_boxes" is absent, the dict is left untouched.
+    """
+    if "proposal_boxes" not in dataset_dict:
+        return
+
+    # Bring the boxes to XYXY_ABS, then run them through the transforms.
+    raw_boxes = dataset_dict.pop("proposal_boxes")
+    raw_mode = dataset_dict.pop("proposal_bbox_mode")
+    xyxy_boxes = BoxMode.convert(raw_boxes, raw_mode, BoxMode.XYXY_ABS)
+    boxes = Boxes(transforms.apply_box(xyxy_boxes))
+
+    raw_logits = dataset_dict.pop("proposal_objectness_logits")
+    objectness_logits = torch.as_tensor(raw_logits.astype("float32"))
+
+    # Clip to the image and drop boxes that became too small.
+    boxes.clip(image_shape)
+    keep = boxes.nonempty(threshold=min_box_side_len)
+    kept_boxes = boxes[keep]
+    kept_logits = objectness_logits[keep]
+
+    proposals = Instances(image_shape)
+    proposals.proposal_boxes = kept_boxes[:proposal_topk]
+    proposals.objectness_logits = kept_logits[:proposal_topk]
+    dataset_dict["proposals"] = proposals
+
+
+def transform_instance_annotations(
+    annotation, transforms, image_size, *, keypoint_hflip_indices=None
+):
+    """
+    Transform the box, segmentation and keypoints of one instance's annotations.
+
+    The box goes through `transforms.apply_box`, segmentation polygons through
+    `transforms.apply_polygons`, and keypoints through
+    :func:`transform_keypoint_annotations`.
+    If you need anything more specially designed for each data structure,
+    you'll need to implement your own version of this function or the transforms.
+
+    Args:
+        annotation (dict): dict of instance annotations for a single instance.
+        transforms (TransformList):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+
+    Returns:
+        dict:
+            the same input dict (modified in-place) with fields "bbox", "segmentation",
+            "keypoints" transformed according to `transforms`.
+            The "bbox_mode" field will be set to XYXY_ABS.
+    """
+    xyxy = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+    # apply_box works on a batch of boxes; wrap the single box and unwrap the result.
+    transformed_box = transforms.apply_box([xyxy])[0]
+    annotation["bbox"] = transformed_box
+    annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+    if "segmentation" in annotation:
+        # An instance is described by one or more polygons, each stored as a flat list.
+        point_arrays = []
+        for flat_polygon in annotation["segmentation"]:
+            point_arrays.append(np.asarray(flat_polygon).reshape(-1, 2))
+        flattened = []
+        for polygon in transforms.apply_polygons(point_arrays):
+            flattened.append(polygon.reshape(-1))
+        annotation["segmentation"] = flattened
+
+    if "keypoints" in annotation:
+        annotation["keypoints"] = transform_keypoint_annotations(
+            annotation["keypoints"], transforms, image_size, keypoint_hflip_indices
+        )
+
+    return annotation
+
+
+def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None):
+    """
+    Apply transforms to the keypoint annotations of one instance.
+
+    Args:
+        keypoints (list[float]): Nx3 float in Detectron2 Dataset format.
+        transforms (TransformList):
+        image_size (tuple): the height, width of the transformed image
+        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
+            Required when the transforms contain an odd number of horizontal flips.
+
+    Returns:
+        ndarray: (N, 3) float64 array of transformed (x, y, visibility) keypoints.
+    """
+    # (N*3,) -> (N, 3)
+    kpts = np.asarray(keypoints, dtype="float64").reshape(-1, 3)
+    kpts[:, :2] = transforms.apply_coords(kpts[:, :2])
+
+    # This assumes that HFlipTransform is the only transform that flips horizontally.
+    # (An alternative would be to transform two probe points and check whether their
+    # horizontal order got reversed.)
+    num_hflips = 0
+    for tfm in transforms.transforms:
+        if isinstance(tfm, T.HFlipTransform):
+            num_hflips += 1
+
+    # An odd number of flips mirrors the image: swap each keypoint with its
+    # opposite-handed equivalent.
+    if num_hflips % 2 == 1:
+        assert keypoint_hflip_indices is not None
+        kpts = kpts[keypoint_hflip_indices, :]
+
+    # Maintain COCO convention that if visibility == 0, then x, y = 0
+    # TODO may need to reset visibility for cropped keypoints,
+    # but it does not matter for our existing algorithms
+    invisible = kpts[:, 2] == 0
+    kpts[invisible] = 0
+    return kpts
+
+
+def annotations_to_instances(annos, image_size, mask_format="polygon"):
+    """
+    Convert the instance annotations of one image into the :class:`Instances`
+    object used by the models.
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+        mask_format (str): "polygon" to store masks as :class:`PolygonMasks`,
+            or "bitmask" to store them as :class:`BitMasks`.
+
+    Returns:
+        Instances:
+            It will contain fields "gt_boxes", "gt_classes",
+            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    target = Instances(image_size)
+
+    xyxy_boxes = []
+    for obj in annos:
+        xyxy_boxes.append(BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS))
+    gt_boxes = Boxes(xyxy_boxes)
+    target.gt_boxes = gt_boxes
+    gt_boxes.clip(image_size)
+
+    category_ids = [obj["category_id"] for obj in annos]
+    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)
+
+    # Whether masks/keypoints exist is decided by looking at the first annotation only.
+    has_annos = len(annos) > 0
+
+    if has_annos and "segmentation" in annos[0]:
+        polygons = [obj["segmentation"] for obj in annos]
+        if mask_format == "polygon":
+            target.gt_masks = PolygonMasks(polygons)
+        else:
+            assert mask_format == "bitmask", mask_format
+            target.gt_masks = BitMasks.from_polygon_masks(polygons, *image_size)
+
+    if has_annos and "keypoints" in annos[0]:
+        per_instance_kpts = [obj.get("keypoints", []) for obj in annos]
+        target.gt_keypoints = Keypoints(per_instance_kpts)
+
+    return target
+
+
+def annotations_to_instances_rotated(annos, image_size):
+    """
+    Convert the instance annotations of one image into the :class:`Instances`
+    object used by the models.
+    Compared to `annotations_to_instances`, this function is for rotated boxes only:
+    the "bbox" values are passed to :class:`RotatedBoxes` as they are.
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance.
+        image_size (tuple): height, width
+
+    Returns:
+        Instances:
+            Containing fields "gt_boxes", "gt_classes",
+            if they can be obtained from `annos`.
+            This is the format that builtin models expect.
+    """
+    target = Instances(image_size)
+
+    gt_boxes = RotatedBoxes([obj["bbox"] for obj in annos])
+    target.gt_boxes = gt_boxes
+    gt_boxes.clip(image_size)
+
+    category_ids = [obj["category_id"] for obj in annos]
+    target.gt_classes = torch.tensor(category_ids, dtype=torch.int64)
+
+    return target
+
+
+def filter_empty_instances(instances, by_box=True, by_mask=True):
+    """
+    Drop the empty instances from an `Instances` object.
+
+    Args:
+        instances (Instances):
+        by_box (bool): whether to filter out instances with empty boxes
+        by_mask (bool): whether to filter out instances with empty masks
+            (only has an effect if `instances` has the field "gt_masks")
+
+    Returns:
+        Instances: the filtered instances. The input is returned as-is when
+            no criterion applies.
+    """
+    assert by_box or by_mask
+
+    # `keep` accumulates the logical AND of all enabled criteria.
+    keep = None
+    if by_box:
+        keep = instances.gt_boxes.nonempty()
+    if instances.has("gt_masks") and by_mask:
+        masks_nonempty = instances.gt_masks.nonempty()
+        keep = masks_nonempty if keep is None else keep & masks_nonempty
+
+    # TODO: can also filter visible keypoints
+
+    if keep is None:
+        return instances
+    return instances[keep]
+
+
+def create_keypoint_hflip_indices(dataset_names):
+    """
+    Compute, for each keypoint, the index of the keypoint it turns into under a
+    horizontal flip, based on the "keypoint_names" and "keypoint_flip_map" metadata.
+
+    Args:
+        dataset_names (list[str]): list of dataset names
+    Returns:
+        ndarray[int]: a vector of size=#keypoints, storing the
+        horizontally-flipped keypoint indices.
+    """
+
+    # All datasets must agree on the keypoint metadata; the first one is then used.
+    for metadata_key in ("keypoint_names", "keypoint_flip_map"):
+        check_metadata_consistency(metadata_key, dataset_names)
+
+    meta = MetadataCatalog.get(dataset_names[0])
+    names = meta.keypoint_names
+    # TODO flip -> hflip
+    partner_of = dict(meta.keypoint_flip_map)
+    # The flip map lists each pair once; make the lookup symmetric.
+    reversed_pairs = {right: left for left, right in partner_of.items()}
+    partner_of.update(reversed_pairs)
+    # Keypoints without a partner map to themselves.
+    flip_indices = [names.index(partner_of.get(name, name)) for name in names]
+    return np.asarray(flip_indices)
+
+
+def gen_crop_transform_with_instance(crop_size, image_size, instance):
+    """
+    Randomly place a crop window of the given size such that it contains
+    the center of the given instance, and return it as a CropTransform.
+
+    Args:
+        crop_size (tuple): h, w in pixels
+        image_size (tuple): h, w
+        instance (dict): an annotation dict of one instance, in Detectron2's
+            dataset format.
+    """
+    crop_hw = np.asarray(crop_size, dtype=np.int32)
+    bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS)
+    center_y = (bbox[1] + bbox[3]) * 0.5
+    center_x = (bbox[0] + bbox[2]) * 0.5
+    center_yx = (center_y, center_x)
+
+    # Smallest top-left corner for which the window still reaches the center.
+    lowest_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_hw, 0)
+    # Largest top-left corner: bounded by the image extent and by the center itself.
+    image_limit_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_hw, 0)
+    highest_yx = np.minimum(image_limit_yx, np.ceil(center_yx).astype(np.int32))
+
+    # randint's upper bound is exclusive, hence the +1.
+    y0 = np.random.randint(lowest_yx[0], highest_yx[0] + 1)
+    x0 = np.random.randint(lowest_yx[1], highest_yx[1] + 1)
+    return T.CropTransform(x0, y0, crop_hw[1], crop_hw[0])
+
+
+def check_metadata_consistency(key, dataset_names):
+    """
+    Verify that all given datasets carry the same value for one metadata key.
+    Does nothing when `dataset_names` is empty.
+
+    Args:
+        key (str): a metadata key
+        dataset_names (list[str]): a list of dataset names
+
+    Raises:
+        AttributeError: if the key does not exist in the metadata
+        ValueError: if the given datasets do not have the same metadata values defined by key
+    """
+    if len(dataset_names) == 0:
+        return
+    logger = logging.getLogger(__name__)
+    values = [getattr(MetadataCatalog.get(name), key) for name in dataset_names]
+    reference = values[0]
+    report = "Metadata '{}' for dataset '{}' is '{}'"
+    for name, value in zip(dataset_names, values):
+        if value != reference:
+            # Log both the offending value and the first dataset's value, then fail.
+            logger.error(report.format(key, name, str(value)))
+            logger.error(report.format(key, dataset_names[0], str(reference)))
+            raise ValueError("Datasets have different metadata '{}'!".format(key))
+
+
+def build_transform_gen(cfg, is_train):
+    """
+    Build the list of :class:`TransformGen` described by the config.
+    Now it includes resizing, plus random flipping during training.
+
+    Args:
+        cfg: config whose `INPUT` section provides the min/max sizes.
+        is_train (bool): whether to use the training settings (and add flipping)
+            or the test settings.
+
+    Returns:
+        list[TransformGen]
+    """
+    if is_train:
+        min_size, max_size, sample_style = (
+            cfg.INPUT.MIN_SIZE_TRAIN,
+            cfg.INPUT.MAX_SIZE_TRAIN,
+            cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
+        )
+    else:
+        min_size, max_size, sample_style = (
+            cfg.INPUT.MIN_SIZE_TEST,
+            cfg.INPUT.MAX_SIZE_TEST,
+            "choice",
+        )
+    if sample_style == "range":
+        # A range needs exactly two values.
+        assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(
+            len(min_size)
+        )
+
+    logger = logging.getLogger(__name__)
+    tfm_gens = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
+    if is_train:
+        tfm_gens.append(T.RandomFlip())
+        logger.info("TransformGens used in training: " + str(tfm_gens))
+    return tfm_gens
diff --git a/detectron2/data/samplers/__init__.py b/detectron2/data/samplers/__init__.py
new file mode 100644
index 0000000..9cfa8a6
--- /dev/null
+++ b/detectron2/data/samplers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .distributed_sampler import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
+from .grouped_batch_sampler import GroupedBatchSampler
+
+# Public names of the samplers package, re-exported from the submodules imported above.
+__all__ = [
+    "GroupedBatchSampler",
+    "TrainingSampler",
+    "InferenceSampler",
+    "RepeatFactorTrainingSampler",
+]
diff --git a/detectron2/data/samplers/distributed_sampler.py b/detectron2/data/samplers/distributed_sampler.py
new file mode 100644
index 0000000..05c1622
--- /dev/null
+++ b/detectron2/data/samplers/distributed_sampler.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import math
+from collections import defaultdict
+from typing import Optional
+import torch
+from torch.utils.data.sampler import Sampler
+
+from detectron2.utils import comm
+
+
+class TrainingSampler(Sampler):
+    """
+    Sampler for training, where only an "infinite stream" of training data matters.
+    It produces an endless stream of indices, and all workers cooperate so that the
+    stream is shuffled consistently and every worker samples different indices.
+
+    The samplers in each worker effectively produces `indices[worker_id::num_workers]`
+    where `indices` is an infinite stream of indices consisting of
+    `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
+    or `range(size) + range(size) + ...` (if shuffle is False)
+    """
+
+    def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        self._size = size
+        assert size > 0
+        self._shuffle = shuffle
+        self._seed = int(comm.shared_random_seed() if seed is None else seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+    def __iter__(self):
+        # Every worker walks the same global stream but takes every
+        # `world_size`-th element, starting at its own rank.
+        global_stream = self._infinite_indices()
+        yield from itertools.islice(global_stream, self._rank, None, self._world_size)
+
+    def _infinite_indices(self):
+        # Seeding a private generator makes the stream identical on all workers.
+        rng = torch.Generator()
+        rng.manual_seed(self._seed)
+        while True:
+            if self._shuffle:
+                epoch_indices = torch.randperm(self._size, generator=rng)
+            else:
+                epoch_indices = torch.arange(self._size)
+            yield from epoch_indices
+
+
+class RepeatFactorTrainingSampler(Sampler):
+    """
+    Similar to TrainingSampler, but suitable for training on class imbalanced datasets
+    like LVIS. In each epoch, an image may appear multiple times based on its "repeat
+    factor". The repeat factor for an image is a function of the frequency the rarest
+    category labeled in that image. The "frequency of category c" in [0, 1] is defined
+    as the fraction of images in the training set (without repeats) in which category c
+    appears.
+
+    See https://arxiv.org/abs/1908.03195 (>= v2) Appendix B.2.
+    """
+
+    def __init__(self, dataset_dicts, repeat_thresh, shuffle=True, seed=None):
+        """
+        Args:
+            dataset_dicts (list[dict]): annotations in Detectron2 dataset format.
+            repeat_thresh (float): frequency threshold below which data is repeated.
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. If None, will use a random seed shared
+                among workers (require synchronization among all workers).
+        """
+        self._shuffle = shuffle
+        if seed is None:
+            seed = comm.shared_random_seed()
+        self._seed = int(seed)
+
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+        # Get fractional repeat factors and split into whole number (_int_part)
+        # and fractional (_frac_part) parts.
+        rep_factors = self._get_repeat_factors(dataset_dicts, repeat_thresh)
+        self._int_part = torch.trunc(rep_factors)
+        self._frac_part = rep_factors - self._int_part
+
+    def _get_repeat_factors(self, dataset_dicts, repeat_thresh):
+        """
+        Compute (fractional) per-image repeat factors.
+
+        Args:
+            See __init__.
+
+        Returns:
+            torch.Tensor: the i-th element is the repeat factor for the dataset image
+                at index i. Images without any annotation get a repeat factor of 1.
+        """
+        # 1. For each category c, compute the fraction of images that contain it: f(c)
+        category_count = defaultdict(int)
+        for dataset_dict in dataset_dicts:  # For each image (without repeats)
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            for cat_id in cat_ids:
+                category_count[cat_id] += 1
+        num_images = len(dataset_dicts)
+        category_freq = {cat_id: count / num_images for cat_id, count in category_count.items()}
+
+        # 2. For each category c, compute the category-level repeat factor:
+        #    r(c) = max(1, sqrt(t / f(c)))
+        category_rep = {
+            cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq))
+            for cat_id, cat_freq in category_freq.items()
+        }
+
+        # 3. For each image I, compute the image-level repeat factor:
+        #    r(I) = max_{c in I} r(c)
+        rep_factors = []
+        for dataset_dict in dataset_dicts:
+            cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]}
+            # `default` covers images without annotations: max() of an empty
+            # collection would raise ValueError, so such images are sampled once.
+            rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}, default=1.0)
+            rep_factors.append(rep_factor)
+
+        return torch.tensor(rep_factors, dtype=torch.float32)
+
+    def _get_epoch_indices(self, generator):
+        """
+        Create a list of dataset indices (with repeats) to use for one epoch.
+
+        Args:
+            generator (torch.Generator): pseudo random number generator used for
+                stochastic rounding.
+
+        Returns:
+            torch.Tensor: list of dataset indices to use in one epoch. Each index
+                is repeated based on its calculated repeat factor.
+        """
+        # Since repeat factors are fractional, we use stochastic rounding so
+        # that the target repeat factor is achieved in expectation over the
+        # course of training
+        rands = torch.rand(len(self._frac_part), generator=generator)
+        rep_factors = self._int_part + (rands < self._frac_part).float()
+        # Construct a list of indices in which we repeat images as specified
+        indices = []
+        for dataset_index, rep_factor in enumerate(rep_factors):
+            indices.extend([dataset_index] * int(rep_factor.item()))
+        return torch.tensor(indices, dtype=torch.int64)
+
+    def __iter__(self):
+        # Each worker takes every `world_size`-th index of the shared stream,
+        # starting at its own rank.
+        start = self._rank
+        yield from itertools.islice(self._infinite_indices(), start, None, self._world_size)
+
+    def _infinite_indices(self):
+        # A private generator seeded identically on all workers keeps the
+        # stream the same everywhere.
+        g = torch.Generator()
+        g.manual_seed(self._seed)
+        while True:
+            # Sample indices with repeats determined by stochastic rounding; each
+            # "epoch" may have a slightly different size due to the rounding.
+            indices = self._get_epoch_indices(g)
+            if self._shuffle:
+                randperm = torch.randperm(len(indices), generator=g)
+                yield from indices[randperm]
+            else:
+                yield from indices
+
+
+class InferenceSampler(Sampler):
+    """
+    Sampler that yields the indices to use for inference.
+    Inference needs to run on the __exact__ set of samples,
+    therefore when the total number of samples is not divisible by the number of workers,
+    this sampler produces different number of samples on different workers.
+    """
+
+    def __init__(self, size: int):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+        """
+        self._size = size
+        assert size > 0
+        self._rank = comm.get_rank()
+        self._world_size = comm.get_world_size()
+
+        # Split [0, size) into contiguous shards of ceil(size / world_size) indices;
+        # the last shard(s) may be shorter or empty.
+        per_worker = (self._size - 1) // self._world_size + 1
+        first = per_worker * self._rank
+        stop = min(per_worker * (self._rank + 1), self._size)
+        self._local_indices = range(first, stop)
+
+    def __iter__(self):
+        for index in self._local_indices:
+            yield index
+
+    def __len__(self):
+        return len(self._local_indices)
diff --git a/detectron2/data/samplers/grouped_batch_sampler.py b/detectron2/data/samplers/grouped_batch_sampler.py
new file mode 100644
index 0000000..138e106
--- /dev/null
+++ b/detectron2/data/samplers/grouped_batch_sampler.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+
+class GroupedBatchSampler(BatchSampler):
+    """
+    Wraps another sampler to yield a mini-batch of indices.
+    Every batch only contains elements from the same group.
+    The batches are emitted in an order that stays as close as possible
+    to the ordering of the original sampler.
+    """
+
+    def __init__(self, sampler, group_ids, batch_size):
+        """
+        Args:
+            sampler (Sampler): Base sampler.
+            group_ids (list[int]): If the sampler produces indices in range [0, N),
+                `group_ids` must be a list of `N` ints which contains the group id of each sample.
+                The group ids must be a set of integers in the range [0, num_groups).
+            batch_size (int): Size of mini-batch.
+
+        Raises:
+            ValueError: if `sampler` is not a `torch.utils.data.Sampler`.
+        """
+        if not isinstance(sampler, Sampler):
+            raise ValueError(
+                "sampler should be an instance of "
+                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+            )
+        self.sampler = sampler
+        self.group_ids = np.asarray(group_ids)
+        assert self.group_ids.ndim == 1
+        self.batch_size = batch_size
+
+        # One buffer per distinct group id; indices wait there until a full batch exists.
+        self.buffer_per_group = {}
+        for group in np.unique(self.group_ids).tolist():
+            self.buffer_per_group[group] = []
+
+    def __iter__(self):
+        for idx in self.sampler:
+            pending = self.buffer_per_group[self.group_ids[idx]]
+            pending.append(idx)
+            if len(pending) != self.batch_size:
+                continue
+            # Hand out a copy, because the buffer itself is emptied and reused.
+            yield list(pending)
+            pending.clear()
+
+    def __len__(self):
+        raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.")
diff --git a/detectron2/data/transforms/__init__.py b/detectron2/data/transforms/__init__.py
new file mode 100644
index 0000000..f7638bb
--- /dev/null
+++ b/detectron2/data/transforms/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .transform import *
+from fvcore.transforms.transform import *
+from .transform_gen import *
+
+# Export every public (non-underscore) name in this namespace, i.e. whatever the
+# star-imports above brought in.
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/data/transforms/transform.py b/detectron2/data/transforms/transform.py
new file mode 100644
index 0000000..048e672
--- /dev/null
+++ b/detectron2/data/transforms/transform.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File: transform.py
+
+import numpy as np
+from fvcore.transforms.transform import HFlipTransform, NoOpTransform, Transform
+from PIL import Image
+
+# Only the two transform classes are exported; the rotated-box helper functions
+# defined below are not part of `__all__`.
+__all__ = ["ExtentTransform", "ResizeTransform"]
+
+
+class ExtentTransform(Transform):
+    """
+    Extracts a subregion from the source image and scales it to the output size.
+
+    The fill color is used to map pixels from the source rect that fall outside
+    the source image.
+
+    See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform
+    """
+
+    # Image.BILINEAR is the same filter as the Image.LINEAR alias, which newer
+    # Pillow releases no longer provide.
+    def __init__(self, src_rect, output_size, interp=Image.BILINEAR, fill=0):
+        """
+        Args:
+            src_rect (x0, y0, x1, y1): src coordinates
+            output_size (h, w): dst image size
+            interp: PIL interpolation methods
+            fill: Fill color used when src_rect extends outside image
+        """
+        super().__init__()
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        """
+        Extract `src_rect` from `img` and scale it to `output_size`.
+
+        Args:
+            img (ndarray): the image to transform.
+            interp: PIL interpolation method; if None, the one given at
+                construction is used.
+
+        Returns:
+            ndarray: the transformed image.
+        """
+        h, w = self.output_size
+        # Compare against None rather than testing truthiness: Image.NEAREST is 0,
+        # so a truthiness test would silently replace it with the default filter.
+        resample = interp if interp is not None else self.interp
+        ret = Image.fromarray(img).transform(
+            size=(w, h),
+            method=Image.EXTENT,
+            data=self.src_rect,
+            resample=resample,
+            fill=self.fill,
+        )
+        return np.asarray(ret)
+
+    def apply_coords(self, coords):
+        """
+        Map point coordinates from the source image into the output image.
+
+        Args:
+            coords (ndarray): array whose columns 0 and 1 hold x and y.
+
+        Returns:
+            ndarray: a float32 copy of `coords` with transformed x and y.
+        """
+        # Transform image center from source coordinates into output coordinates
+        # and then map the new origin to the corner of the output image.
+        h, w = self.output_size
+        x0, y0, x1, y1 = self.src_rect
+        new_coords = coords.astype(np.float32)
+        new_coords[:, 0] -= 0.5 * (x0 + x1)
+        new_coords[:, 1] -= 0.5 * (y0 + y1)
+        new_coords[:, 0] *= w / (x1 - x0)
+        new_coords[:, 1] *= h / (y1 - y0)
+        new_coords[:, 0] += 0.5 * w
+        new_coords[:, 1] += 0.5 * h
+        return new_coords
+
+    def apply_segmentation(self, segmentation):
+        """
+        Transform a segmentation map, using nearest-neighbor resampling.
+        """
+        segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
+        return segmentation
+
+
+class ResizeTransform(Transform):
+    """
+    Resize an image of a known size to a target size.
+    """
+
+    def __init__(self, h, w, new_h, new_w, interp):
+        """
+        Args:
+            h, w (int): original image size
+            new_h, new_w (int): new image size
+            interp: PIL interpolation methods
+        """
+        # TODO decide on PIL vs opencv
+        super().__init__()
+        self._set_attributes(locals())
+
+    def apply_image(self, img, interp=None):
+        """
+        Resize `img`, which must have the original size (h, w).
+        `interp` overrides the interpolation method given at construction.
+        """
+        assert img.shape[:2] == (self.h, self.w)
+        method = self.interp if interp is None else interp
+        resized = Image.fromarray(img).resize((self.new_w, self.new_h), method)
+        return np.asarray(resized)
+
+    def apply_coords(self, coords):
+        """
+        Scale the x (column 0) and y (column 1) coordinates in-place and return `coords`.
+        """
+        scale_x = self.new_w * 1.0 / self.w
+        scale_y = self.new_h * 1.0 / self.h
+        coords[:, 0] = coords[:, 0] * scale_x
+        coords[:, 1] = coords[:, 1] * scale_y
+        return coords
+
+    def apply_segmentation(self, segmentation):
+        """
+        Resize a segmentation map, using nearest-neighbor interpolation.
+        """
+        return self.apply_image(segmentation, interp=Image.NEAREST)
+
+
+def HFlip_rotated_box(transform, rotated_boxes):
+    """
+    Horizontally flip rotated boxes. The input array is modified in-place and returned.
+
+    Args:
+        transform: the flip transform; its `width` attribute is used.
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    x_center = rotated_boxes[:, 0]
+    angle = rotated_boxes[:, 4]
+    # Mirror the center about the vertical axis ...
+    rotated_boxes[:, 0] = transform.width - x_center
+    # ... which also reverses the direction of rotation.
+    rotated_boxes[:, 4] = np.negative(angle)
+    return rotated_boxes
+
+
+def Resize_rotated_box(transform, rotated_boxes):
+    """
+    Resize rotated boxes. For details of how these (approximation)
+    formulas are derived, please refer to :meth:`RotatedBoxes.scale`.
+    The input array is modified in-place and returned.
+
+    Args:
+        transform: the resize transform; its `w`, `h`, `new_w`, `new_h` attributes are used.
+        rotated_boxes (ndarray): Nx5 floating point array of
+            (x_center, y_center, width, height, angle_degrees) format
+            in absolute coordinates.
+    """
+    sx = transform.new_w * 1.0 / transform.w
+    sy = transform.new_h * 1.0 / transform.h
+
+    # Centers scale independently along each axis.
+    rotated_boxes[:, 0] *= sx
+    rotated_boxes[:, 1] *= sy
+
+    # Sizes and angle depend on the box orientation *before* resizing, so the
+    # trigonometric terms are computed up front from the original angle.
+    angle_rad = rotated_boxes[:, 4] * np.pi / 180.0
+    cos_a = np.cos(angle_rad)
+    sin_a = np.sin(angle_rad)
+    width_scale = np.sqrt(np.square(sx * cos_a) + np.square(sy * sin_a))
+    height_scale = np.sqrt(np.square(sx * sin_a) + np.square(sy * cos_a))
+    rotated_boxes[:, 2] *= width_scale
+    rotated_boxes[:, 3] *= height_scale
+    rotated_boxes[:, 4] = np.arctan2(sx * sin_a, sy * cos_a) * 180 / np.pi
+
+    return rotated_boxes
+
+
+# Register handlers for the "rotated_box" data type: flipping and resizing use the
+# helper functions defined above, and the no-op transform returns the boxes unchanged.
+HFlipTransform.register_type("rotated_box", HFlip_rotated_box)
+NoOpTransform.register_type("rotated_box", lambda t, x: x)
+ResizeTransform.register_type("rotated_box", Resize_rotated_box)
diff --git a/detectron2/data/transforms/transform_gen.py b/detectron2/data/transforms/transform_gen.py
new file mode 100644
index 0000000..c5cb5b8
--- /dev/null
+++ b/detectron2/data/transforms/transform_gen.py
@@ -0,0 +1,445 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File: transform_gen.py
+
+import inspect
+import numpy as np
+import pprint
+import sys
+from abc import ABCMeta, abstractmethod
+from fvcore.transforms.transform import (
+    BlendTransform,
+    CropTransform,
+    HFlipTransform,
+    NoOpTransform,
+    Transform,
+    TransformList,
+)
+from PIL import Image
+
+from .transform import ExtentTransform, ResizeTransform
+
+__all__ = [
+    "RandomBrightness",
+    "RandomContrast",
+    "RandomCrop",
+    "RandomExtent",
+    "RandomFlip",
+    "RandomSaturation",
+    "RandomLighting",
+    "Resize",
+    "ResizeShortestEdge",
+    "TransformGen",
+    "apply_transform_gens",
+]
+
+
+def check_dtype(img):
+    """Assert ``img`` is a 2D/3D ndarray whose dtype is uint8 or a non-integer (e.g. float)."""
+    assert isinstance(img, np.ndarray), "[TransformGen] Needs an numpy array, but got a {}!".format(
+        type(img)
+    )
+    # img.dtype is an np.dtype object, never an np.integer instance, so test with issubdtype.
+    ok = img.dtype == np.uint8 or not np.issubdtype(img.dtype, np.integer)
+    msg = "[TransformGen] Got image of type {}, use uint8 or floating points instead!"
+    assert ok, msg.format(img.dtype)
+    assert img.ndim in [2, 3], img.ndim
+
+
+class TransformGen(metaclass=ABCMeta):
+    """
+    TransformGen takes an image of type uint8 in range [0, 255], or
+    floating point in range [0, 1] or [0, 255] as input.
+
+    It creates a :class:`Transform` based on the given image, sometimes with randomness.
+    The transform can then be used to transform images
+    or other data (boxes, points, annotations, etc.) associated with it.
+
+    The assumption made in this class
+    is that the image itself is sufficient to instantiate a transform.
+    When this assumption is not true, you need to create the transforms on your own.
+
+    A list of `TransformGen` can be applied with :func:`apply_transform_gens`.
+    """
+
+    def _init(self, params=None):
+        """
+        Store each item of `params` (typically ``locals()`` of ``__init__``) as an attribute,
+        skipping ``self`` and any name that starts with an underscore.
+        """
+        if params:
+            for k, v in params.items():
+                if k != "self" and not k.startswith("_"):
+                    setattr(self, k, v)
+
+    @abstractmethod
+    def get_transform(self, img):
+        """
+        Args:
+            img (ndarray): the image to generate a transform for.
+        Returns:
+            Transform: the transform to apply to `img` and its associated data.
+        """
+
+    def _rand_range(self, low=1.0, high=None, size=None):
+        """
+        Uniform float random number between low and high (between 0 and low if high is None).
+        """
+        if high is None:
+            low, high = 0, low
+        if size is None:
+            size = []
+        return np.random.uniform(low, high, size)
+
+    def __repr__(self):
+        """
+        Produce something like:
+        "MyTransformGen(field1={self.field1}, field2={self.field2})"
+        Arguments that are (``is``) their default are omitted. Falls back to the default object
+        repr if __init__ takes *args/**kwargs or a parameter has no attribute of the same name.
+        """
+        argstr = []
+        for name, param in inspect.signature(self.__init__).parameters.items():
+            # Explicit checks instead of assert, so the fallback still works under `python -O`.
+            if param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD) or not hasattr(self, name):
+                return super().__repr__()
+            attr = getattr(self, name)
+            if attr is not param.default:
+                argstr.append("{}={}".format(name, pprint.pformat(attr)))
+        return "{}({})".format(type(self).__name__, ", ".join(argstr))
+
+    __str__ = __repr__
+
+
+class RandomFlip(TransformGen):
+    """
+    Generate a horizontal flip with probability ``prob``, and a no-op otherwise.
+
+    TODO Vertical flip to be implemented.
+    """
+
+    def __init__(self, prob=0.5):
+        """
+        Args:
+            prob (float): probability of flip.
+        """
+        super().__init__()
+        # Only horizontal flipping exists for now; both flags get stored as attributes.
+        # TODO implement vertical flip when we need it
+        horiz, vert = True, False
+
+        if horiz and vert:
+            raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.")
+        if not (horiz or vert):
+            raise ValueError("At least one of horiz or vert has to be True!")
+        self._init(locals())
+
+    def get_transform(self, img):
+        _, width = img.shape[:2]
+        # _rand_range() draws uniformly between 0 and 1; flip when the draw is below prob.
+        if self._rand_range() < self.prob:
+            return HFlipTransform(width)
+        return NoOpTransform()
+
+
+class Resize(TransformGen):
+    """ Resize every image to one fixed target shape"""
+
+    def __init__(self, shape, interp=Image.BILINEAR):
+        """
+        Args:
+            shape: (h, w) tuple, or a single int used for both height and width
+            interp: PIL interpolation method
+        """
+        # Normalize to a tuple before it is stored as an attribute by _init().
+        shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
+        self._init(locals())
+
+    def get_transform(self, img):
+        """Return a :class:`ResizeTransform` from the size of `img` to `self.shape`."""
+        old_h, old_w = img.shape[0], img.shape[1]
+        new_h, new_w = self.shape[0], self.shape[1]
+        return ResizeTransform(old_h, old_w, new_h, new_w, self.interp)
+
+
+class ResizeShortestEdge(TransformGen):
+    """
+    Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
+    If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
+    """
+
+    def __init__(
+        self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR
+    ):
+        """
+        Args:
+            short_edge_length (list[int]): If ``sample_style=="range"``,
+                a [min, max] interval from which to sample the shortest edge length.
+                If ``sample_style=="choice"``, a list of shortest edge lengths to sample from.
+                A single int is treated as the pair (value, value).
+            max_size (int): maximum allowed longest edge length.
+            sample_style (str): either "range" or "choice".
+            interp: PIL interpolation method.
+        """
+        super().__init__()
+        assert sample_style in ["range", "choice"], sample_style
+
+        self.is_range = sample_style == "range"
+        if isinstance(short_edge_length, int):
+            short_edge_length = (short_edge_length, short_edge_length)
+        self._init(locals())
+
+    def get_transform(self, img):
+        """Sample a short-edge size and return the matching resize (a no-op when it is 0)."""
+        h, w = img.shape[:2]
+
+        if self.is_range:
+            size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1)
+        else:
+            size = np.random.choice(self.short_edge_length)
+        if size == 0:
+            return NoOpTransform()
+
+        # Put the shorter side at `size`, keeping the aspect ratio.
+        scale = size * 1.0 / min(h, w)
+        newh, neww = (size, scale * w) if h < w else (scale * h, size)
+        longest = max(newh, neww)
+        if longest > self.max_size:
+            shrink = self.max_size * 1.0 / longest
+            newh, neww = newh * shrink, neww * shrink
+        # Round both sides half-up to integer pixel sizes.
+        return ResizeTransform(h, w, int(newh + 0.5), int(neww + 0.5), self.interp)
+
+
+class RandomCrop(TransformGen):
+    """
+    Randomly crop a subimage out of an image.
+    """
+
+    def __init__(self, crop_type: str, crop_size):
+        """
+        Args:
+            crop_type (str): one of "relative_range", "relative", "absolute".
+                See `config/defaults.py` for explanation.
+            crop_size (tuple[float]): the relative ratio or absolute pixels of
+                height and width
+        """
+        super().__init__()
+        assert crop_type in ["relative_range", "relative", "absolute"]
+        self._init(locals())
+
+    def get_transform(self, img):
+        h, w = img.shape[:2]
+        croph, cropw = self.get_crop_size((h, w))
+        assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self)
+        h0 = np.random.randint(h - croph + 1)
+        w0 = np.random.randint(w - cropw + 1)
+        return CropTransform(w0, h0, cropw, croph)
+
+    def get_crop_size(self, image_size):
+        """
+        Args:
+            image_size (tuple): height, width
+        Returns:
+            crop_size (tuple): height, width in absolute pixels
+        """
+        h, w = image_size
+        if self.crop_type == "relative":
+            ch, cw = self.crop_size
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "relative_range":
+            crop_size = np.asarray(self.crop_size, dtype=np.float32)
+            ch, cw = crop_size + np.random.rand(2) * (1 - crop_size)
+            return int(h * ch + 0.5), int(w * cw + 0.5)
+        elif self.crop_type == "absolute":
+            return self.crop_size
+        else:
+            # __init__ validates crop_type, so this only triggers if it was changed afterwards.
+            raise NotImplementedError("Unknown crop type {}".format(self.crop_type))
+
+
+class RandomExtent(TransformGen):
+    """
+    Outputs an image by cropping a random "subrect" of the source image.
+
+    The subrect can be parameterized to include pixels outside the source image,
+    in which case they will be set to zeros (i.e. black). The size of the output
+    image will vary with the size of the random subrect.
+    """
+
+    def __init__(self, scale_range, shift_range):
+        """
+        Args:
+            scale_range (l, h): Range of input-to-output size scaling factor
+            shift_range (x, y): Range of shifts of the cropped subrect. The rect
+                is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)],
+                where (w, h) is the (width, height) of the input image. Set each
+                component to zero to crop at the image's center.
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        """Sample a scale and a shift, and return the resulting :class:`ExtentTransform`."""
+        img_h, img_w = img.shape[:2]
+        half_w, half_h = 0.5 * img_w, 0.5 * img_h
+
+        # Start from the full image as (x0, y0, x1, y1) centered on the origin, then scale it.
+        rect = np.array([-half_w, -half_h, half_w, half_h])
+        rect *= np.random.uniform(self.scale_range[0], self.scale_range[1])
+
+        # Shift x (even slots) and y (odd slots) independently; the x shift is drawn first.
+        rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5)
+        rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5)
+
+        # Move the origin from the image center to the image corner.
+        rect[0::2] += half_w
+        rect[1::2] += half_h
+
+        x0, y0, x1, y1 = rect
+        # output_size is (height, width) of the subrect, truncated to integers.
+        return ExtentTransform(
+            src_rect=(x0, y0, x1, y1), output_size=(int(y1 - y0), int(x1 - x0))
+        )
+
+
+class RandomContrast(TransformGen):
+    """
+    Randomly transforms image contrast by blending the image with its mean value.
+
+    Contrast intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce contrast
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase contrast
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        alpha = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=img.mean(), src_weight=1 - alpha, dst_weight=alpha)
+
+
+class RandomBrightness(TransformGen):
+    """
+    Randomly transforms image brightness by blending the image with a constant 0.
+
+    Brightness intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce brightness
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase brightness
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation
+            intensity_max (float): Maximum augmentation
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        alpha = np.random.uniform(self.intensity_min, self.intensity_max)
+        return BlendTransform(src_image=0, src_weight=1 - alpha, dst_weight=alpha)
+
+
+class RandomSaturation(TransformGen):
+    """
+    Randomly transforms image saturation by blending the image with its grayscale version.
+    Saturation intensity is uniformly sampled in (intensity_min, intensity_max).
+    - intensity < 1 will reduce saturation (make the image more grayscale)
+    - intensity = 1 will preserve the input image
+    - intensity > 1 will increase saturation
+
+    See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html
+    """
+
+    def __init__(self, intensity_min, intensity_max):
+        """
+        Args:
+            intensity_min (float): Minimum augmentation (1 preserves input).
+            intensity_max (float): Maximum augmentation (1 preserves input).
+        """
+        super().__init__()
+        self._init(locals())
+
+    def get_transform(self, img):
+        assert img.shape[-1] == 3, "Saturation only works on RGB images"
+        alpha = np.random.uniform(self.intensity_min, self.intensity_max)
+        channel_weights = [0.299, 0.587, 0.114]
+        gray = img.dot(channel_weights)[:, :, np.newaxis]
+        return BlendTransform(src_image=gray, src_weight=1 - alpha, dst_weight=alpha)
+
+
+class RandomLighting(TransformGen):
+    """
+    Randomly transforms image color using fixed PCA over ImageNet.
+
+    The degree of color jittering is randomly sampled via a normal distribution,
+    with standard deviation given by the scale parameter.
+    """
+
+    def __init__(self, scale):
+        """
+        Args:
+            scale (float): Standard deviation of principal component weighting.
+        """
+        super().__init__()
+        self._init(locals())
+        self.eigen_vecs = np.array(
+            [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]]
+        )
+        self.eigen_vals = np.array([0.2175, 0.0188, 0.0045])
+
+    def get_transform(self, img):
+        assert img.shape[-1] == 3, "Lighting only works on RGB images"
+        weights = np.random.normal(scale=self.scale, size=3)
+        # 3-vector used as the blend source: eigenvectors times eigenvalue-scaled random weights.
+        offset = self.eigen_vecs.dot(weights * self.eigen_vals)
+        return BlendTransform(src_image=offset, src_weight=1.0, dst_weight=1.0)
+
+
+def apply_transform_gens(transform_gens, img):
+    """
+    Apply a list of :class:`TransformGen` on the input image, and
+    returns the transformed image and a list of transforms.
+
+    We cannot simply create and return all transforms without
+    applying it to the image, because a subsequent transform may
+    need the output of the previous one.
+
+    Args:
+        transform_gens (list): list of :class:`TransformGen` instance to
+            be applied.
+        img (ndarray): uint8 or floating point images with 1 or 3 channels.
+
+    Returns:
+        ndarray: the transformed image
+        TransformList: contain the transforms that's used.
+    """
+    # Validate every generator and the image before anything is applied.
+    for gen in transform_gens:
+        assert isinstance(gen, TransformGen), gen
+    check_dtype(img)
+
+    applied = []
+    for gen in transform_gens:
+        # Each generator sees the image as already transformed by the previous ones.
+        tfm = gen.get_transform(img)
+        assert isinstance(
+            tfm, Transform
+        ), "TransformGen {} must return an instance of Transform! Got {} instead".format(gen, tfm)
+        applied.append(tfm)
+        img = tfm.apply_image(img)
+    return img, TransformList(applied)
diff --git a/detectron2/engine/__init__.py b/detectron2/engine/__init__.py
new file mode 100644
index 0000000..6a4538d
--- /dev/null
+++ b/detectron2/engine/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+from .launch import *
+from .train_loop import *
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
+
+
+# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__)
+# but still make them available here
+from .hooks import *
+from .defaults import *
diff --git a/detectron2/engine/defaults.py b/detectron2/engine/defaults.py
new file mode 100644
index 0000000..4cddc91
--- /dev/null
+++ b/detectron2/engine/defaults.py
@@ -0,0 +1,493 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+"""
+This file contains components with some default boilerplate logic user may need
+in training / testing. They will not work for everyone, but many users may find them useful.
+
+The behavior of functions/classes in this file is subject to change,
+since they are meant to represent the "common default behavior" people need in their projects.
+"""
+
+import argparse
+import logging
+import os
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+from fvcore.nn.precise_bn import get_bn_modules
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.data.transforms as T
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.data import (
+    MetadataCatalog,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.evaluation import (
+    DatasetEvaluator,
+    inference_on_dataset,
+    print_csv_format,
+    verify_results,
+)
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils import comm
+from detectron2.utils.collect_env import collect_env_info
+from detectron2.utils.env import seed_all_rng
+from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
+from detectron2.utils.logger import setup_logger
+
+from . import hooks
+from .train_loop import SimpleTrainer
+
+__all__ = ["default_argument_parser", "default_setup", "DefaultPredictor", "DefaultTrainer"]
+
+
+def default_argument_parser():
+    """
+    Create a parser with some common arguments used by detectron2 users.
+
+    Returns:
+        argparse.ArgumentParser:
+    """
+    # PyTorch still may leave orphan processes in multi-gpu training.
+    # Therefore we use a deterministic way to obtain port,
+    # so that users are aware of orphan processes by seeing the port occupied.
+    port = 2 ** 15 + 2 ** 14 + hash(os.getpid()) % 2 ** 14
+
+    parser = argparse.ArgumentParser(description="Detectron2 Training")
+    add = parser.add_argument
+    add("--config-file", default="", metavar="FILE", help="path to config file")
+    add(
+        "--resume",
+        action="store_true",
+        help="whether to attempt to resume from the checkpoint directory",
+    )
+    add("--eval-only", action="store_true", help="perform evaluation only")
+    add("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
+    add("--num-machines", type=int, default=1)
+    add("--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)")
+    add("--dist-url", default="tcp://127.0.0.1:{}".format(port))
+    add(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+
+    return parser
+
+
+def default_setup(cfg, args):
+    """
+    Perform some basic common setups at the beginning of a job, including:
+
+    1. Set up the detectron2 logger
+    2. Log basic information about environment, cmdline arguments, and config
+    3. Backup the config to the output directory
+    4. Seed the RNGs via `seed_all_rng` and, unless `args.eval_only`, set cudnn benchmark
+
+    Args:
+        cfg (CfgNode): the full config to be used
+        args (argparse.NameSpace): the command line arguments to be logged
+    """
+    output_dir = cfg.OUTPUT_DIR
+    if comm.is_main_process() and output_dir:
+        PathManager.mkdirs(output_dir)
+
+    rank = comm.get_rank()
+    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
+    logger = setup_logger(output_dir, distributed_rank=rank)
+
+    logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
+    logger.info("Environment info:\n" + collect_env_info())
+    logger.info("Command line arguments: " + str(args))
+    if getattr(args, "config_file", ""):
+        # Skip a missing/empty path (the parser default is "") and close the file after reading.
+        with PathManager.open(args.config_file, "r") as f:
+            logger.info(
+                "Contents of args.config_file={}:\n{}".format(args.config_file, f.read())
+            )
+
+    logger.info("Running with full config:\n{}".format(cfg))
+    if comm.is_main_process() and output_dir:
+        # Note: some of our scripts may expect the existence of
+        # config.yaml in output directory
+        path = os.path.join(output_dir, "config.yaml")
+        with PathManager.open(path, "w") as f:
+            f.write(cfg.dump())
+        logger.info("Full config saved to {}".format(os.path.abspath(path)))
+
+    # make sure each worker has a different, yet deterministic seed if specified
+    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
+
+    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
+    # typical validation set.
+    if not (hasattr(args, "eval_only") and args.eval_only):
+        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
+
+
+class DefaultPredictor:
+    """
+    Create a simple end-to-end predictor with the given config.
+    The predictor takes a BGR image, resizes it to the specified resolution,
+    runs the model and produces a dict of predictions.
+
+    This predictor takes care of model loading and input preprocessing for you.
+    If you'd like to do anything more fancy, please refer to its source code
+    as examples to build and use the model manually.
+
+    Attributes:
+        metadata (Metadata): the metadata of the underlying dataset, obtained from
+            cfg.DATASETS.TEST.
+
+    Examples:
+
+    .. code-block:: python
+
+        pred = DefaultPredictor(cfg)
+        outputs = pred(inputs)
+    """
+
+    def __init__(self, cfg):
+        self.cfg = cfg.clone()  # cfg can be modified by model
+        self.model = build_model(self.cfg)
+        self.model.eval()
+        self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+        # Load the weights named by the config into the freshly built model.
+        DetectionCheckpointer(self.model).load(cfg.MODEL.WEIGHTS)
+
+        # A [min, max] range with equal ends: the short edge is always MIN_SIZE_TEST.
+        test_size = cfg.INPUT.MIN_SIZE_TEST
+        self.transform_gen = T.ResizeShortestEdge([test_size, test_size], cfg.INPUT.MAX_SIZE_TEST)
+
+        self.input_format = cfg.INPUT.FORMAT
+        assert self.input_format in ["RGB", "BGR"], self.input_format
+
+    @torch.no_grad()
+    def __call__(self, original_image):
+        """
+        Args:
+            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+
+        Returns:
+            predictions (dict): the output of the model
+        """
+        if self.input_format == "RGB":
+            # The model expects RGB, so reverse the channel axis of the BGR input.
+            original_image = original_image[:, :, ::-1]
+        height, width = original_image.shape[:2]
+
+        # Resize, then convert HWC -> CHW float32 for the model.
+        tfm = self.transform_gen.get_transform(original_image)
+        chw = tfm.apply_image(original_image).astype("float32").transpose(2, 0, 1)
+        inputs = {"image": torch.as_tensor(chw), "height": height, "width": width}
+        # The model takes a list of inputs; return the result for the single image.
+        return self.model([inputs])[0]
+
+
+class DefaultTrainer(SimpleTrainer):
+    """
+    A trainer with default training logic. Compared to `SimpleTrainer`, it
+    contains the following logic in addition:
+
+    1. Create model, optimizer, scheduler, dataloader from the given config.
+    2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists.
+    3. Register a few common hooks.
+
+    It is created to simplify the **standard model training workflow** and reduce code boilerplate
+    for users who only need the standard training workflow, with standard features.
+    It means this class makes *many assumptions* about your training logic that
+    may easily become invalid in a new research. In fact, any assumptions beyond those made in the
+    :class:`SimpleTrainer` are too much for research.
+
+    The code of this class has been annotated about the restrictive assumptions it makes.
+    When they do not work for you, you're encouraged to:
+
+    1. Overwrite methods of this class, OR:
+    2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
+       nothing else. You can then add your own hooks if needed. OR:
+    3. Write your own training loop similar to `tools/plain_train_net.py`.
+
+    Also note that the behavior of this class, like other functions/classes in
+    this file, is not stable, since it is meant to represent the "common default behavior".
+    It is only guaranteed to work well with the standard models and training workflow in detectron2.
+    To obtain more stable behavior, write your own training logic with other public APIs.
+
+    Attributes:
+        scheduler:
+        checkpointer (DetectionCheckpointer):
+        cfg (CfgNode):
+
+    Examples:
+
+    .. code-block:: python
+
+        trainer = DefaultTrainer(cfg)
+        trainer.resume_or_load()  # load last checkpoint or MODEL.WEIGHTS
+        trainer.train()
+    """
+
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg (CfgNode): config used to build the model, optimizer, loader, scheduler and hooks.
+        """
+        # Assume these objects must be constructed in this order.
+        model = self.build_model(cfg)
+        optimizer = self.build_optimizer(cfg, model)
+        data_loader = self.build_train_loader(cfg)
+
+        # For training, wrap with DDP. But don't need this for inference.
+        if comm.get_world_size() > 1:
+            model = DistributedDataParallel(
+                model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+            )
+        super().__init__(model, data_loader, optimizer)
+
+        self.scheduler = self.build_lr_scheduler(cfg, optimizer)
+        # Assume no other objects need to be checkpointed.
+        # We can later make it checkpoint the stateful hooks
+        self.checkpointer = DetectionCheckpointer(
+            # Assume you want to save checkpoints together with logs/statistics
+            model,
+            cfg.OUTPUT_DIR,
+            optimizer=optimizer,
+            scheduler=self.scheduler,
+        )
+        self.start_iter = 0  # overwritten by resume_or_load()
+        self.max_iter = cfg.SOLVER.MAX_ITER
+        self.cfg = cfg  # must be set before build_hooks(), which reads self.cfg
+
+        self.register_hooks(self.build_hooks())
+
+    def resume_or_load(self, resume=True):
+        """
+        Load weights through the checkpointer and set `self.start_iter` accordingly.
+
+        If `resume==True`, and last checkpoint exists, resume from it.
+        Otherwise, load a model specified by the config (`cfg.MODEL.WEIGHTS`).
+
+        Args:
+            resume (bool): whether to do resume or not
+        """
+        checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume)
+
+        # The checkpoint stores the training iteration that just finished, thus we start
+        # at the next iteration. Without an "iteration" entry, -1 + 1 makes training
+        # start at iteration zero.
+        last_finished_iter = checkpoint.get("iteration", -1)
+        self.start_iter = last_finished_iter + 1
+
+    def build_hooks(self):
+        """
+        Build a list of default hooks, including timing, evaluation,
+        checkpointing, lr scheduling, precise BN, writing events.
+
+        Returns:
+            list[HookBase]: in registration order; holds None in place of PreciseBN when unused.
+        """
+        cfg = self.cfg.clone()  # local copy: the edit below is not made on self.cfg
+        cfg.defrost()
+        cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN
+
+        ret = [
+            hooks.IterationTimer(),
+            hooks.LRScheduler(self.optimizer, self.scheduler),
+            hooks.PreciseBN(
+                # Run at the same freq as (but before) evaluation.
+                cfg.TEST.EVAL_PERIOD,
+                self.model,
+                # Build a new data loader to not affect training
+                self.build_train_loader(cfg),
+                cfg.TEST.PRECISE_BN.NUM_ITER,
+            )
+            if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
+            else None,
+        ]
+
+        # Do PreciseBN before checkpointer, because it updates the model and need to
+        # be saved by checkpointer.
+        # This is not always the best: if checkpointing has a different frequency,
+        # some checkpoints may have more precise statistics than others.
+        if comm.is_main_process():
+            ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
+
+        def test_and_save_results():
+            self._last_eval_results = self.test(self.cfg, self.model)  # read later by train()
+            return self._last_eval_results
+
+        # Do evaluation after checkpointer, because then if it fails,
+        # we can use the saved checkpoint to debug.
+        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
+
+        if comm.is_main_process():
+            # run writers in the end, so that evaluation metrics are written
+            ret.append(hooks.PeriodicWriter(self.build_writers()))
+        return ret
+
+    def build_writers(self):
+        """
+        Build a list of writers to be used. By default it contains
+        writers that write metrics to the screen,
+        a json file, and a tensorboard event file respectively.
+        If you'd like a different list of writers, you can override it in
+        your trainer.
+
+        Returns:
+            list[EventWriter]: a list of :class:`EventWriter` objects.
+
+        It is now implemented by:
+
+        .. code-block:: python
+
+            return [
+                CommonMetricPrinter(self.max_iter),
+                JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
+                TensorboardXWriter(self.cfg.OUTPUT_DIR),
+            ]
+
+        """
+        output_dir = self.cfg.OUTPUT_DIR
+        # Assume the default print/log frequency.
+        # The printer may not always print what you want to see: it prints "common" metrics only.
+        screen_writer = CommonMetricPrinter(self.max_iter)
+        json_writer = JSONWriter(os.path.join(output_dir, "metrics.json"))
+        tensorboard_writer = TensorboardXWriter(output_dir)
+        return [screen_writer, json_writer, tensorboard_writer]
+
+    def train(self):
+        """
+        Run training from `self.start_iter` to `self.max_iter`.
+        Returns:
+            The last evaluation results if any were saved and this is the main process, else None.
+        """
+        super().train(self.start_iter, self.max_iter)
+        if not (hasattr(self, "_last_eval_results") and comm.is_main_process()):
+            return None
+        verify_results(self.cfg, self._last_eval_results)
+        return self._last_eval_results
+
+    @classmethod
+    def build_model(cls, cfg):
+        """
+        Build the model with :func:`detectron2.modeling.build_model` and log its structure.
+        Override it if you'd like a different model.
+
+        Returns:
+            torch.nn.Module:
+        """
+        model = build_model(cfg)
+        # Log the string form of the model through this module's logger.
+        logging.getLogger(__name__).info("Model:\n{}".format(model))
+        return model
+
+    @classmethod
+    def build_optimizer(cls, cfg, model):
+        """
+        Build the optimizer for `model` by calling :func:`detectron2.solver.build_optimizer`.
+        Override it in a subclass if you'd like a different optimizer.
+
+        Returns:
+            torch.optim.Optimizer:
+        """
+        return build_optimizer(cfg, model)
+
+    @classmethod
+    def build_lr_scheduler(cls, cfg, optimizer):
+        """
+        Returns the LR scheduler built by :func:`detectron2.solver.build_lr_scheduler`
+        for `optimizer`. Override it if you'd like a different scheduler.
+        """
+        return build_lr_scheduler(cfg, optimizer)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        """
+        Build the training loader via :func:`detectron2.data.build_detection_train_loader`.
+        Override it in a subclass if you'd like a different data loader.
+
+        Returns:
+            iterable
+        """
+        return build_detection_train_loader(cfg)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Build the loader for `dataset_name` via
+        :func:`detectron2.data.build_detection_test_loader`.
+        Override it in a subclass if you'd like a different data loader.
+        Returns:
+            iterable
+        """
+        return build_detection_test_loader(cfg, dataset_name)
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name):
+        """
+        Returns:
+            DatasetEvaluator
+        Not implemented by default: this version always raises NotImplementedError, which
+        :meth:`test` catches (logging a warning and recording empty results).
+        """
+        raise NotImplementedError(
+            "Please either implement `build_evaluator()` in subclasses, or pass "
+            "your evaluator as arguments to `DefaultTrainer.test()`."
+        )
+
+    @classmethod
+    def test(cls, cfg, model, evaluators=None):
+        """
+        Args:
+            cfg (CfgNode):
+            model (nn.Module):
+            evaluators (list[DatasetEvaluator] or None): if None, will call
+                :meth:`build_evaluator`. Otherwise, must have the same length as
+                `cfg.DATASETS.TEST`. A single DatasetEvaluator is wrapped in a list.
+        Returns:
+            dict: result metrics keyed by dataset name; when there is exactly one
+                dataset, that dataset's metrics are returned directly.
+        """
+        logger = logging.getLogger(__name__)
+        if isinstance(evaluators, DatasetEvaluator):
+            evaluators = [evaluators]
+        if evaluators is not None:
+            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+                len(cfg.DATASETS.TEST), len(evaluators)
+            )
+
+        results = OrderedDict()
+        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+            data_loader = cls.build_test_loader(cfg, dataset_name)
+            # When evaluators are passed in as arguments,
+            # implicitly assume that evaluators can be created before data_loader.
+            if evaluators is not None:
+                evaluator = evaluators[idx]
+            else:
+                try:
+                    evaluator = cls.build_evaluator(cfg, dataset_name)
+                except NotImplementedError:
+                    logger.warning(
+                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+                        "or implement its `build_evaluator` method."
+                    )
+                    results[dataset_name] = {}
+                    continue
+            results_i = inference_on_dataset(model, data_loader, evaluator)
+            results[dataset_name] = results_i
+            if comm.is_main_process():
+                assert isinstance(
+                    results_i, dict
+                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+                    results_i
+                )
+                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+                print_csv_format(results_i)
+
+        if len(results) == 1:
+            results = list(results.values())[0]
+        return results
diff --git a/detectron2/engine/hooks.py b/detectron2/engine/hooks.py
new file mode 100644
index 0000000..9697fb2
--- /dev/null
+++ b/detectron2/engine/hooks.py
@@ -0,0 +1,424 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import datetime
+import logging
+import os
+import tempfile
+import time
+from collections import Counter
+import torch
+from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
+from fvcore.common.file_io import PathManager
+from fvcore.common.timer import Timer
+from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats
+
+import detectron2.utils.comm as comm
+from detectron2.evaluation.testing import flatten_results_dict
+from detectron2.utils.events import EventStorage, EventWriter
+
+from .train_loop import HookBase
+
+__all__ = [
+    "CallbackHook",
+    "IterationTimer",
+    "PeriodicWriter",
+    "PeriodicCheckpointer",
+    "LRScheduler",
+    "AutogradProfiler",
+    "EvalHook",
+    "PreciseBN",
+]
+
+
+"""
+Implement some common hooks.
+"""
+
+
+class CallbackHook(HookBase):
+    """
+    A hook assembled from user-supplied callables, one per training stage.
+    """
+
+    def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
+        """
+        Args:
+            before_train, after_train, before_step, after_step (callable or None):
+                keyword-only; each one is invoked as ``fn(trainer)`` at its stage.
+        """
+        self._before_train, self._after_train = before_train, after_train
+        self._before_step, self._after_step = before_step, after_step
+
+    def _invoke(self, callback):
+        if callback:
+            callback(self.trainer)
+
+    def before_train(self):
+        self._invoke(self._before_train)
+
+    def before_step(self):
+        self._invoke(self._before_step)
+
+    def after_step(self):
+        self._invoke(self._after_step)
+
+    def after_train(self):
+        self._invoke(self._after_train)
+        # The callbacks are often closures capturing the trainer; drop all four
+        # attributes so the hook does not keep a reference cycle alive.
+        del self._before_train, self._after_train
+        del self._before_step, self._after_step
+
+
+class IterationTimer(HookBase):
+    """
+    Measure the time spent on each iteration (each run_step call in the trainer),
+    and log a summary at the end of training.
+
+    The measured interval runs from this hook's :meth:`before_step` to its
+    :meth:`after_step`. Under the convention that :meth:`before_step` of all hooks
+    takes a negligible amount of time, :class:`IterationTimer` should be placed
+    at the beginning of the list of hooks to obtain accurate timing.
+    """
+
+    def __init__(self, warmup_iter=3):
+        """
+        Args:
+            warmup_iter (int): the number of iterations at the beginning to exclude
+                from timing.
+        """
+        self._warmup_iter = warmup_iter
+        self._step_timer = Timer()
+
+    def before_train(self):
+        self._start_time = time.perf_counter()
+        # Created and paused right away: it is resumed only while a step is running.
+        self._total_timer = Timer()
+        self._total_timer.pause()
+
+    def before_step(self):
+        self._step_timer.reset()
+        self._total_timer.resume()
+
+    def after_step(self):
+        # +1 because the current iteration has just finished
+        finished = self.trainer.iter - self.trainer.start_iter + 1
+        if finished < self._warmup_iter:
+            # Still warming up: restart both clocks so that warmup is not counted.
+            self._start_time = time.perf_counter()
+            self._total_timer.reset()
+        else:
+            elapsed = self._step_timer.seconds()
+            self.trainer.storage.put_scalars(time=elapsed)
+
+        self._total_timer.pause()
+
+    def after_train(self):
+        logger = logging.getLogger(__name__)
+        wall_time = time.perf_counter() - self._start_time
+        step_time = self._total_timer.seconds()
+        # Time that was not spent between before_step and after_step goes to hooks.
+        hook_time = wall_time - step_time
+        timed_iters = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
+
+        def as_hms(seconds):
+            # Render a duration as a timedelta string, truncated to whole seconds.
+            return str(datetime.timedelta(seconds=int(seconds)))
+
+        if timed_iters > 0 and step_time > 0:
+            # Speed is meaningful only after warmup
+            # NOTE this format is parsed by grep in some scripts
+            logger.info(
+                "Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
+                    timed_iters, as_hms(step_time), step_time / timed_iters
+                )
+            )
+
+        logger.info(
+            "Total training time: {} ({} on hooks)".format(as_hms(wall_time), as_hms(hook_time))
+        )
+
+
+class PeriodicWriter(HookBase):
+    """
+    Periodically ask a list of :class:`EventWriter` objects to write.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, writers, period=20):
+        """
+        Args:
+            writers (list[EventWriter]): the writers to trigger
+            period (int): number of iterations between two writes
+        """
+        for writer in writers:
+            assert isinstance(writer, EventWriter), writer
+        self._writers = writers
+        self._period = period
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        if next_iter % self._period and next_iter != self.trainer.max_iter:
+            return  # neither on a period boundary nor at the last iteration
+        for writer in self._writers:
+            writer.write()
+
+    def after_train(self):
+        for writer in self._writers:
+            writer.close()
+
+
+class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
+    """
+    Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook.
+
+    Note that when used as a hook, it cannot save any additional data
+    beyond what is defined by the given `checkpointer`;
+    :meth:`after_step` only passes the current iteration to ``step``.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def before_train(self):
+        self.max_iter = self.trainer.max_iter
+
+    def after_step(self):
+        # No way to use **kwargs
+        self.step(self.trainer.iter)
+
+
+class LRScheduler(HookBase):
+    """
+    A hook which executes a torch builtin LR scheduler after every iteration,
+    and records the LR of one representative param group in the storage.
+    """
+
+    def __init__(self, optimizer, scheduler):
+        """
+        Args:
+            optimizer (torch.optim.Optimizer):
+            scheduler (torch.optim._LRScheduler)
+        """
+        self._optimizer = optimizer
+        self._scheduler = scheduler
+
+        # NOTE: some heuristics on what LR to summarize
+        groups = optimizer.param_groups
+        most_params = max(len(group["params"]) for group in groups)
+
+        if most_params == 1:
+            # All groups have one parameter: use the most common initial LR.
+            common_lr = Counter([group["lr"] for group in groups]).most_common()[0][0]
+            matches = [group["lr"] == common_lr for group in groups]
+        else:
+            # Otherwise summarize the param group with most parameters.
+            matches = [len(group["params"]) == most_params for group in groups]
+
+        # Remember the index of the first matching group.
+        for index, matched in enumerate(matches):
+            if matched:
+                self._best_param_group_id = index
+                break
+
+    def after_step(self):
+        # Read and log the current LR first, then step the scheduler.
+        lr = self._optimizer.param_groups[self._best_param_group_id]["lr"]
+        self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
+        self._scheduler.step()
+
+
+class AutogradProfiler(HookBase):
+    """
+    A hook which runs `torch.autograd.profiler.profile`.
+
+    Examples:
+
+    .. code-block:: python
+
+        hooks.AutogradProfiler(
+             lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR
+        )
+
+    The above example will run the profiler for iterations 11 to 19 and dump
+    one trace file per profiled iteration to ``OUTPUT_DIR``. The first few
+    iterations are skipped because they are typically slower than the rest.
+    The result files can be loaded in the ``chrome://tracing`` page in chrome browser.
+
+    Note:
+        When used together with NCCL on older version of GPUs,
+        autograd profiler may cause deadlock because it unnecessarily allocates
+        memory on every device it sees. The memory management calls, if
+        interleaved with NCCL calls, lead to deadlock on GPUs that do not
+        support `cudaLaunchCooperativeKernelMultiDevice`.
+    """
+
+    def __init__(self, enable_predicate, output_dir, *, use_cuda=True):
+        """
+        Args:
+            enable_predicate (callable[trainer -> bool]): a function which takes a trainer,
+                and returns whether to enable the profiler.
+                It will be called once every step, and can be used to select which steps to profile.
+            output_dir (str): the output directory to dump tracing files.
+            use_cuda (bool): same as in `torch.autograd.profiler.profile`.
+        """
+        self._output_dir = output_dir
+        self._use_cuda = use_cuda
+        self._enable_predicate = enable_predicate
+
+    def before_step(self):
+        if not self._enable_predicate(self.trainer):
+            self._profiler = None
+            return
+        self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda)
+        self._profiler.__enter__()
+
+    def after_step(self):
+        profiler = self._profiler
+        if profiler is None:
+            return  # this step was not selected for profiling
+        profiler.__exit__(None, None, None)
+        trace_name = "profiler-trace-iter{}.json".format(self.trainer.iter)
+        out_file = os.path.join(self._output_dir, trace_name)
+        if "://" in out_file:
+            # Support non-posix filesystems: export locally, then copy via PathManager
+            with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, "tmp.json")
+                profiler.export_chrome_trace(tmp_file)
+                with open(tmp_file) as src:
+                    content = src.read()
+            with PathManager.open(out_file, "w") as dst:
+                dst.write(content)
+        else:
+            profiler.export_chrome_trace(out_file)
+
+
+class EvalHook(HookBase):
+    """
+    Run an evaluation function periodically, and at the end of training.
+
+    It is executed every ``eval_period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, eval_period, eval_function):
+        """
+        Args:
+            eval_period (int): the period to run `eval_function`. If not positive,
+                `eval_function` is only run after the last iteration.
+            eval_function (callable): a function which takes no arguments, and
+                returns a nested dict of evaluation metrics.
+
+        Note:
+            This hook must be enabled in all or none workers.
+            If you would like only certain workers to perform evaluation,
+            give other workers a no-op function (`eval_function=lambda: None`).
+        """
+        self._period = eval_period
+        self._func = eval_function
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        is_final = next_iter == self.trainer.max_iter
+        if is_final or (self._period > 0 and next_iter % self._period == 0):
+            results = self._func()
+            if results:
+                assert isinstance(
+                    results, dict
+                ), "Eval function must return a dict. Got {} instead.".format(results)
+
+                flattened_results = flatten_results_dict(results)
+                for k, v in flattened_results.items():
+                    try:
+                        float(v)  # validation only; the value is stored as returned
+                    except Exception as e:
+                        raise ValueError(
+                            "[EvalHook] eval_function should return a nested dict of float. "
+                            "Got '{}: {}' instead.".format(k, v)
+                        ) from e
+                self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
+
+            # Evaluation may take different time among workers.
+            # A barrier makes them start the next iteration together.
+            comm.synchronize()
+
+    def after_train(self):
+        # func is likely a closure that holds reference to the trainer
+        # therefore we clean it to avoid circular reference in the end
+        del self._func
+
+
+class PreciseBN(HookBase):
+    """
+    The standard implementation of BatchNorm uses EMA in inference, which is
+    sometimes suboptimal.
+    This class computes the true average of statistics rather than the moving average,
+    and puts true averages to every BN layer in the given model.
+
+    It is executed every ``period`` iterations and after the last iteration.
+    """
+
+    def __init__(self, period, model, data_loader, num_iter):
+        """
+        Args:
+            period (int): the period this hook is run, or 0 to not run during training.
+                The hook will always run in the end of training.
+            model (nn.Module): a module whose all BN layers in training mode will be
+                updated by precise BN.
+                Note that user is responsible for ensuring the BN layers to be
+                updated are in training mode when this hook is triggered.
+            data_loader (iterable): it will produce data to be run by `model(data)`.
+            num_iter (int): number of iterations used to compute the precise
+                statistics.
+        """
+        self._logger = logging.getLogger(__name__)
+        # Set before the early return: `after_step` reads `_period` even when disabled.
+        self._period = period
+        if len(get_bn_modules(model)) == 0:
+            self._logger.info(
+                "PreciseBN is disabled because model does not contain BN layers in training mode."
+            )
+            self._disabled = True
+            return
+
+        self._model = model
+        self._data_loader = data_loader
+        self._num_iter = num_iter
+        self._disabled = False
+        self._data_iter = None
+
+    def after_step(self):
+        next_iter = self.trainer.iter + 1
+        is_final = next_iter == self.trainer.max_iter
+        if is_final or (self._period > 0 and next_iter % self._period == 0):
+            self.update_stats()
+
+    def update_stats(self):
+        """
+        Update the model with precise statistics. Users can manually call this method.
+        """
+        if self._disabled:
+            return
+
+        if self._data_iter is None:
+            self._data_iter = iter(self._data_loader)
+
+        num_iter = 0
+
+        def data_loader():
+            nonlocal num_iter
+            while True:
+                num_iter += 1
+                if num_iter % 100 == 0:
+                    self._logger.info(
+                        "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter)
+                    )
+                # This way we can reuse the same iterator
+                yield next(self._data_iter)
+
+        with EventStorage():  # capture events in a new storage to discard them
+            self._logger.info(
+                "Running precise-BN for {} iterations...  ".format(self._num_iter)
+                + "Note that this could produce different statistics every time."
+            )
+            update_bn_stats(self._model, data_loader(), self._num_iter)
diff --git a/detectron2/engine/launch.py b/detectron2/engine/launch.py
new file mode 100644
index 0000000..444c57e
--- /dev/null
+++ b/detectron2/engine/launch.py
@@ -0,0 +1,84 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from detectron2.utils import comm
+
+__all__ = ["launch"]
+
+
+def _find_free_port():
+    """Ask the OS for a TCP port that is currently unused and return its number."""
+    import socket
+
+    # The context manager closes the socket even if bind()/getsockname() raises.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        # Binding to port 0 will cause the OS to find an available port for us
+        sock.bind(("", 0))
+        # NOTE: there is still a chance the port could be taken by other processes.
+        return sock.getsockname()[1]
+
+
+def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()):
+    """
+    Call `main_func(*args)`, in spawned worker processes if the world size exceeds 1.
+
+    Args:
+        main_func: a function that will be called by `main_func(*args)`
+        num_gpus_per_machine (int): the number of processes to spawn on this machine
+        num_machines (int): the total number of machines
+        machine_rank (int): the rank of this machine (one per machine)
+        dist_url (str): url to connect to for distributed training, including protocol
+                       e.g. "tcp://127.0.0.1:8686". Can be set to "auto" (one machine
+                       only) to automatically select a free port on localhost
+        args (tuple): arguments passed to main_func
+    """
+    world_size = num_machines * num_gpus_per_machine
+    if world_size <= 1:
+        # Nothing to distribute: call directly in the current process.
+        main_func(*args)
+        return
+
+    # https://github.com/pytorch/pytorch/pull/14391
+    # TODO prctl in spawned processes
+    if dist_url == "auto":
+        assert num_machines == 1, "dist_url=auto cannot work with distributed training."
+        port = _find_free_port()
+        dist_url = f"tcp://127.0.0.1:{port}"
+
+    worker_args = (main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args)
+    mp.spawn(_distributed_worker, nprocs=num_gpus_per_machine, args=worker_args, daemon=False)
+
+
+def _distributed_worker(
+    local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args
+):
+    """Per-process entry point: join the process group, bind a GPU, run `main_func(*args)`."""
+    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
+    global_rank = machine_rank * num_gpus_per_machine + local_rank
+    try:
+        dist.init_process_group(
+            backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank
+        )
+    except Exception:
+        logging.getLogger(__name__).error("Process group URL: {}".format(dist_url))
+        raise
+    # synchronize is needed here to prevent a possible timeout after calling init_process_group
+    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+    comm.synchronize()
+
+    assert num_gpus_per_machine <= torch.cuda.device_count()
+    torch.cuda.set_device(local_rank)
+
+    # Setup the local process group (which contains ranks within the same machine).
+    # Every process creates the group of every machine, but keeps only its own.
+    assert comm._LOCAL_PROCESS_GROUP is None
+    for machine in range(world_size // num_gpus_per_machine):
+        first_rank = machine * num_gpus_per_machine
+        group = dist.new_group(list(range(first_rank, first_rank + num_gpus_per_machine)))
+        if machine == machine_rank:
+            comm._LOCAL_PROCESS_GROUP = group
+
+    main_func(*args)
diff --git a/detectron2/engine/train_loop.py b/detectron2/engine/train_loop.py
new file mode 100644
index 0000000..4e429f3
--- /dev/null
+++ b/detectron2/engine/train_loop.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import time
+import weakref
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.utils.events import EventStorage
+
+__all__ = ["HookBase", "TrainerBase", "SimpleTrainer"]
+
+
+class HookBase:
+    """
+    Base class for hooks that can be registered with :class:`TrainerBase`.
+
+    A hook may override any of the 4 methods below. The following snippet shows
+    when each of them is called:
+
+    .. code-block:: python
+
+        hook.before_train()
+        for iter in range(start_iter, max_iter):
+            hook.before_step()
+            trainer.run_step()
+            hook.after_step()
+        hook.after_train()
+
+    Notes:
+        1. Inside a hook method, `self.trainer` gives access to more properties
+           of the context (e.g., the current iteration).
+
+        2. A hook that does something in :meth:`before_step` can often be
+           implemented equivalently in :meth:`after_step`.
+           The convention is that :meth:`before_step` should only take negligible
+           time; a hook that takes non-trivial time is strongly recommended to do
+           its work in :meth:`after_step` instead.
+
+           Following this convention will allow hooks that do care about the
+           difference between :meth:`before_step` and :meth:`after_step`
+           (e.g., timer) to function properly.
+
+    Attributes:
+        trainer: A weak reference to the trainer object. Set by the trainer when the hook is
+            registered.
+    """
+
+    def before_train(self):
+        """
+        Called before the first iteration.
+        The base implementation does nothing.
+        """
+
+    def after_train(self):
+        """
+        Called after the last iteration.
+        :meth:`TrainerBase.train` also calls it when training stops on an exception.
+        """
+
+    def before_step(self):
+        """
+        Called before each iteration.
+        It should only take negligible time.
+        """
+
+    def after_step(self):
+        """
+        Called after each iteration.
+        The base implementation does nothing.
+        """
+
+
+class TrainerBase:
+    """
+    Base class for iterative trainer with hooks.
+
+    All that is assumed here is that training runs in a loop; a subclass
+    decides what one step of the loop does by implementing :meth:`run_step`.
+    Nothing is assumed about the existence of dataloader, optimizer, model, etc.
+
+    Attributes:
+        iter(int): the current iteration.
+
+        start_iter(int): The iteration to start with.
+            By convention the minimum possible value is 0.
+
+        max_iter(int): The iteration to end training.
+
+        storage(EventStorage): An EventStorage that's opened during the course of training.
+    """
+
+    def __init__(self):
+        self._hooks = []
+
+    def register_hooks(self, hooks):
+        """
+        Register hooks to the trainer. The hooks are executed in the order
+        they are registered.
+
+        Args:
+            hooks (list[Optional[HookBase]]): list of hooks; `None` entries are skipped
+        """
+        accepted = []
+        # Hooks only get a weak proxy of the trainer: if hooks and trainer owned each
+        # other, the cycle would leak memory when the involved objects contain __del__.
+        # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
+        for hook in hooks:
+            if hook is not None:
+                assert isinstance(hook, HookBase)
+                hook.trainer = weakref.proxy(self)
+                accepted.append(hook)
+        self._hooks.extend(accepted)
+
+    def train(self, start_iter: int, max_iter: int):
+        """
+        Args:
+            start_iter, max_iter (int): See docs above
+        """
+        logging.getLogger(__name__).info("Starting training from iteration {}".format(start_iter))
+
+        self.iter = self.start_iter = start_iter
+        self.max_iter = max_iter
+
+        with EventStorage(start_iter) as self.storage:
+            try:
+                self.before_train()
+                for self.iter in range(start_iter, max_iter):
+                    self.before_step()
+                    self.run_step()
+                    self.after_step()
+            finally:  # hooks get their after_train call even if a step raised
+                self.after_train()
+
+    def before_train(self):
+        for hook in self._hooks:
+            hook.before_train()
+
+    def after_train(self):
+        for hook in self._hooks:
+            hook.after_train()
+
+    def before_step(self):
+        for hook in self._hooks:
+            hook.before_step()
+
+    def after_step(self):
+        for hook in self._hooks:
+            hook.after_step()
+        # this guarantees, that in each hook's after_step, storage.iter == trainer.iter
+        self.storage.step()
+
+    def run_step(self):
+        raise NotImplementedError
+
+
+class SimpleTrainer(TrainerBase):
+    """
+    A simple trainer for the most common type of task:
+    single-cost single-optimizer single-data-source iterative optimization.
+    It assumes that every step, you:
+
+    1. Compute the loss with a data from the data_loader.
+    2. Compute the gradients with the above loss.
+    3. Update the model with the optimizer.
+
+    If you want to do anything fancier than this,
+    either subclass TrainerBase and implement your own `run_step`,
+    or write your own training loop.
+    """
+
+    def __init__(self, model, data_loader, optimizer):
+        """
+        Args:
+            model: a torch Module. Takes a data from data_loader and returns a
+                dict of losses.
+            data_loader: an iterable. Contains data to be used to call model.
+            optimizer: a torch optimizer.
+        """
+        super().__init__()
+
+        # We set the model to training mode in the trainer.
+        # However it's valid to train a model that's in eval mode.
+        # If you want your model (or a submodule of it) to behave
+        # like evaluation during training, you can overwrite its train() method.
+        model.train()
+
+        self.model = model
+        self.data_loader = data_loader
+        # A single iterator is kept for the whole training; `run_step` pulls from it.
+        self._data_loader_iter = iter(data_loader)
+        self.optimizer = optimizer
+
+    def run_step(self):
+        """
+        Implement the standard training logic described above.
+
+        Raises:
+            FloatingPointError: if the total loss is infinite or NaN.
+        """
+        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
+        start = time.perf_counter()
+        # If you want to do something with the data, you can wrap the dataloader.
+        data = next(self._data_loader_iter)
+        data_time = time.perf_counter() - start
+
+        # If you want to do something with the losses, you can wrap the model.
+        loss_dict = self.model(data)
+        losses = sum(loss_dict.values())
+        self._detect_anomaly(losses, loss_dict)
+
+        # Log from a shallow copy, so that adding "data_time" does not modify the
+        # dict object returned by the model (which the model may hold on to).
+        metrics_dict = dict(loss_dict)
+        metrics_dict["data_time"] = data_time
+        self._write_metrics(metrics_dict)
+
+        # If you need accumulate gradients or something similar, you can
+        # wrap the optimizer with your custom `zero_grad()` method.
+        self.optimizer.zero_grad()
+        losses.backward()
+
+        # If you need gradient clipping/scaling or other processing, you can
+        # wrap the optimizer with your custom `step()` method.
+        self.optimizer.step()
+
+    def _detect_anomaly(self, losses, loss_dict):
+        """Raise FloatingPointError unless every element of `losses` is finite."""
+        if not torch.isfinite(losses).all():
+            raise FloatingPointError(
+                "Loss became infinite or NaN at iteration={}!\nloss_dict = {}".format(
+                    self.iter, loss_dict
+                )
+            )
+
+    def _write_metrics(self, metrics_dict: dict):
+        """
+        Gather the metrics of all workers and log them to storage on the main process.
+
+        Args:
+            metrics_dict (dict): dict of scalar metrics
+        """
+        metrics_dict = {
+            k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
+            for k, v in metrics_dict.items()
+        }
+        # gather metrics among all workers for logging
+        # This assumes we do DDP-style training, which is currently the only
+        # supported method in detectron2.
+        all_metrics_dict = comm.gather(metrics_dict)
+
+        if comm.is_main_process():
+            if "data_time" in all_metrics_dict[0]:
+                # data_time among workers can have high variance. The actual latency
+                # caused by data_time is the maximum among workers.
+                data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
+                self.storage.put_scalar("data_time", data_time)
+
+            # average the rest metrics
+            metrics_dict = {
+                k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
+            }
+            total_losses_reduced = sum(loss for loss in metrics_dict.values())
+
+            self.storage.put_scalar("total_loss", total_losses_reduced)
+            # With a single metric, "total_loss" above already equals it.
+            if len(metrics_dict) > 1:
+                self.storage.put_scalars(**metrics_dict)
diff --git a/detectron2/evaluation/__init__.py b/detectron2/evaluation/__init__.py
new file mode 100644
index 0000000..975d8f7
--- /dev/null
+++ b/detectron2/evaluation/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .cityscapes_evaluation import CityscapesEvaluator
+from .coco_evaluation import COCOEvaluator
+from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset
+from .lvis_evaluation import LVISEvaluator
+from .panoptic_evaluation import COCOPanopticEvaluator
+from .pascal_voc_evaluation import PascalVOCDetectionEvaluator
+from .sem_seg_evaluation import SemSegEvaluator
+from .testing import print_csv_format, verify_results
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/evaluation/cityscapes_evaluation.py b/detectron2/evaluation/cityscapes_evaluation.py
new file mode 100644
index 0000000..cdb6fc4
--- /dev/null
+++ b/detectron2/evaluation/cityscapes_evaluation.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import glob
+import logging
+import os
+import tempfile
+from collections import OrderedDict
+import torch
+from PIL import Image
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+
+class CityscapesEvaluator(DatasetEvaluator):
+    """
+    Evaluate instance segmentation results using cityscapes API.
+
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): the name of the dataset.
+                It must have the following metadata associated with it:
+                "thing_classes", "gt_dir".
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        """
+        Create a temporary directory and agree on one shared by all workers.
+        """
+        self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
+        own_dir = self._working_dir.name
+        # All workers will write to the same results directory (the first gathered one)
+        # TODO this does not work in distributed training
+        self._temp_dir = comm.all_gather(own_dir)[0]
+        # Ranks whose own directory was not chosen remove it right away.
+        if self._temp_dir != own_dir:
+            self._working_dir.cleanup()
+        self._logger.info(
+            "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
+        )
+
+    def process(self, inputs, outputs):
+        """
+        Write the predictions of each image to the shared temporary directory:
+        one mask PNG per instance, plus a ``<basename>_pred.txt`` that lists them.
+        """
+        from cityscapesscripts.helpers.labels import name2label
+
+        for sample, prediction in zip(inputs, outputs):
+            stem = os.path.splitext(os.path.basename(sample["file_name"]))[0]
+            instances = prediction["instances"].to(self._cpu_device)
+            with open(os.path.join(self._temp_dir, stem + "_pred.txt"), "w") as fout:
+                for i in range(len(instances)):
+                    class_name = self._metadata.thing_classes[instances.pred_classes[i]]
+                    class_id = name2label[class_name].id
+                    score = instances.scores[i]
+                    mask = instances.pred_masks[i].numpy().astype("uint8")
+                    png_name = stem + "_{}_{}.png".format(i, class_name)
+                    png_path = os.path.join(self._temp_dir, png_name)
+                    Image.fromarray(mask * 255).save(png_path)
+                    # one line per instance: <mask file name> <class id> <score>
+                    fout.write("{} {} {}\n".format(os.path.basename(png_path), class_id, score))
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "segm", whose value is a dict of "AP" and "AP50".
+            Ranks other than 0 return None.
+        """
+        # Wait for all ranks at the barrier; only rank 0 goes on to run the evaluation.
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        dataset_root = os.path.join(self._metadata.gt_dir, "..", "..")
+        os.environ["CITYSCAPES_DATASET"] = os.path.abspath(dataset_root)
+        # Load the Cityscapes eval script *after* setting the required env var,
+        # since the script reads CITYSCAPES_DATASET into global variables at load time.
+        import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval
+
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+
+        # set some global states in cityscapes evaluation API, before evaluating
+        eval_args = cityscapes_eval.args
+        eval_args.predictionPath = os.path.abspath(self._temp_dir)
+        eval_args.predictionWalk = None
+        eval_args.JSONOutput = False
+        eval_args.colorized = False
+        eval_args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")
+
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
+        gt_images = glob.glob(eval_args.groundTruthSearch)
+        assert len(
+            gt_images
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            eval_args.groundTruthSearch
+        )
+        predictions = [cityscapes_eval.getPrediction(gt, eval_args) for gt in gt_images]
+        results = cityscapes_eval.evaluateImgLists(predictions, gt_images, eval_args)["averages"]
+
+        ret = OrderedDict()
+        ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
+        self._working_dir.cleanup()
+        return ret
diff --git a/detectron2/evaluation/coco_evaluation.py b/detectron2/evaluation/coco_evaluation.py
new file mode 100644
index 0000000..d1cf9d5
--- /dev/null
+++ b/detectron2/evaluation/coco_evaluation.py
@@ -0,0 +1,477 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import numpy as np
+import os
+import pickle
+from collections import OrderedDict
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from tabulate import tabulate
+
+import detectron2.utils.comm as comm
+from detectron2.data import MetadataCatalog
+from detectron2.data.datasets.coco import convert_to_coco_json
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.logger import create_small_table
+
+from .evaluator import DatasetEvaluator
+
+
+class COCOEvaluator(DatasetEvaluator):
+    """
+    Evaluate object proposal, instance detection/segmentation, keypoint detection
+    outputs using COCO's metrics and APIs.
+    """
+
+    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have either the following corresponding metadata:
+                    "json_file": the path to the COCO format annotation
+                Or it must be in detectron2's standard dataset format
+                    so it can be converted to COCO format automatically.
+            cfg (CfgNode): config instance
+            distributed (bool): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump results.
+        """
+        self._tasks = self._tasks_from_config(cfg)
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        if not hasattr(self._metadata, "json_file"):
+            self._logger.warning(f"json_file was not found in MetaDataCatalog for '{dataset_name}'")
+
+            cache_path = convert_to_coco_json(dataset_name, output_dir)
+            self._metadata.json_file = cache_path
+
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(json_file)
+
+        self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the COCO evaluation server).
+        self._do_evaluation = "annotations" in self._coco_api.dataset
+
+    def reset(self):
+        self._predictions = []
+        self._coco_results = []
+
+    def _tasks_from_config(self, cfg):
+        """
+        Returns:
+            tuple[str]: tasks that can be evaluated under the given configuration.
+        """
+        tasks = ("bbox",)
+        if cfg.MODEL.MASK_ON:
+            tasks = tasks + ("segm",)
+        if cfg.MODEL.KEYPOINT_ON:
+            tasks = tasks + ("keypoints",)
+        return tasks
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            # TODO this is ugly
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            self._predictions = comm.gather(self._predictions, dst=0)
+            self._predictions = list(itertools.chain(*self._predictions))
+
+            if not comm.is_main_process():
+                return {}
+
+        if len(self._predictions) == 0:
+            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(self._predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in self._predictions[0]:
+            self._eval_box_proposals()
+        if "instances" in self._predictions[0]:
+            self._eval_predictions(set(self._tasks))
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _eval_predictions(self, tasks):
+        """
+        Evaluate self._predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+        # Copy each result dict so the id unmapping below leaves self._predictions untouched.
+        self._coco_results = [dict(r) for x in self._predictions for r in x["instances"]]
+        # unmap the category ids for COCO
+        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+            reverse_id_mapping = {
+                v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+            }
+            for result in self._coco_results:
+                result["category_id"] = reverse_id_mapping[result["category_id"]]
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(self._coco_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            coco_eval = (
+                _evaluate_predictions_on_coco(
+                    self._coco_api, self._coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas
+                )
+                if len(self._coco_results) > 0
+                else None  # cocoapi does not handle empty results very well
+            )
+
+            res = self._derive_coco_results(
+                coco_eval, task, class_names=self._metadata.get("thing_classes")
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self):
+        """
+        Evaluate the box proposals in self._predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in self._predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(
+                    self._predictions, self._coco_api, area=area, limit=limit
+                )
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
+        """
+        Derive the desired score numbers from summarized COCOeval.
+
+        Args:
+            coco_eval (None or COCOEval): None represents no predictions from model.
+            iou_type (str): one of "bbox", "segm", "keypoints".
+            class_names (None or list[str]): if provided, will use it to report
+                per-category AP.
+
+        Returns:
+            a dict of {metric name: score}
+        """
+
+        metrics = {
+            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
+            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
+        }[iou_type]
+
+        if coco_eval is None:
+            self._logger.warning("No predictions from the model! Set scores to -1")
+            return {metric: -1 for metric in metrics}
+
+        # the standard metrics
+        results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+        self._logger.info(
+            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
+        )
+
+        if class_names is None or len(class_names) <= 1:
+            return results
+        # Compute per-category AP
+        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
+        precisions = coco_eval.eval["precision"]
+        # precision has dims (iou, recall, cls, area range, max dets)
+        assert len(class_names) == precisions.shape[2]
+
+        results_per_category = []
+        for idx, name in enumerate(class_names):
+            # area range index 0: all area ranges
+            # max dets index -1: typically 100 per image
+            precision = precisions[:, :, idx, 0, -1]
+            precision = precision[precision > -1]
+            ap = np.mean(precision) if precision.size else float("nan")
+            results_per_category.append(("{}".format(name), float(ap * 100)))
+
+        # tabulate it
+        N_COLS = min(6, len(results_per_category) * 2)
+        results_flatten = list(itertools.chain(*results_per_category))
+        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
+        table = tabulate(
+            results_2d,
+            tablefmt="pipe",
+            floatfmt=".3f",
+            headers=["category", "AP"] * (N_COLS // 2),
+            numalign="left",
+        )
+        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
+
+        results.update({"AP-" + name: ap for name, ap in results_per_category})
+        return results
+
+
+def instances_to_coco_json(instances, img_id):
+    """
+    Dump an "Instances" object to a COCO-format json that's used for evaluation.
+
+    Args:
+        instances (Instances): needs pred_boxes, scores, pred_classes; masks/keypoints optional.
+        img_id (int): the image id
+
+    Returns:
+        list[dict]: list of json annotations in COCO format.
+    """
+    num_instance = len(instances)
+    if num_instance == 0:
+        return []
+
+    boxes = instances.pred_boxes.tensor.numpy()
+    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+    boxes = boxes.tolist()
+    scores = instances.scores.tolist()
+    classes = instances.pred_classes.tolist()
+
+    has_mask = instances.has("pred_masks")
+    if has_mask:
+        # use RLE to encode the masks, because they are too large and take a lot of memory
+        # since this evaluator stores outputs of the entire dataset
+        rles = [
+            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
+            for mask in instances.pred_masks
+        ]
+        for rle in rles:
+            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
+            # json writer which always produces strings cannot serialize a bytestream
+            # unless you decode it. Thankfully, utf-8 works out (which is also what
+            # the pycocotools/_mask.pyx does).
+            rle["counts"] = rle["counts"].decode("utf-8")
+
+    has_keypoints = instances.has("pred_keypoints")
+    if has_keypoints:
+        keypoints = instances.pred_keypoints
+
+    results = []
+    for k in range(num_instance):
+        result = {
+            "image_id": img_id,
+            "category_id": classes[k],
+            "bbox": boxes[k],
+            "score": scores[k],
+        }
+        if has_mask:
+            result["segmentation"] = rles[k]
+        if has_keypoints:
+            # In COCO annotations, keypoint coordinates are pixel indices, whereas our
+            # predictions are floating point coordinates. Therefore we subtract 0.5 to be
+            # consistent with the annotation format.
+            # This is the inverse of data loading logic in `datasets/coco.py`.
+            # NOTE(review): in-place `-=` presumably also modifies `instances.pred_keypoints`.
+            keypoints[k][:, :2] -= 0.5
+            result["keypoints"] = keypoints[k].flatten().tolist()
+        results.append(result)
+    return results
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
+    """
+    Compute proposal recall and average recall (AR) over IoU thresholds. A much faster
+    alternative to the official COCO API recall code, with slightly different results.
+
+    `area` selects a gt-area bucket; `limit` keeps only the top-scoring proposals per image.
+    Returns a dict with keys "ar", "recalls", "thresholds", "gt_overlaps", "num_pos".
+    """
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0 ** 2, 1e5 ** 2],  # all
+        [0 ** 2, 32 ** 2],  # small
+        [32 ** 2, 96 ** 2],  # medium
+        [96 ** 2, 1e5 ** 2],  # large
+        [96 ** 2, 128 ** 2],  # 96-128
+        [128 ** 2, 256 ** 2],  # 128-256
+        [256 ** 2, 512 ** 2],  # 256-512
+        [512 ** 2, 1e5 ** 2],  # 512-inf
+    ]
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
+        anno = coco_api.loadAnns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+            for obj in anno
+            if obj["iscrowd"] == 0
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+    # torch.cat rejects an empty list, which happens when no image contributed gt boxes.
+    gt_overlaps = torch.cat(gt_overlaps, dim=0) if gt_overlaps else torch.zeros(0)
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None):
+    """
+    Run COCOeval on `coco_results` for `iou_type` and return the summarized COCOeval object.
+    """
+    assert len(coco_results) > 0  # callers are expected to skip evaluation for empty results
+
+    if iou_type == "segm":
+        coco_results = copy.deepcopy(coco_results)
+        # When evaluating mask AP, if the results contain bbox, cocoapi will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field (on the copy only) to let mask AP use mask area.
+        for c in coco_results:
+            c.pop("bbox", None)
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
+    # Use the COCO default keypoint OKS sigmas unless overrides are specified
+    if kpt_oks_sigmas:
+        coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+    return coco_eval
diff --git a/detectron2/evaluation/evaluator.py b/detectron2/evaluation/evaluator.py
new file mode 100644
index 0000000..e3a3bf4
--- /dev/null
+++ b/detectron2/evaluation/evaluator.py
@@ -0,0 +1,174 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import datetime
+import logging
+import time
+from collections import OrderedDict
+from contextlib import contextmanager
+import torch
+
+from detectron2.utils.comm import is_main_process
+
+
+class DatasetEvaluator:
+    """
+    Base class for a dataset evaluator.
+
+    The function :func:`inference_on_dataset` runs the model over
+    all samples in the dataset, and has a DatasetEvaluator process the inputs/outputs.
+
+    This class will accumulate information of the inputs/outputs (by :meth:`process`),
+    and produce evaluation results in the end (by :meth:`evaluate`).
+    """
+
+    def reset(self):
+        """
+        Preparation for a new round of evaluation.
+        Should be called before starting a round of evaluation.
+        """
+        pass
+
+    def process(self, input, output):
+        """
+        Process an input/output pair.
+
+        Args:
+            input: the input that's used to call the model.
+            output: the return value of `model(input)`
+        """
+        pass
+
+    def evaluate(self):
+        """
+        Evaluate/summarize the performance, after processing all input/output pairs.
+
+        Returns:
+            dict:
+                A new evaluator class can return a dict of arbitrary format
+                as long as the user can process the results.
+                In our train_net.py, we expect the following format:
+
+                * key: the name of the task (e.g., bbox)
+                * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
+        """
+        pass
+
+
+class DatasetEvaluators(DatasetEvaluator):  # fans every call out to a list of evaluators
+    def __init__(self, evaluators):
+        """evaluators: list of DatasetEvaluator; may be empty (e.g. benchmark-only runs)."""
+        super().__init__()
+        self._evaluators = evaluators
+
+    def reset(self):
+        for evaluator in self._evaluators:
+            evaluator.reset()
+
+    def process(self, input, output):
+        for evaluator in self._evaluators:
+            evaluator.process(input, output)
+
+    def evaluate(self):
+        results = OrderedDict()
+        for evaluator in self._evaluators:
+            result = evaluator.evaluate()
+            if is_main_process() and result is not None:  # an evaluator may return None
+                for k, v in result.items():
+                    assert (
+                        k not in results
+                    ), "Different evaluators produce results with the same key {}".format(k)
+                    results[k] = v
+        return results
+
+
+def inference_on_dataset(model, data_loader, evaluator):
+    """
+    Run model on the data_loader and evaluate the metrics with evaluator.
+    The model will be used in eval mode.
+
+    Args:
+        model (nn.Module): a module which accepts an object from
+            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.
+
+            If you wish to evaluate a model in `training` mode instead, you can
+            wrap the given model and override its behavior of `.eval()` and `.train()`.
+        data_loader: an iterable object with a length.
+            The elements it generates will be the inputs to the model.
+        evaluator (DatasetEvaluator): the evaluator to run. Use
+            :class:`DatasetEvaluators([])` if you only want to benchmark, but
+            don't want to do any evaluation.
+
+    Returns:
+        The return value of `evaluator.evaluate()`
+    """
+    num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
+    logger = logging.getLogger(__name__)
+    total = len(data_loader)  # inference data loader must have a fixed length
+    logger.info("Start inference on {} images".format(total))
+    evaluator.reset()
+
+    logging_interval = 50
+    num_warmup = min(5, logging_interval - 1, total - 1)
+    start_time = time.time()
+    total_compute_time = 0
+    with inference_context(model), torch.no_grad():
+        for idx, inputs in enumerate(data_loader):
+            if idx == num_warmup:
+                start_time = time.time()
+                total_compute_time = 0
+
+            start_compute_time = time.time()
+            outputs = model(inputs)
+            if torch.cuda.is_available():  # nothing to synchronize on CPU-only runs
+                torch.cuda.synchronize()
+            total_compute_time += time.time() - start_compute_time
+            evaluator.process(inputs, outputs)
+
+            if (idx + 1) % logging_interval == 0:
+                duration = time.time() - start_time
+                seconds_per_img = duration / (idx + 1 - num_warmup)
+                eta = datetime.timedelta(
+                    seconds=int(seconds_per_img * (total - num_warmup) - duration)
+                )
+                logger.info(
+                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
+                        idx + 1, total, seconds_per_img, str(eta)
+                    )
+                )
+
+    # Measure the time only for this worker (before the synchronization barrier)
+    total_time = int(time.time() - start_time)
+    total_time_str = str(datetime.timedelta(seconds=total_time))
+    # NOTE this format is parsed by grep
+    logger.info(
+        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
+            total_time_str, total_time / (total - num_warmup), num_devices
+        )
+    )
+    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
+    logger.info(
+        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
+            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
+        )
+    )
+
+    results = evaluator.evaluate()
+    # An evaluator may return None when not in main process.
+    # Replace it by an empty dict instead to make it easier for downstream code to handle
+    if results is None:
+        results = {}
+    return results
+
+
+@contextmanager
+def inference_context(model):
+    """
+    A context where the model (a torch Module) is temporarily changed to eval mode,
+    and restored to its previous mode afterwards, even if the body raises.
+    """
+    training_mode = model.training
+    model.eval()
+    # try/finally: without it an exception in the body would leave the model in eval mode.
+    try:
+        yield
+    finally:
+        model.train(training_mode)
diff --git a/detectron2/evaluation/lvis_evaluation.py b/detectron2/evaluation/lvis_evaluation.py
new file mode 100644
index 0000000..3b4d569
--- /dev/null
+++ b/detectron2/evaluation/lvis_evaluation.py
@@ -0,0 +1,340 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import itertools
+import json
+import logging
+import os
+import pickle
+from collections import OrderedDict
+import torch
+from fvcore.common.file_io import PathManager
+
+import detectron2.utils.comm as comm
+from detectron2.data import MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+from detectron2.utils.logger import create_small_table
+
+from .coco_evaluation import instances_to_coco_json
+from .evaluator import DatasetEvaluator
+
+
+class LVISEvaluator(DatasetEvaluator):
+    """
+    Evaluate object proposal and instance detection/segmentation outputs using
+    LVIS's metrics and evaluation API.
+    """
+
+    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+                It must have the following corresponding metadata:
+                    "json_file": the path to the LVIS format annotation
+            cfg (CfgNode): config instance
+            distributed (bool): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            output_dir (str): optional, an output directory to dump results.
+        """
+        from lvis import LVIS
+
+        self._tasks = self._tasks_from_config(cfg)
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        json_file = PathManager.get_local_path(self._metadata.json_file)
+        self._lvis_api = LVIS(json_file)
+        # Test set json files do not contain annotations (evaluation must be
+        # performed using the LVIS evaluation server).
+        self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0
+
+    def reset(self):
+        self._predictions = []
+        self._lvis_results = []
+
+    def _tasks_from_config(self, cfg):
+        """
+        Returns:
+            tuple[str]: tasks that can be evaluated under the given configuration.
+        """
+        tasks = ("bbox",)
+        if cfg.MODEL.MASK_ON:
+            tasks = tasks + ("segm",)
+        return tasks
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a LVIS model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+        """
+        for input, output in zip(inputs, outputs):
+            prediction = {"image_id": input["image_id"]}
+
+            if "instances" in output:
+                instances = output["instances"].to(self._cpu_device)
+                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
+            if "proposals" in output:
+                prediction["proposals"] = output["proposals"].to(self._cpu_device)
+            self._predictions.append(prediction)
+
+    def evaluate(self):
+        if self._distributed:
+            comm.synchronize()
+            self._predictions = comm.gather(self._predictions, dst=0)
+            self._predictions = list(itertools.chain(*self._predictions))
+
+            if not comm.is_main_process():
+                return {}
+
+        if len(self._predictions) == 0:
+            self._logger.warning("[LVISEvaluator] Did not receive valid predictions.")
+            return {}
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(self._predictions, f)
+
+        self._results = OrderedDict()
+        if "proposals" in self._predictions[0]:
+            self._eval_box_proposals()
+        if "instances" in self._predictions[0]:
+            self._eval_predictions(set(self._tasks))
+        # Copy so the caller can do whatever with results
+        return copy.deepcopy(self._results)
+
+    def _eval_predictions(self, tasks):
+        """
+        Evaluate self._predictions on the given tasks.
+        Fill self._results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results in the LVIS format ...")
+        # Copy each result dict so the id shift below leaves self._predictions untouched.
+        self._lvis_results = [dict(r) for x in self._predictions for r in x["instances"]]
+        # unmap the category ids for LVIS (from 0-indexed to 1-indexed)
+        for result in self._lvis_results:
+            result["category_id"] += 1
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "lvis_instances_results.json")
+            self._logger.info("Saving results to {}".format(file_path))
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(self._lvis_results))
+                f.flush()
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating predictions ...")
+        for task in sorted(tasks):
+            res = _evaluate_predictions_on_lvis(
+                self._lvis_api,
+                self._lvis_results,
+                task,
+                class_names=self._metadata.get("thing_classes"),
+            )
+            self._results[task] = res
+
+    def _eval_box_proposals(self):
+        """
+        Evaluate the box proposals in self._predictions.
+        Fill self._results with the metrics for "box_proposals" task.
+        """
+        if self._output_dir:
+            # Saving generated box proposals to file.
+            # Predicted box_proposals are in XYXY_ABS mode.
+            bbox_mode = BoxMode.XYXY_ABS.value
+            ids, boxes, objectness_logits = [], [], []
+            for prediction in self._predictions:
+                ids.append(prediction["image_id"])
+                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
+                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
+
+            proposal_data = {
+                "boxes": boxes,
+                "objectness_logits": objectness_logits,
+                "ids": ids,
+                "bbox_mode": bbox_mode,
+            }
+            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
+                pickle.dump(proposal_data, f)
+
+        if not self._do_evaluation:
+            self._logger.info("Annotations are not available for evaluation.")
+            return
+
+        self._logger.info("Evaluating bbox proposals ...")
+        res = {}
+        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
+        for limit in [100, 1000]:
+            for area, suffix in areas.items():
+                stats = _evaluate_box_proposals(
+                    self._predictions, self._lvis_api, area=area, limit=limit
+                )
+                key = "AR{}@{:d}".format(suffix, limit)
+                res[key] = float(stats["ar"].item() * 100)
+        self._logger.info("Proposal metrics: \n" + create_small_table(res))
+        self._results["box_proposals"] = res
+
+
+# inspired from Detectron:
+# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
+def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None):
+    """
+    Evaluate detection proposal recall metrics. This function is a much
+    faster alternative to the official LVIS API recall evaluation code. However,
+    it produces slightly different results.
+
+    Args:
+        dataset_predictions (list[dict]): one dict per image holding an
+            "image_id" and a "proposals" entry that exposes
+            `objectness_logits` and `proposal_boxes`.
+        lvis_api: ground-truth handle; `get_ann_ids` and `load_anns` are called
+            on it to fetch the annotations of each image.
+        thresholds (Tensor or None): IoU thresholds at which recall is
+            computed. Defaults to 0.5, 0.55, ..., 0.95.
+        area (str): name of the ground-truth area range to evaluate on; must be
+            a key of `areas` below.
+        limit (int or None): keep only the `limit` highest-scoring proposals of
+            each image. None keeps all of them.
+
+    Returns:
+        dict: with keys "ar" (mean of the recalls), "recalls" (one value per
+        threshold), "thresholds", "gt_overlaps" (sorted IoU recorded for every
+        counted ground-truth box, 0 where no proposal was matched) and
+        "num_pos" (number of counted ground-truth boxes). When "num_pos" is 0
+        the recalls, and therefore "ar", are NaN.
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {
+        "all": 0,
+        "small": 1,
+        "medium": 2,
+        "large": 3,
+        "96-128": 4,
+        "128-256": 5,
+        "256-512": 6,
+        "512-inf": 7,
+    }
+    area_ranges = [
+        [0 ** 2, 1e5 ** 2],  # all
+        [0 ** 2, 32 ** 2],  # small
+        [32 ** 2, 96 ** 2],  # medium
+        [96 ** 2, 1e5 ** 2],  # large
+        [96 ** 2, 128 ** 2],  # 96-128
+        [128 ** 2, 256 ** 2],  # 128-256
+        [256 ** 2, 512 ** 2],  # 256-512
+        [512 ** 2, 1e5 ** 2],  # 512-inf
+    ]
+    assert area in areas, "Unknown area range: {}".format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = []
+    num_pos = 0
+
+    for prediction_dict in dataset_predictions:
+        predictions = prediction_dict["proposals"]
+
+        # sort predictions in descending order
+        # TODO maybe remove this and make it explicit in the documentation
+        inds = predictions.objectness_logits.sort(descending=True)[1]
+        predictions = predictions[inds]
+
+        ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]])
+        anno = lvis_api.load_anns(ann_ids)
+        gt_boxes = [
+            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno
+        ]
+        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
+        gt_boxes = Boxes(gt_boxes)
+        gt_areas = torch.as_tensor([obj["area"] for obj in anno])
+
+        if len(gt_boxes) == 0 or len(predictions) == 0:
+            continue
+
+        # keep only the ground-truth boxes whose area falls in the requested range
+        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
+        gt_boxes = gt_boxes[valid_gt_inds]
+
+        num_pos += len(gt_boxes)
+
+        if len(gt_boxes) == 0:
+            continue
+
+        if limit is not None and len(predictions) > limit:
+            predictions = predictions[:limit]
+
+        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
+
+        # Greedy one-to-one matching: repeatedly take the best remaining
+        # (proposal, gt) pair. Entries that are never filled stay at 0.
+        _gt_overlaps = torch.zeros(len(gt_boxes))
+        for j in range(min(len(predictions), len(gt_boxes))):
+            # find which proposal box maximally covers each gt box
+            # and get the iou amount of coverage for each gt box
+            max_overlaps, argmax_overlaps = overlaps.max(dim=0)
+
+            # find which gt box is 'best' covered (i.e. 'best' = most iou)
+            gt_ovr, gt_ind = max_overlaps.max(dim=0)
+            assert gt_ovr >= 0
+            # find the proposal box that covers the best covered gt box
+            box_ind = argmax_overlaps[gt_ind]
+            # record the iou coverage of this gt box
+            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+            assert _gt_overlaps[j] == gt_ovr
+            # mark the proposal box and the gt box as used
+            overlaps[box_ind, :] = -1
+            overlaps[:, gt_ind] = -1
+
+        # append recorded iou coverage level
+        gt_overlaps.append(_gt_overlaps)
+
+    # torch.cat() raises on an empty list, which happens when no image
+    # contributed any ground-truth box; fall back to an empty tensor instead.
+    gt_overlaps = (
+        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
+    )
+    gt_overlaps, _ = torch.sort(gt_overlaps)
+
+    if thresholds is None:
+        step = 0.05
+        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
+    recalls = torch.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {
+        "ar": ar,
+        "recalls": recalls,
+        "thresholds": thresholds,
+        "gt_overlaps": gt_overlaps,
+        "num_pos": num_pos,
+    }
+
+
+def _evaluate_predictions_on_lvis(lvis_gt, lvis_results, iou_type, class_names=None):
+    """
+    Run the LVIS evaluation for one task and collect its summary metrics.
+
+    Args:
+        lvis_gt: ground-truth object passed on to `LVISResults` and `LVISEval`.
+        lvis_results (list[dict]): predictions in the LVIS results format.
+        iou_type (str): "bbox" or "segm". Any other value raises KeyError.
+        class_names (None or list[str]): accepted for interface compatibility;
+            it is not used by this function.
+
+    Returns:
+        a dict of {metric name: score}. Every score is -1 when `lvis_results`
+        is empty; otherwise scores are the LVIS results multiplied by 100.
+    """
+    summary_metrics = ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"]
+    metrics = {
+        "bbox": summary_metrics,
+        "segm": summary_metrics,
+    }[iou_type]
+
+    logger = logging.getLogger(__name__)
+
+    if len(lvis_results) == 0:  # TODO: check if needed
+        # Logger.warn() is deprecated; Logger.warning() is the supported spelling.
+        logger.warning("No predictions from the model! Set scores to -1")
+        return {metric: -1 for metric in metrics}
+
+    if iou_type == "segm":
+        # Work on a copy so the caller's result dicts keep their "bbox" field.
+        lvis_results = copy.deepcopy(lvis_results)
+        # When evaluating mask AP, if the results contain bbox, LVIS API will
+        # use the box area as the area of the instance, instead of the mask area.
+        # This leads to a different definition of small/medium/large.
+        # We remove the bbox field to let mask AP use mask area.
+        for c in lvis_results:
+            c.pop("bbox", None)
+
+    # Imported lazily so the module can be loaded without the lvis package.
+    from lvis import LVISEval, LVISResults
+
+    lvis_results = LVISResults(lvis_gt, lvis_results)
+    lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type)
+    lvis_eval.run()
+    lvis_eval.print_results()
+
+    # Pull the standard metrics from the LVIS results
+    results = lvis_eval.get_results()
+    results = {metric: float(results[metric] * 100) for metric in metrics}
+    logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results))
+    return results
diff --git a/detectron2/evaluation/panoptic_evaluation.py b/detectron2/evaluation/panoptic_evaluation.py
new file mode 100644
index 0000000..b76e98c
--- /dev/null
+++ b/detectron2/evaluation/panoptic_evaluation.py
@@ -0,0 +1,166 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import contextlib
+import io
+import itertools
+import json
+import logging
+import os
+import tempfile
+from collections import OrderedDict
+from fvcore.common.file_io import PathManager
+from PIL import Image
+from tabulate import tabulate
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class COCOPanopticEvaluator(DatasetEvaluator):
+    """
+    Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
+    It saves panoptic segmentation prediction in `output_dir`
+
+    It contains a synchronize call and has to be called from all workers.
+    """
+
+    def __init__(self, dataset_name, output_dir):
+        """
+        Args:
+            dataset_name (str): name of the dataset
+            output_dir (str): output directory to save results for evaluation
+        """
+        self._metadata = MetadataCatalog.get(dataset_name)
+        # Invert the metadata mappings so that the contiguous ids found in
+        # predictions can be mapped back to dataset category ids.
+        self._thing_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
+        }
+        self._stuff_contiguous_id_to_dataset_id = {
+            v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
+        }
+
+        # JSON file that evaluate() writes the merged predictions to.
+        self._predictions_json = os.path.join(output_dir, "predictions.json")
+
+    def reset(self):
+        """Discard all predictions accumulated so far."""
+        self._predictions = []
+
+    def _convert_category_id(self, segment_info):
+        """
+        Map the contiguous "category_id" of a segment back to its dataset id.
+
+        Args:
+            segment_info (dict): segment description. It is modified in place:
+                the "isthing" key is removed and "category_id" is rewritten.
+
+        Returns:
+            dict: the same `segment_info` object.
+        """
+        isthing = segment_info.pop("isthing", None)
+        if isthing is None:
+            # the model produces panoptic category id directly. No more conversion needed
+            return segment_info
+        # Only the literal True selects the "thing" mapping; every other
+        # non-None value falls through to the "stuff" mapping.
+        if isthing is True:
+            segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        else:
+            segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
+                segment_info["category_id"]
+            ]
+        return segment_info
+
+    def process(self, inputs, outputs):
+        """
+        Encode each panoptic prediction as PNG bytes and queue it for evaluate().
+
+        Args:
+            inputs: iterable of dicts; "file_name" and "image_id" are read.
+            outputs: iterable of dicts; "panoptic_seg" is read and unpacked
+                into a (panoptic image, segments info) pair.
+        """
+        # Imported lazily so panopticapi is only needed when this evaluator runs.
+        from panopticapi.utils import id2rgb
+
+        for input, output in zip(inputs, outputs):
+            panoptic_img, segments_info = output["panoptic_seg"]
+            panoptic_img = panoptic_img.cpu().numpy()
+
+            # The prediction is stored under the input's base name with a .png extension.
+            file_name = os.path.basename(input["file_name"])
+            file_name_png = os.path.splitext(file_name)[0] + ".png"
+            # Keep the PNG in memory; evaluate() writes it to disk later.
+            with io.BytesIO() as out:
+                Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
+                segments_info = [self._convert_category_id(x) for x in segments_info]
+                self._predictions.append(
+                    {
+                        "image_id": input["image_id"],
+                        "file_name": file_name_png,
+                        "png_string": out.getvalue(),
+                        "segments_info": segments_info,
+                    }
+                )
+
+    def evaluate(self):
+        """
+        Gather the predictions of all processes and compute panoptic quality.
+
+        Returns:
+            OrderedDict: {"panoptic_seg": {metric name: score}} with PQ, SQ and
+            RQ (times 100) for all, thing ("_th") and stuff ("_st") categories.
+            Processes other than the main one return None.
+        """
+        comm.synchronize()
+
+        # Merge the per-process prediction lists into one flat list.
+        self._predictions = comm.gather(self._predictions)
+        self._predictions = list(itertools.chain(*self._predictions))
+        if not comm.is_main_process():
+            return
+
+        gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
+        gt_folder = self._metadata.panoptic_root
+
+        # The PNG files only need to exist while pq_compute runs, so they are
+        # written into a temporary directory.
+        with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
+            logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
+            for p in self._predictions:
+                with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
+                    # pop() removes the raw bytes so the remaining dict can be
+                    # serialized to JSON below.
+                    f.write(p.pop("png_string"))
+
+            # Reuse the ground-truth JSON as a template and replace its
+            # annotations with the predictions.
+            with open(gt_json, "r") as f:
+                json_data = json.load(f)
+            json_data["annotations"] = self._predictions
+            with PathManager.open(self._predictions_json, "w") as f:
+                f.write(json.dumps(json_data))
+
+            from panopticapi.evaluation import pq_compute
+
+            # Discard anything pq_compute prints to stdout.
+            with contextlib.redirect_stdout(io.StringIO()):
+                pq_res = pq_compute(
+                    gt_json,
+                    PathManager.get_local_path(self._predictions_json),
+                    gt_folder=gt_folder,
+                    pred_folder=pred_dir,
+                )
+
+        # Report every metric as a percentage.
+        res = {}
+        res["PQ"] = 100 * pq_res["All"]["pq"]
+        res["SQ"] = 100 * pq_res["All"]["sq"]
+        res["RQ"] = 100 * pq_res["All"]["rq"]
+        res["PQ_th"] = 100 * pq_res["Things"]["pq"]
+        res["SQ_th"] = 100 * pq_res["Things"]["sq"]
+        res["RQ_th"] = 100 * pq_res["Things"]["rq"]
+        res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
+        res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
+        res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]
+
+        results = OrderedDict({"panoptic_seg": res})
+        _print_panoptic_results(pq_res)
+
+        return results
+
+
+def _print_panoptic_results(pq_res):
+    """
+    Log a table with PQ, SQ and RQ (as percentages) and the category count of
+    the "All", "Things" and "Stuff" groups.
+
+    Args:
+        pq_res (dict): maps each group name to a dict with the keys "pq", "sq",
+            "rq" and "n".
+    """
+    column_names = ["", "PQ", "SQ", "RQ", "#categories"]
+    rows = []
+    for group in ("All", "Things", "Stuff"):
+        percentages = [pq_res[group][key] * 100 for key in ("pq", "sq", "rq")]
+        rows.append([group, *percentages, pq_res[group]["n"]])
+    rendered = tabulate(
+        rows,
+        headers=column_names,
+        tablefmt="pipe",
+        floatfmt=".3f",
+        stralign="center",
+        numalign="center",
+    )
+    logger.info("Panoptic Evaluation Results:\n" + rendered)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: compute panoptic quality for prediction files
+    # that already exist on disk and log the summary table.
+    from detectron2.utils.logger import setup_logger
+
+    # Rebinds the module-level `logger` that _print_panoptic_results() logs through.
+    logger = setup_logger()
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gt-json")
+    parser.add_argument("--gt-dir")
+    parser.add_argument("--pred-json")
+    parser.add_argument("--pred-dir")
+    args = parser.parse_args()
+
+    from panopticapi.evaluation import pq_compute
+
+    # Anything written to sys.stdout inside this block is captured and discarded.
+    with contextlib.redirect_stdout(io.StringIO()):
+        pq_res = pq_compute(
+            args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
+        )
+        _print_panoptic_results(pq_res)
diff --git a/detectron2/evaluation/pascal_voc_evaluation.py b/detectron2/evaluation/pascal_voc_evaluation.py
new file mode 100644
index 0000000..c558882
--- /dev/null
+++ b/detectron2/evaluation/pascal_voc_evaluation.py
@@ -0,0 +1,292 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import os
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import OrderedDict, defaultdict
+from functools import lru_cache
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+
+from .evaluator import DatasetEvaluator
+
+
+class PascalVOCDetectionEvaluator(DatasetEvaluator):
+    """
+    Compute Pascal VOC average precision for box detections.
+
+    evaluate() gathers the predictions of all processes, therefore it has to be
+    called from all ranks.
+
+    This is a rewrite of the official Matlab API: the results should be
+    similar, but not identical, to the ones produced by the official API.
+    """
+
+    def __init__(self, dataset_name):
+        """
+        Args:
+            dataset_name (str): name of the dataset, e.g., "voc_2007_test"
+        """
+        self._dataset_name = dataset_name
+        metadata = MetadataCatalog.get(dataset_name)
+        voc_root = metadata.dirname
+        self._anno_file_template = os.path.join(voc_root, "Annotations", "{}.xml")
+        self._image_set_path = os.path.join(
+            voc_root, "ImageSets", "Main", metadata.split + ".txt"
+        )
+        self._class_names = metadata.thing_classes
+        year = metadata.year
+        assert year in [2007, 2012], year
+        self._is_2007 = year == 2007
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+    def reset(self):
+        """Start a fresh accumulation of predictions."""
+        # class id -> list of prediction strings
+        self._predictions = defaultdict(list)
+
+    def process(self, inputs, outputs):
+        """
+        Record every detected box as one text line, grouped by predicted class.
+
+        Args:
+            inputs: iterable of dicts; "image_id" is read.
+            outputs: iterable of dicts; "instances" is read and must provide
+                `pred_boxes`, `scores` and `pred_classes`.
+        """
+        for sample, result in zip(inputs, outputs):
+            image_id = sample["image_id"]
+            instances = result["instances"].to(self._cpu_device)
+            box_array = instances.pred_boxes.tensor.numpy()
+            score_list = instances.scores.tolist()
+            class_list = instances.pred_classes.tolist()
+            for (xmin, ymin, xmax, ymax), score, cls in zip(box_array, score_list, class_list):
+                # The inverse of data loading logic in `datasets/pascal_voc.py`
+                xmin, ymin = xmin + 1, ymin + 1
+                self._predictions[cls].append(
+                    f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}"
+                )
+
+    def evaluate(self):
+        """
+        Returns:
+            dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
+            Processes other than the main one return None.
+        """
+        gathered = comm.gather(self._predictions, dst=0)
+        if not comm.is_main_process():
+            return
+
+        # Merge the per-rank {class id: lines} dicts into a single one.
+        merged = defaultdict(list)
+        for rank_predictions in gathered:
+            for class_id, det_lines in rank_predictions.items():
+                merged[class_id].extend(det_lines)
+        del gathered
+
+        metric_year = 2007 if self._is_2007 else 2012
+        self._logger.info(
+            "Evaluating {} using {} metric. "
+            "Note that results do not use the official Matlab API.".format(
+                self._dataset_name, metric_year
+            )
+        )
+
+        # voc_eval() reads detections from one text file per class.
+        with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as workdir:
+            det_file_template = os.path.join(workdir, "{}.txt")
+
+            ap_by_iou = defaultdict(list)  # iou (in percent) -> ap per class
+            for class_id, class_name in enumerate(self._class_names):
+                det_lines = merged.get(class_id, [""])
+
+                with open(det_file_template.format(class_name), "w") as det_file:
+                    det_file.write("\n".join(det_lines))
+
+                for iou_percent in range(50, 100, 5):
+                    _, _, ap = voc_eval(
+                        det_file_template,
+                        self._anno_file_template,
+                        self._image_set_path,
+                        class_name,
+                        ovthresh=iou_percent / 100.0,
+                        use_07_metric=self._is_2007,
+                    )
+                    ap_by_iou[iou_percent].append(ap * 100)
+
+        mean_ap = {iou_percent: np.mean(values) for iou_percent, values in ap_by_iou.items()}
+        ret = OrderedDict()
+        ret["bbox"] = {
+            "AP": np.mean(list(mean_ap.values())),
+            "AP50": mean_ap[50],
+            "AP75": mean_ap[75],
+        }
+        return ret
+
+
+##############################################################################
+#
+# Below code is modified from
+# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+"""Python implementation of the PASCAL VOC devkit's AP evaluation code."""
+
+
+@lru_cache(maxsize=None)
+def parse_rec(filename):
+    """Parse a PASCAL VOC xml file into a list with one dict per object.
+
+    Every dict has the keys "name", "pose", "truncated", "difficult" and
+    "bbox" ([xmin, ymin, xmax, ymax] as ints). Results are cached per filename.
+    """
+
+    def int_of(node, tag):
+        return int(node.find(tag).text)
+
+    records = []
+    for node in ET.parse(filename).findall("object"):
+        record = {
+            "name": node.find("name").text,
+            "pose": node.find("pose").text,
+            "truncated": int_of(node, "truncated"),
+            "difficult": int_of(node, "difficult"),
+        }
+        box_node = node.find("bndbox")
+        record["bbox"] = [int_of(box_node, tag) for tag in ("xmin", "ymin", "xmax", "ymax")]
+        records.append(record)
+
+    return records
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """Compute VOC AP from recall and precision arrays of equal length.
+
+    With use_07_metric set, the VOC 07 11-point method is used. Otherwise
+    (the default) AP is the area under the precision envelope.
+    """
+    if use_07_metric:
+        # 11 point metric: average the best precision reached at recall >= t
+        ap = 0.0
+        for t in np.arange(0.0, 1.1, 0.1):
+            reached = rec >= t
+            p = 0 if np.sum(reached) == 0 else np.max(prec[reached])
+            ap = ap + p / 11.0
+        return ap
+
+    # correct AP calculation
+    # pad both curves with sentinel values at each end
+    padded_rec = np.concatenate(([0.0], rec, [1.0]))
+    padded_prec = np.concatenate(([0.0], prec, [0.0]))
+
+    # precision envelope: walking right to left, each entry becomes the
+    # maximum of itself and everything to its right
+    for right in reversed(range(1, padded_prec.size)):
+        padded_prec[right - 1] = np.maximum(padded_prec[right - 1], padded_prec[right])
+
+    # indices where recall changes value
+    steps = np.where(padded_rec[1:] != padded_rec[:-1])[0]
+
+    # sum (\Delta recall) * prec over those steps
+    widths = padded_rec[steps + 1] - padded_rec[steps]
+    return np.sum(widths * padded_prec[steps + 1])
+
+
+def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False):
+    """rec, prec, ap = voc_eval(detpath,
+                                annopath,
+                                imagesetfile,
+                                classname,
+                                [ovthresh],
+                                [use_07_metric])
+
+    Top level function that does the PASCAL VOC evaluation.
+
+    Args:
+        detpath (str): Path to detections;
+            detpath.format(classname) should produce the detection results file.
+            Each line holds "image_id confidence xmin ymin xmax ymax" separated
+            by single spaces.
+        annopath (str): Path to annotations;
+            annopath.format(imagename) should be the xml annotations file.
+        imagesetfile (str): Text file containing the list of images, one image per line.
+        classname (str): Category name.
+        ovthresh (float): Overlap threshold (default = 0.5). A detection only
+            matches when its best IoU is strictly greater than this value.
+        use_07_metric (bool): Whether to use VOC07's 11 point AP computation
+            (default False)
+
+    Returns:
+        tuple: (rec, prec, ap) where `rec` and `prec` are the cumulative recall
+        and precision over the detections sorted by decreasing confidence, and
+        `ap` is the value computed by `voc_ap` from them.
+    """
+    # assumes detections are in detpath.format(classname)
+    # assumes annotations are in annopath.format(imagename)
+    # assumes imagesetfile is a text file with each line an image name
+
+    # first load gt
+    # read list of images
+    with open(imagesetfile, "r") as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    # load annots
+    recs = {}
+    for imagename in imagenames:
+        recs[imagename] = parse_rec(annopath.format(imagename))
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj["name"] == classname]
+        bbox = np.array([x["bbox"] for x in R])
+        # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
+        # selects the same dtype.
+        difficult = np.array([x["difficult"] for x in R]).astype(bool)
+        det = [False] * len(R)
+        # "difficult" objects are not counted as positives
+        npos = npos + int(np.sum(~difficult))
+        class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, "r") as f:
+        lines = f.readlines()
+
+    splitlines = [x.strip().split(" ") for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    # reshape keeps BB two-dimensional even when there are no detections
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4)
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R["bbox"].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection (the +1.0 treats coordinates as inclusive pixel indices)
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+            ih = np.maximum(iymax - iymin + 1.0, 0.0)
+            inters = iw * ih
+
+            # union
+            uni = (
+                (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+                + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+                - inters
+            )
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        # `jmax` is only read when ovmax > ovthresh, which requires BBGT.size > 0
+        if ovmax > ovthresh:
+            # a match with a "difficult" object counts as neither TP nor FP
+            if not R["difficult"][jmax]:
+                if not R["det"][jmax]:
+                    tp[d] = 1.0
+                    R["det"][jmax] = 1
+                else:
+                    # this ground-truth box was already claimed by a
+                    # higher-confidence detection
+                    fp[d] = 1.0
+        else:
+            fp[d] = 1.0
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/detectron2/evaluation/sem_seg_evaluation.py b/detectron2/evaluation/sem_seg_evaluation.py
new file mode 100644
index 0000000..888e967
--- /dev/null
+++ b/detectron2/evaluation/sem_seg_evaluation.py
@@ -0,0 +1,163 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import json
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+import PIL.Image as Image
+import pycocotools.mask as mask_util
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.utils.comm import all_gather, is_main_process, synchronize
+
+from .evaluator import DatasetEvaluator
+
+
+class SemSegEvaluator(DatasetEvaluator):
+    """
+    Evaluate semantic segmentation
+    """
+
+    def __init__(self, dataset_name, distributed, num_classes, ignore_label=255, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to be evaluated.
+            distributed (bool): if True, will collect results from all ranks for evaluation.
+                Otherwise, will evaluate the results in the current process.
+            num_classes (int): number of classes
+            ignore_label (int): value in semantic segmentation ground truth. Predictions for the
+                corresponding pixels should be ignored.
+            output_dir (str): an output directory to dump results.
+        """
+        self._dataset_name = dataset_name
+        self._distributed = distributed
+        self._output_dir = output_dir
+        self._num_classes = num_classes
+        self._ignore_label = ignore_label
+        # One extra confusion-matrix row/column collects the ignored pixels.
+        self._N = num_classes + 1
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+
+        # Maps each input image file to its ground-truth segmentation file.
+        self.input_file_to_gt_file = {
+            dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
+            for dataset_record in DatasetCatalog.get(dataset_name)
+        }
+
+        meta = MetadataCatalog.get(dataset_name)
+        # Dict that maps contiguous training ids to COCO category ids
+        try:
+            c2d = meta.stuff_dataset_id_to_contiguous_id
+            self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
+        except AttributeError:
+            # No mapping in the metadata: contiguous ids are used as they are.
+            self._contiguous_id_to_dataset_id = None
+
+    def reset(self):
+        """Clear the confusion matrix and the list of encoded predictions."""
+        self._conf_matrix = np.zeros((self._N, self._N), dtype=np.int64)
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a model.
+                It is a list of dicts. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name".
+            outputs: the outputs of a model. It is a list of dicts with key "sem_seg" whose
+                value is reduced with argmax over dim 0 to get the predicted label of each
+                pixel.
+        """
+        for input, output in zip(inputs, outputs):
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
+            # The `np.int` alias was removed in NumPy 1.24; the builtin `int`
+            # selects the same dtype.
+            pred = np.array(output, dtype=int)
+            with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f:
+                gt = np.array(Image.open(f), dtype=int)
+
+            # Send ignored pixels to the extra last column of the matrix.
+            gt[gt == self._ignore_label] = self._num_classes
+
+            # Rows are predicted labels and columns are ground-truth labels:
+            # each (pred, gt) pair is flattened to the index N * pred + gt.
+            self._conf_matrix += np.bincount(
+                self._N * pred.reshape(-1) + gt.reshape(-1), minlength=self._N ** 2
+            ).reshape(self._N, self._N)
+
+            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))
+
+    def evaluate(self):
+        """
+        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):
+
+        * Mean intersection-over-union averaged across classes (mIoU)
+        * Frequency Weighted IoU (fwIoU)
+        * Mean pixel accuracy averaged across classes (mACC)
+        * Pixel Accuracy (pACC)
+
+        Returns:
+            OrderedDict: {"sem_seg": {metric name: score in percent}}. In
+            distributed mode, processes other than the main one return None.
+        """
+        if self._distributed:
+            synchronize()
+            conf_matrix_list = all_gather(self._conf_matrix)
+            self._predictions = all_gather(self._predictions)
+            self._predictions = list(itertools.chain(*self._predictions))
+            if not is_main_process():
+                return
+
+            # Sum the confusion matrices of all ranks.
+            self._conf_matrix = np.zeros_like(self._conf_matrix)
+            for conf_matrix in conf_matrix_list:
+                self._conf_matrix += conf_matrix
+
+        if self._output_dir:
+            PathManager.mkdirs(self._output_dir)
+            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
+            with PathManager.open(file_path, "w") as f:
+                f.write(json.dumps(self._predictions))
+
+        # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
+        # selects the same dtype.
+        acc = np.zeros(self._num_classes, dtype=float)
+        iou = np.zeros(self._num_classes, dtype=float)
+        # [:-1] drops the extra row/column that collects the ignored pixels.
+        tp = self._conf_matrix.diagonal()[:-1].astype(float)
+        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
+        class_weights = pos_gt / np.sum(pos_gt)
+        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
+        acc_valid = pos_gt > 0
+        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
+        iou_valid = (pos_gt + pos_pred) > 0
+        union = pos_gt + pos_pred - tp
+        # IoU is only filled in for classes present in the ground truth. Classes
+        # that were predicted but never appear in the ground truth keep an IoU
+        # of 0 and still count in the mIoU denominator below.
+        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
+        macc = np.sum(acc) / np.sum(acc_valid)
+        miou = np.sum(iou) / np.sum(iou_valid)
+        fiou = np.sum(iou * class_weights)
+        pacc = np.sum(tp) / np.sum(pos_gt)
+
+        res = {}
+        res["mIoU"] = 100 * miou
+        res["fwIoU"] = 100 * fiou
+        res["mACC"] = 100 * macc
+        res["pACC"] = 100 * pacc
+
+        if self._output_dir:
+            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
+            with PathManager.open(file_path, "wb") as f:
+                torch.save(res, f)
+        results = OrderedDict({"sem_seg": res})
+        self._logger.info(results)
+        return results
+
+    def encode_json_sem_seg(self, sem_seg, input_file_name):
+        """
+        Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
+        See http://cocodataset.org/#format-results
+
+        Args:
+            sem_seg (ndarray): 2D array of predicted labels.
+            input_file_name (str): stored as "file_name" in every returned entry.
+
+        Returns:
+            list[dict]: one entry per distinct label in `sem_seg`, with the keys
+            "file_name", "category_id" and "segmentation".
+        """
+        json_list = []
+        for label in np.unique(sem_seg):
+            if self._contiguous_id_to_dataset_id is not None:
+                assert (
+                    label in self._contiguous_id_to_dataset_id
+                ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
+                dataset_id = self._contiguous_id_to_dataset_id[label]
+            else:
+                dataset_id = int(label)
+            mask = (sem_seg == label).astype(np.uint8)
+            mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
+            # The RLE counts are bytes; decode them so the entry is JSON serializable.
+            mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
+            json_list.append(
+                {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
+            )
+        return json_list
diff --git a/detectron2/evaluation/testing.py b/detectron2/evaluation/testing.py
new file mode 100644
index 0000000..ae4310f
--- /dev/null
+++ b/detectron2/evaluation/testing.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import pprint
+import sys
+from collections import OrderedDict
+from collections.abc import Mapping
+
+
+def print_csv_format(results):
+    """
+    Log the main metrics of every task as comma-separated lines, in a format
+    similar to Detectron, so they can be copy-pasted into a spreadsheet.
+
+    For each task three "copypaste: " lines are logged at INFO level:
+    the task name, the metric names, and the scores with 4 decimals.
+
+    Args:
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+    """
+    assert isinstance(results, OrderedDict), results  # unordered results cannot be properly printed
+    logger = logging.getLogger(__name__)
+    for task, res in results.items():
+        # Metrics with "-" in their name ("AP-category" style) are usually not
+        # tracked, so they are left out of the summary.
+        kept = [item for item in res.items() if "-" not in item[0]]
+        logger.info("copypaste: Task: {}".format(task))
+        logger.info("copypaste: " + ",".join(name for name, _ in kept))
+        logger.info("copypaste: " + ",".join("{0:.4f}".format(score) for _, score in kept))
+
+
+def verify_results(cfg, results):
+    """
+    Check `results` against the reference values in `cfg.TEST.EXPECTED_RESULTS`.
+
+    Every reference entry is a `(task, metric, expected, tolerance)` tuple. An
+    entry fails when `results[task][metric]` is not finite or differs from
+    `expected` by more than `tolerance`.
+
+    Args:
+        cfg: config object exposing `cfg.TEST.EXPECTED_RESULTS`.
+        results (OrderedDict[dict]): task_name -> {metric -> score}
+
+    Returns:
+        bool: True when there is nothing to verify or every entry passes.
+        On any failure the details are logged and `sys.exit(1)` is called,
+        so the function does not return normally in that case.
+    """
+    expected_results = cfg.TEST.EXPECTED_RESULTS
+    if len(expected_results) == 0:
+        return True
+
+    failed = False
+    for task, metric, expected, tolerance in expected_results:
+        actual = results[task][metric]
+        # A NaN difference never compares greater than the tolerance,
+        # hence the separate finiteness test.
+        not_finite = not np.isfinite(actual)
+        too_far = abs(actual - expected) > tolerance
+        if not_finite or too_far:
+            failed = True
+
+    logger = logging.getLogger(__name__)
+    if failed:
+        logger.error("Result verification failed!")
+        logger.error("Expected Results: " + str(expected_results))
+        logger.error("Actual Results: " + pprint.pformat(results))
+
+        sys.exit(1)
+    else:
+        logger.info("Results verification passed.")
+    return not failed
+
+
+def flatten_results_dict(results):
+    """
+    Collapse a nested dict of scalars into a single-level dict whose keys are
+    the nested keys joined by "/".
+    If results[k1][k2][k3] = v, the returned dict will have the entry
+    {"k1/k2/k3": v}.
+
+    Args:
+        results (dict): possibly nested mapping; keys of nested levels must
+            support `+` with a str, since they are concatenated with "/".
+
+    Returns:
+        dict: a new flat dict; entries keep the traversal order of `results`.
+    """
+    flat = {}
+    for key, value in results.items():
+        if not isinstance(value, Mapping):
+            flat[key] = value
+            continue
+        # Flatten the sub-mapping first, then prefix each of its keys.
+        nested = flatten_results_dict(value)
+        flat.update({key + "/" + sub_key: sub_value for sub_key, sub_value in nested.items()})
+    return flat
diff --git a/detectron2/layers/__init__.py b/detectron2/layers/__init__.py
new file mode 100644
index 0000000..d31d393
--- /dev/null
+++ b/detectron2/layers/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm
+from .deform_conv import DeformConv, ModulatedDeformConv
+from .mask_ops import paste_masks_in_image
+from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated
+from .roi_align import ROIAlign, roi_align
+from .roi_align_rotated import ROIAlignRotated, roi_align_rotated
+from .shape_spec import ShapeSpec
+from .wrappers import BatchNorm2d, Conv2d, ConvTranspose2d, cat, interpolate
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/layers/batch_norm.py b/detectron2/layers/batch_norm.py
new file mode 100644
index 0000000..46c5cc0
--- /dev/null
+++ b/detectron2/layers/batch_norm.py
@@ -0,0 +1,179 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.autograd.function import Function
+
+from detectron2.utils import comm
+
+from .wrappers import BatchNorm2d
+
+
+class FrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    It contains non-trainable buffers called
+    "weight" and "bias", "running_mean", "running_var",
+    initialized to perform identity transformation.
+
+    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
+    which are computed from the original four parameters of BN.
+    The affine transform `x * weight + bias` will perform the equivalent
+    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
+    When loading a backbone model from Caffe2, "running_mean" and "running_var"
+    will be left unchanged as identity transformation.
+
+    Other pre-trained backbone models may contain all 4 parameters.
+
+    The forward applies a per-channel affine transform `x * scale + shift` with
+    `scale = weight / sqrt(running_var + eps)` and
+    `shift = bias - running_mean * scale`.
+    """
+
+    # Version 3: forward() adds `eps` to `running_var`. Older checkpoints stored a
+    # variance that was used without eps and are adjusted in _load_from_state_dict.
+    _version = 3
+
+    def __init__(self, num_features, eps=1e-5):
+        """
+        Args:
+            num_features (int): number of channels (size of dim 1 of the input).
+            eps (float): value added to `running_var` inside forward().
+        """
+        super().__init__()
+        self.num_features = num_features
+        self.eps = eps
+        self.register_buffer("weight", torch.ones(num_features))
+        self.register_buffer("bias", torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        # Start at 1 - eps so that running_var + eps == 1 (identity transform).
+        self.register_buffer("running_var", torch.ones(num_features) - eps)
+
+    def forward(self, x):
+        """Apply the frozen normalization; the reshape to (1, C, 1, 1) broadcasts over a 4D input."""
+        scale = self.weight * (self.running_var + self.eps).rsqrt()
+        bias = self.bias - self.running_mean * scale
+        scale = scale.reshape(1, -1, 1, 1)
+        bias = bias.reshape(1, -1, 1, 1)
+        return x * scale + bias
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        """Upgrade entries written by older versions of this module, then load as usual."""
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 2:
+            # No running_mean/var in early versions
+            # This will silent the warnings
+            if prefix + "running_mean" not in state_dict:
+                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
+            if prefix + "running_var" not in state_dict:
+                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
+
+        if version is not None and version < 3:
+            logger = logging.getLogger(__name__)
+            logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip(".")))
+            # In version < 3, running_var are used without +eps.
+            # Rebind the entry instead of using `-=`: the tensor may be shared with
+            # the caller's checkpoint, and modifying it in place would subtract
+            # eps again every time the same checkpoint is loaded.
+            var_key = prefix + "running_var"
+            state_dict[var_key] = state_dict[var_key] - self.eps
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def __repr__(self):
+        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
+
+    @classmethod
+    def convert_frozen_batchnorm(cls, module):
+        """
+        Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+        Args:
+            module (torch.nn.Module):
+
+        Returns:
+            If module is BatchNorm/SyncBatchNorm, returns a new module.
+            Otherwise, in-place convert module and return it.
+
+        Similar to convert_sync_batchnorm in
+        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
+        """
+        bn_module = nn.modules.batchnorm
+        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+        res = module
+        if isinstance(module, bn_module):
+            res = cls(module.num_features)
+            if module.affine:
+                res.weight.data = module.weight.data.clone().detach()
+                res.bias.data = module.bias.data.clone().detach()
+            res.running_mean.data = module.running_mean.data
+            # forward() already adds eps to running_var, so the variance is copied
+            # unchanged and the source module's eps is carried over. Adding eps
+            # here as well would count it twice.
+            res.running_var.data = module.running_var.data
+            res.eps = module.eps
+        else:
+            for name, child in module.named_children():
+                new_child = cls.convert_frozen_batchnorm(child)
+                if new_child is not child:
+                    # add_module with an existing name replaces that child.
+                    res.add_module(name, new_child)
+        return res
+
+
+def get_norm(norm, out_channels):
+    """
+    Build a normalization layer for `out_channels` channels.
+
+    Args:
+        norm (str or callable): one of "BN", "SyncBN", "FrozenBN", "GN",
+            "nnSyncBN", an empty string for no normalization, or a callable
+            that takes the channel count and returns the layer.
+        out_channels (int): passed to the selected constructor/callable.
+
+    Returns:
+        nn.Module or None: the normalization layer
+    """
+    if not isinstance(norm, str):
+        return norm(out_channels)
+    if len(norm) == 0:
+        return None
+    # An unknown name raises KeyError from the lookup below.
+    factories = {
+        "BN": BatchNorm2d,
+        "SyncBN": NaiveSyncBatchNorm,
+        "FrozenBN": FrozenBatchNorm2d,
+        "GN": lambda channels: nn.GroupNorm(32, channels),
+        "nnSyncBN": nn.SyncBatchNorm,  # keep for debugging
+    }
+    return factories[norm](out_channels)
+
+
+class AllReduce(Function):
+    """
+    Autograd function that sums a tensor over all processes of the default
+    `torch.distributed` group.
+    """
+
+    @staticmethod
+    def forward(ctx, input):
+        """Gather `input` from every rank and return the element-wise sum."""
+        world_size = dist.get_world_size()
+        gathered = [torch.zeros_like(input) for _ in range(world_size)]
+        # Use allgather instead of allreduce since I don't trust in-place operations ..
+        dist.all_gather(gathered, input, async_op=False)
+        stacked = torch.stack(gathered, dim=0)
+        return stacked.sum(dim=0)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """All-reduce the incoming gradient in place (default reduce op) and return it."""
+        dist.all_reduce(grad_output, async_op=False)
+        return grad_output
+
+
+class NaiveSyncBatchNorm(BatchNorm2d):
+    """
+    `torch.nn.SyncBatchNorm` has known unknown bugs.
+    It produces significantly worse AP (and sometimes goes NaN)
+    when the batch size on each worker is quite different
+    (e.g., when scale augmentation is used, or when it is applied to mask head).
+
+    Use this implementation before `nn.SyncBatchNorm` is fixed.
+    It is slower than `nn.SyncBatchNorm`.
+    """
+
+    def forward(self, input):
+        """
+        Normalize `input` with statistics averaged over all workers.
+
+        Falls back to the parent class forward when there is a single worker or
+        the module is not in training mode. Each worker contributes its own
+        per-channel mean and mean of squares with equal weight.
+        """
+        single_worker = comm.get_world_size() == 1
+        if single_worker or not self.training:
+            return super().forward(input)
+
+        assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
+        num_channels = input.shape[1]
+        reduce_dims = [0, 2, 3]
+        local_mean = torch.mean(input, dim=reduce_dims)
+        local_meansqr = torch.mean(input * input, dim=reduce_dims)
+
+        # Both statistics travel in one tensor so a single collective is enough.
+        stats = torch.cat([local_mean, local_meansqr], dim=0)
+        stats = AllReduce.apply(stats) * (1.0 / dist.get_world_size())
+
+        mean, meansqr = torch.split(stats, num_channels)
+        # Var[x] = E[x^2] - E[x]^2
+        var = meansqr - mean * mean
+        self.running_mean += self.momentum * (mean.detach() - self.running_mean)
+        self.running_var += self.momentum * (var.detach() - self.running_var)
+
+        inv_std = torch.rsqrt(var + self.eps)
+        channel_scale = self.weight * inv_std
+        channel_shift = self.bias - mean * channel_scale
+        return input * channel_scale.reshape(1, -1, 1, 1) + channel_shift.reshape(1, -1, 1, 1)
diff --git a/detectron2/layers/csrc/README.md b/detectron2/layers/csrc/README.md
new file mode 100644
index 0000000..778ed3d
--- /dev/null
+++ b/detectron2/layers/csrc/README.md
@@ -0,0 +1,7 @@
+
+
+To add a new Op:
+
+1. Create a new directory
+2. Implement new ops there
+3. Declare its Python interface in `vision.cpp`.
diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign.h b/detectron2/layers/csrc/ROIAlign/ROIAlign.h
new file mode 100644
index 0000000..7ec4e23
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlign/ROIAlign.h
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+// CPU forward pass of ROIAlign.
+// `rois` is read as rows of 5 values: batch index followed by four box
+// coordinates that are multiplied by `spatial_scale`.
+// Returns a tensor of size
+// {rois.size(0), input.size(1), pooled_height, pooled_width}.
+at::Tensor ROIAlign_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned);
+
+// CPU backward pass of ROIAlign.
+// `grad` is the gradient w.r.t. the pooled output; the result is the gradient
+// w.r.t. the input, of size {batch_size, channels, height, width}.
+at::Tensor ROIAlign_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned);
+
+// CUDA counterparts with the same signatures; only declared when the
+// extension is compiled with WITH_CUDA defined.
+#ifdef WITH_CUDA
+at::Tensor ROIAlign_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned);
+
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned);
+#endif
+
+// Interface for Python
+// Forward entry point: dispatches to the CUDA or the CPU implementation
+// depending on the device of `input`. All arguments are forwarded unchanged.
+// Raises an error when `input` is a CUDA tensor but the extension was built
+// without WITH_CUDA.
+inline at::Tensor ROIAlign_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  // Query the tensor directly; Tensor::type() is deprecated.
+  if (input.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlign_forward_cuda(
+        input,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        aligned);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlign_forward_cpu(
+      input,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio,
+      aligned);
+}
+
+// Backward entry point: dispatches to the CUDA or the CPU implementation
+// depending on the device of `grad`. All arguments are forwarded unchanged.
+// Raises an error when `grad` is a CUDA tensor but the extension was built
+// without WITH_CUDA.
+inline at::Tensor ROIAlign_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  // Query the tensor directly; Tensor::type() is deprecated.
+  if (grad.is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlign_backward_cuda(
+        grad,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        batch_size,
+        channels,
+        height,
+        width,
+        sampling_ratio,
+        aligned);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlign_backward_cpu(
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio,
+      aligned);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp b/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
new file mode 100644
index 0000000..1b7da58
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
@@ -0,0 +1,503 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/TensorUtils.h>
+#include "ROIAlign.h"
+
+namespace {
+
+// implementation taken from Caffe2
+// Cached bilinear-interpolation data for one sampling point: the offsets of
+// its four neighbouring pixels inside a single (height x width) plane and the
+// weight applied to each of them.
+template <typename T>
+struct PreCalc {
+  // Flattened offsets (y * width + x) of the four neighbours.
+  int pos1, pos2, pos3, pos4;
+  // Interpolation weights, w<k> belongs to pos<k>.
+  T w1, w2, w3, w4;
+};
+
+// Fills `pre_calc` with the neighbour offsets and bilinear weights of every
+// sampling point of one ROI, so they can be reused for all channels.
+//
+// Entries are written by index in (ph, pw, iy, ix) order, so `pre_calc` must
+// already hold at least pooled_height * pooled_width * iy_upper * ix_upper
+// elements.
+//
+//   height, width              size of the feature-map plane being sampled
+//   pooled_height/width        number of output bins per ROI
+//   iy_upper, ix_upper         number of sampling points per bin along y / x
+//   roi_start_h/w              top-left corner of the ROI
+//   bin_size_h/w               size of one output bin
+//   roi_bin_grid_h/w           divisor used to space the sampling points
+//                              inside a bin
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int pre_calc_index = 0;
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        // y coordinate of the sample: centre of the iy-th sub-cell of bin ph
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+        for (int ix = 0; ix < ix_upper; ix++) {
+          // x coordinate of the sample: centre of the ix-th sub-cell of bin pw
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          T x = xx;
+          T y = yy;
+          // deal with: inverse elements are out of feature map boundary
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            // empty
+            // all weights are zero, so this sample contributes nothing;
+            // offset 0 keeps the later reads in bounds
+            PreCalc<T> pc;
+            pc.pos1 = 0;
+            pc.pos2 = 0;
+            pc.pos3 = 0;
+            pc.pos4 = 0;
+            pc.w1 = 0;
+            pc.w2 = 0;
+            pc.w3 = 0;
+            pc.w4 = 0;
+            pre_calc[pre_calc_index] = pc;
+            pre_calc_index += 1;
+            continue;
+          }
+
+          // samples in [-1, 0] are clamped onto the first row / column
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = (int)y;
+          int x_low = (int)x;
+          int y_high;
+          int x_high;
+
+          // on the last row / column both neighbours collapse onto the edge
+          // pixel and the coordinate is snapped to it
+          if (y_low >= height - 1) {
+            y_high = y_low = height - 1;
+            y = (T)y_low;
+          } else {
+            y_high = y_low + 1;
+          }
+
+          if (x_low >= width - 1) {
+            x_high = x_low = width - 1;
+            x = (T)x_low;
+          } else {
+            x_high = x_low + 1;
+          }
+
+          // fractional distances to the low neighbours and their complements
+          T ly = y - y_low;
+          T lx = x - x_low;
+          T hy = 1. - ly, hx = 1. - lx;
+          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+          // save weights and indices
+          PreCalc<T> pc;
+          pc.pos1 = y_low * width + x_low;
+          pc.pos2 = y_low * width + x_high;
+          pc.pos3 = y_high * width + x_low;
+          pc.pos4 = y_high * width + x_high;
+          pc.w1 = w1;
+          pc.w2 = w2;
+          pc.w3 = w3;
+          pc.w4 = w4;
+          pre_calc[pre_calc_index] = pc;
+
+          pre_calc_index += 1;
+        }
+      }
+    }
+  }
+}
+
+// CPU kernel of the ROIAlign forward pass.
+//
+// For every ROI and channel, each of the pooled_height x pooled_width output
+// bins is the average of bilinearly interpolated samples taken on a regular
+// grid inside the bin.
+//
+//   nthreads        total number of output elements
+//                   (n_rois * channels * pooled_height * pooled_width)
+//   input           feature map, indexed as contiguous (N, C, height, width)
+//   spatial_scale   factor applied to the ROI coordinates
+//   sampling_ratio  samples per bin along each axis; when <= 0 it is derived
+//                   per ROI as ceil(roi_size / pooled_size)
+//   rois            n_rois rows of 5 values: batch index, then x1, y1, x2, y2
+//   output          written as contiguous (n_rois, C, pooled_height,
+//                   pooled_width)
+//   aligned         when true, coordinates are shifted by -0.5 after scaling
+//                   and ROIs with negative size are rejected; when false, the
+//                   ROI size is clamped to at least 1 (legacy behavior)
+template <typename T>
+void ROIAlignForward(
+    const int nthreads,
+    const T* input,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output,
+    bool aligned) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (aligned) {
+      // The message states the violated requirement: sizes must be >= 0.
+      AT_ASSERTM(
+          roi_width >= 0 && roi_height >= 0,
+          "ROIs in ROIAlign cannot have negative size!");
+    } else { // for backward-compatibility only
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // When the grid is empty, output zeros.
+    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      // The cached entries are consumed in the same (ph, pw, iy, ix) order
+      // in which they were produced, restarting for every channel.
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1] +
+                  pc.w2 * offset_input[pc.pos2] +
+                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          output_val /= count;
+
+          output[index] = output_val;
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+}
+
+// Computes, for the sampling point (y, x), the four neighbouring pixel
+// coordinates and the bilinear weights with which a gradient is distributed
+// to them.
+//
+// Outputs (by reference): weights w1..w4 and the coordinates x_low, x_high,
+// y_low, y_high. When the point lies outside [-1, height] x [-1, width], all
+// weights are 0 and all coordinates are -1, which callers can use to skip the
+// sample. `y` and `x` are taken by value, so clamping here does not affect
+// the caller. `index` is not used by the computation.
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  // points in [-1, 0] are clamped onto the first row / column
+  if (y <= 0)
+    y = 0;
+  if (x <= 0)
+    x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  // on the last row / column both neighbours collapse onto the edge pixel
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  // fractional distances to the low neighbours and their complements
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+// Accumulates `increment` into the value stored at `target`.
+// Plain (non-atomic) addition.
+template <class T>
+inline void add(T* target, const T& increment) {
+  *target += increment;
+}
+
+// CPU kernel of the ROIAlign backward pass.
+//
+// Iterates over every element (n, c, ph, pw) of the pooled output gradient
+// and distributes it, divided by the number of samples in the bin, to the
+// four neighbours of each sampling point using the bilinear weights.
+//
+//   nthreads        number of elements of the output gradient
+//   grad_output     gradient w.r.t. the pooled output; addressed through
+//                   n_stride / c_stride / h_stride / w_stride
+//   grad_input      gradient w.r.t. the input, indexed as contiguous
+//                   (N, C, height, width); values are accumulated into it,
+//                   so the caller must provide it zero-initialized
+//   rois            rows of 5 values: batch index, then x1, y1, x2, y2
+//   aligned         same meaning as in the forward kernel
+template <typename T>
+void ROIAlignBackward(
+    const int nthreads,
+    const T* grad_output,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride,
+    bool aligned) {
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (aligned) {
+      AT_ASSERTM(
+          roi_width >= 0 && roi_height >= 0,
+          "ROIs in ROIAlign do not have non-negative size!");
+    } else { // for backward-compatibility only
+      roi_width = std::max(roi_width, (T)1.);
+      roi_height = std::max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // plane of grad_input that belongs to this ROI's image and channel
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // If either grid dimension is 0 the loops below do not execute, so the
+    // division by `count` is never reached with count == 0.
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        // share of this bin's gradient that goes to each neighbour
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        // out-of-range samples are reported with coordinates of -1 and skipped
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignBackward
+
+} // namespace
+
+namespace detectron2 {
+
+// CPU implementation of the ROIAlign forward pass.
+//
+// Both tensors must live on the CPU and share the same type. `input` is read
+// as (N, C, H, W) and `rois` as rows of 5 values. Returns a tensor of size
+// {rois.size(0), C, pooled_height, pooled_width}; it is returned untouched
+// (all zeros) when it has no elements.
+at::Tensor ROIAlign_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_forward_cpu";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+  if (output.numel() == 0)
+    return output;
+
+  // The kernel indexes raw memory, so hand it contiguous buffers and keep
+  // them alive in named variables for the duration of the call.
+  at::Tensor input_c = input.contiguous();
+  at::Tensor rois_c = rois.contiguous();
+
+  // Dispatch on the ScalarType; passing Tensor::type() is deprecated.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      input.scalar_type(), "ROIAlign_forward", [&] {
+        ROIAlignForward<scalar_t>(
+            output_size,
+            input_c.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_c.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>(),
+            aligned);
+      });
+  return output;
+}
+
+// CPU implementation of the ROIAlign backward pass.
+//
+// `grad` is the gradient w.r.t. the pooled output and must be a CPU tensor of
+// the same type as `rois`. Returns the gradient w.r.t. the input, a tensor of
+// size {batch_size, channels, height, width}; it is all zeros when `grad` has
+// no elements.
+at::Tensor ROIAlign_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlign_backward_cpu";
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    return grad_input;
+  }
+
+  // The kernel receives a raw pointer plus strides. Both must describe the
+  // same memory, so the strides are taken from the contiguous tensor whose
+  // data pointer is passed, not from the possibly non-contiguous `grad`.
+  at::Tensor grad_c = grad.contiguous();
+  at::Tensor rois_c = rois.contiguous();
+
+  int n_stride = grad_c.stride(0);
+  int c_stride = grad_c.stride(1);
+  int h_stride = grad_c.stride(2);
+  int w_stride = grad_c.stride(3);
+
+  // Dispatch on the ScalarType; passing Tensor::type() is deprecated.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "ROIAlign_backward", [&] {
+        ROIAlignBackward<scalar_t>(
+            grad_c.numel(),
+            grad_c.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_c.data_ptr<scalar_t>(),
+            n_stride,
+            c_stride,
+            h_stride,
+            w_stride,
+            aligned);
+      });
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu b/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
new file mode 100644
index 0000000..6836743
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu
@@ -0,0 +1,422 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// TODO make it in a common file
+// Iterates `i` from this thread's global 1-D index up to (but excluding) `n`,
+// advancing by blockDim.x * gridDim.x each step. `n` is parenthesised so that
+// an expression argument (e.g. a conditional) cannot change the meaning of
+// the loop condition.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Bilinearly samples a single height x width plane at the fractional
+// position (y, x).
+//
+// bottom_data: plane to sample, addressed as [row * width + col].
+// y, x:        sample position. Positions below -1 or beyond height / width
+//              return 0; remaining non-positive positions are clamped to 0.
+// index:       not used by the computation (debug aid only).
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* bottom_data,
+    const int height,
+    const int width,
+    T y,
+    T x,
+    const int index /* index for debug only*/) {
+  // Samples that fall too far outside the plane contribute nothing.
+  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
+  if (outside) {
+    return 0;
+  }
+
+  // Clamp the remaining non-positive coordinates onto the first row/column.
+  y = (y <= 0) ? (T)0 : y;
+  x = (x <= 0) ? (T)0 : x;
+
+  int row_lo = (int)y;
+  int col_lo = (int)x;
+
+  // On the last row/column both neighbours collapse onto the border cell
+  // and the coordinate is snapped to it, so the fractional part becomes 0.
+  const bool last_row = row_lo >= height - 1;
+  if (last_row) {
+    row_lo = height - 1;
+    y = (T)row_lo;
+  }
+  const int row_hi = last_row ? row_lo : row_lo + 1;
+
+  const bool last_col = col_lo >= width - 1;
+  if (last_col) {
+    col_lo = width - 1;
+    x = (T)col_lo;
+  }
+  const int col_hi = last_col ? col_lo : col_lo + 1;
+
+  // Fractional distance from the low corner, and its complement.
+  const T frac_y = y - row_lo;
+  const T frac_x = x - col_lo;
+  const T rest_y = 1. - frac_y;
+  const T rest_x = 1. - frac_x;
+
+  // The four neighbouring cells, named (row, col).
+  const T val_lo_lo = bottom_data[row_lo * width + col_lo];
+  const T val_lo_hi = bottom_data[row_lo * width + col_hi];
+  const T val_hi_lo = bottom_data[row_hi * width + col_lo];
+  const T val_hi_hi = bottom_data[row_hi * width + col_hi];
+
+  const T wt_lo_lo = rest_y * rest_x;
+  const T wt_lo_hi = rest_y * frac_x;
+  const T wt_hi_lo = frac_y * rest_x;
+  const T wt_hi_hi = frac_y * frac_x;
+
+  return (
+      wt_lo_lo * val_lo_lo + wt_lo_hi * val_lo_hi + wt_hi_lo * val_hi_lo +
+      wt_hi_hi * val_hi_hi);
+}
+
+// CUDA kernel for the ROIAlign forward pass.
+//
+// Every loop iteration computes one element of the pooled output. The flat
+// `index` is decomposed as (n, c, ph, pw) with pw varying fastest.
+//
+// nthreads:       number of output elements to produce.
+// bottom_data:    input features, addressed as
+//                 [(batch * channels + c) * height * width + y * width + x].
+// spatial_scale:  factor applied to the ROI coordinates before sampling.
+// channels, height, width: dimensions used to address bottom_data.
+// pooled_height, pooled_width: size of the output grid per ROI and channel.
+// sampling_ratio: samples per bin along each axis; when <= 0 the count is
+//                 derived per ROI as ceil(roi_size / pooled_size).
+// bottom_rois:    5 values per ROI: batch index, start_w, start_h, end_w,
+//                 end_h.
+// top_data:       output buffer, written at `index`.
+// aligned:        true shifts the scaled ROI by -0.5 and skips the minimum
+//                 size clamp; false keeps the backward-compatible behaviour.
+template <typename T>
+__global__ void RoIAlignForward(
+    const int nthreads,
+    const T* bottom_data,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois,
+    T* top_data,
+    bool aligned) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    // Row n of the ROI table; its first entry selects the batch image.
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!aligned) { // for backward-compatibility only
+      // Legacy mode forces every ROI to be at least 1x1.
+      roi_width = max(roi_width, (T)1.);
+      roi_height = max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // Start of the (batch, channel) plane this output element samples from.
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // When the grid is empty, output zeros.
+    // (count is clamped to 1 so the division below is always defined.)
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      // Sample rows sit at the centres of roi_bin_grid_h equal sub-cells.
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(
+            offset_bottom_data, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    // Average over all sample points of the bin.
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+// Computes the bilinear weights and the four neighbour indices for the
+// fractional position (y, x) inside a height x width plane.
+//
+// Outputs (by reference):
+//   w1..w4          weights for (y_low, x_low), (y_low, x_high),
+//                   (y_high, x_low), (y_high, x_high) respectively.
+//   x_low .. y_high integer neighbour coordinates; all set to -1 (with zero
+//                   weights) when the position is below -1 or beyond
+//                   height / width.
+// index is not used by the computation (debug aid only).
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high,
+    const int index /* index for debug only*/) {
+  // Positions too far outside the plane receive no gradient at all.
+  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
+  if (outside) {
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  // Clamp the remaining non-positive coordinates onto the first row/column.
+  y = (y <= 0) ? (T)0 : y;
+  x = (x <= 0) ? (T)0 : x;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  // On the last row/column both neighbours collapse onto the border cell.
+  const bool last_row = y_low >= height - 1;
+  if (last_row) {
+    y_low = height - 1;
+    y = (T)y_low;
+  }
+  y_high = last_row ? y_low : y_low + 1;
+
+  const bool last_col = x_low >= width - 1;
+  if (last_col) {
+    x_low = width - 1;
+    x = (T)x_low;
+  }
+  x_high = last_col ? x_low : x_low + 1;
+
+  // Fractional distance from the low corner, and its complement.
+  const T frac_y = y - y_low;
+  const T frac_x = x - x_low;
+  const T rest_y = 1. - frac_y;
+  const T rest_x = 1. - frac_x;
+
+  // Same weights the forward interpolation applies to its four samples.
+  w1 = rest_y * rest_x;
+  w2 = rest_y * frac_x;
+  w3 = frac_y * rest_x;
+  w4 = frac_y * frac_x;
+}
+
+// CUDA kernel for the ROIAlign backward pass w.r.t. the input features.
+//
+// Every loop iteration takes one element of the pooled gradient, identified
+// by the flat `index` -> (n, c, ph, pw), and scatters it onto the input
+// cells that the forward pass sampled for that bin.
+//
+// nthreads:       number of pooled-gradient elements to process.
+// top_diff:       pooled gradient, addressed as
+//                 [(n * channels + c) * pooled_height * pooled_width +
+//                  ph * pooled_width + pw].
+// num_rois:       not referenced in this kernel body.
+// spatial_scale:  factor applied to the ROI coordinates before sampling.
+// channels, height, width: dimensions used to address bottom_diff.
+// pooled_height, pooled_width: size of the pooled grid per ROI and channel.
+// sampling_ratio: samples per bin along each axis; when <= 0 the count is
+//                 derived per ROI as ceil(roi_size / pooled_size).
+// bottom_diff:    gradient w.r.t. the input, accumulated with atomicAdd.
+// bottom_rois:    5 values per ROI: batch index, start_w, start_h, end_w,
+//                 end_h.
+// aligned:        true shifts the scaled ROI by -0.5 and skips the minimum
+//                 size clamp; false keeps the backward-compatible behaviour.
+template <typename T>
+__global__ void RoIAlignBackwardFeature(
+    const int nthreads,
+    const T* top_diff,
+    const int num_rois,
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois,
+    bool aligned) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    // Row n of the ROI table; its first entry selects the batch image.
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T offset = aligned ? (T)0.5 : (T)0.0;
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+    if (!aligned) { // for backward-compatibility only
+      // Legacy mode forces every ROI to be at least 1x1.
+      roi_width = max(roi_width, (T)1.);
+      roi_height = max(roi_height, (T)1.);
+    }
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // Start of the (batch, channel) plane that receives the gradient.
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    // Pooled gradient value belonging to this (n, c, ph, pw) bin.
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // (count is only used inside the loops below, which do not run when the
+    // sampling grid is empty.)
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      // Sample rows sit at the centres of roi_bin_grid_h equal sub-cells.
+      const T y = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height,
+            width,
+            y,
+            x,
+            w1,
+            w2,
+            w3,
+            w4,
+            x_low,
+            x_high,
+            y_low,
+            y_high,
+            index);
+
+        // Share of the bin gradient owed to each of the four neighbours.
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        // Indices are -1 when the sample was rejected as out of range.
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // NOTE(review): atomicAdd presumably guards against several
+          // threads accumulating into the same input cell - confirm.
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+namespace detectron2 {
+
+// Host entry point for the ROIAlign forward pass on CUDA.
+//
+// Checks that `input` and `rois` are CUDA tensors on the same GPU with the
+// same scalar type, allocates a {num_rois, channels, pooled_height,
+// pooled_width} output and launches RoIAlignForward on the current stream of
+// input's device. spatial_scale, sampling_ratio and aligned are forwarded to
+// the kernel unchanged. Only float and double are dispatched.
+at::Tensor ROIAlign_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::CheckedFrom caller = "ROIAlign_forward_cuda";
+  at::TensorArg input_arg{input, "input", 1};
+  at::TensorArg rois_arg{rois, "rois", 2};
+  at::checkAllSameGPU(caller, {input_arg, rois_arg});
+  at::checkAllSameType(caller, {input_arg, rois_arg});
+
+  // Make input's device current before querying its stream.
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  const auto roi_count = rois.size(0);
+  const auto channels = input.size(1);
+  const auto height = input.size(2);
+  const auto width = input.size(3);
+
+  auto output = at::empty(
+      {roi_count, channels, pooled_height, pooled_width}, input.options());
+
+  // Nothing to compute: report any pending CUDA error and bail out.
+  if (output.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  const auto total_outputs =
+      roi_count * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // 512 threads per block, at most 4096 blocks; the kernel loop covers the
+  // remainder.
+  const dim3 block(512);
+  const dim3 grid(
+      std::min(at::cuda::ATenCeilDiv((long)total_outputs, 512L), 4096L));
+
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+    // Keep the contiguous copies alive for the duration of the launch.
+    const at::Tensor input_c = input.contiguous();
+    const at::Tensor rois_c = rois.contiguous();
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+        total_outputs,
+        input_c.data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        rois_c.data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>(),
+        aligned);
+  });
+  // NOTE(review): this is a device-wide synchronisation; the backward entry
+  // point does not perform one - confirm whether it is still required.
+  cudaDeviceSynchronize();
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+// Host entry point for the ROIAlign backward pass on CUDA.
+//
+// Checks that `grad` and `rois` are CUDA tensors on the same GPU with the
+// same scalar type, allocates a zero-filled {batch_size, channels, height,
+// width} gradient and launches RoIAlignBackwardFeature on the current stream
+// of grad's device. The remaining arguments are forwarded to the kernel
+// unchanged. Only float and double are dispatched.
+at::Tensor ROIAlign_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio,
+    bool aligned) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::CheckedFrom caller = "ROIAlign_backward_cuda";
+  at::TensorArg grad_arg{grad, "grad", 1};
+  at::TensorArg rois_arg{rois, "rois", 2};
+  at::checkAllSameGPU(caller, {grad_arg, rois_arg});
+  at::checkAllSameType(caller, {grad_arg, rois_arg});
+
+  // Make grad's device current before querying its stream.
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  const auto roi_count = rois.size(0);
+  auto grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // 512 threads per block, at most 4096 blocks; the kernel loop covers the
+  // remainder.
+  const dim3 block(512);
+  const dim3 grid(
+      std::min(at::cuda::ATenCeilDiv((long)grad.numel(), 512L), 4096L));
+
+  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+    // Keep the contiguous copies alive for the duration of the launch.
+    const at::Tensor grad_c = grad.contiguous();
+    const at::Tensor rois_c = rois.contiguous();
+    RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad_c.data_ptr<scalar_t>(),
+        roi_count,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        grad_input.data_ptr<scalar_t>(),
+        rois_c.data_ptr<scalar_t>(),
+        aligned);
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
new file mode 100644
index 0000000..dfc17b6
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h
@@ -0,0 +1,115 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+// CPU implementation of the ROIAlignRotated forward pass.
+//   input:          feature tensor; dimensions 1..3 are read as channels,
+//                   height and width by the implementation.
+//   rois:           rotated boxes; the CPU implementation reads 6 values per
+//                   box (batch index, center x, center y, width, height,
+//                   angle in degrees).
+//   spatial_scale:  factor applied to the box center and size.
+//   pooled_height, pooled_width: output grid size per box and channel.
+//   sampling_ratio: samples per bin along each axis; <= 0 derives the count
+//                   from the box size.
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+// CPU implementation of the ROIAlignRotated backward pass.
+//   grad:  gradient w.r.t. the pooled output.
+//   batch_size, channels, height, width: shape of the returned gradient
+//          w.r.t. the input features.
+//   Remaining parameters have the same meaning as in the forward pass.
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+
+// The CUDA counterparts are only declared when the extension is built with
+// WITH_CUDA defined; their signatures mirror the CPU functions above.
+#ifdef WITH_CUDA
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio);
+
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio);
+#endif
+
+// Interface for Python
+// Forward pass of ROIAlignRotated. Dispatches to the CUDA implementation
+// when `input` lives on a CUDA device (raising an error if the extension was
+// built without WITH_CUDA) and to the CPU implementation otherwise. All
+// arguments are forwarded unchanged.
+inline at::Tensor ROIAlignRotated_forward(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  // Query the device directly instead of going through the deprecated
+  // Tensor::type(); this matches the checks in the CPU/CUDA implementations.
+  if (input.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlignRotated_forward_cuda(
+        input,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        sampling_ratio);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlignRotated_forward_cpu(
+      input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+// Backward pass of ROIAlignRotated. Dispatches to the CUDA implementation
+// when `grad` lives on a CUDA device (raising an error if the extension was
+// built without WITH_CUDA) and to the CPU implementation otherwise. All
+// arguments are forwarded unchanged.
+inline at::Tensor ROIAlignRotated_backward(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  // Query the device directly instead of going through the deprecated
+  // Tensor::type(); this matches the checks in the CPU/CUDA implementations.
+  if (grad.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return ROIAlignRotated_backward_cuda(
+        grad,
+        rois,
+        spatial_scale,
+        pooled_height,
+        pooled_width,
+        batch_size,
+        channels,
+        height,
+        width,
+        sampling_ratio);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return ROIAlignRotated_backward_cpu(
+      grad,
+      rois,
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      batch_size,
+      channels,
+      height,
+      width,
+      sampling_ratio);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
new file mode 100644
index 0000000..fc61cd7
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp
@@ -0,0 +1,519 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/TensorUtils.h>
+#include "ROIAlignRotated.h"
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+// Cached data for one bilinear sample point: the flat offsets of its four
+// neighbouring cells inside a height x width plane (pos1..pos4) and the
+// weight applied to each of them (w1..w4).
+template <typename T>
+struct PreCalc {
+  int pos1, pos2, pos3, pos4;
+  T w1, w2, w3, w4;
+};
+
+// Fills `pre_calc` with the neighbour offsets and bilinear weights of every
+// sample point of one rotated ROI, so they can be reused for all channels.
+//
+// Entries are written in (ph, pw, iy, ix) order, ix varying fastest, which
+// is the order the caller must read them back in. `pre_calc` must already
+// hold pooled_height * pooled_width * iy_upper * ix_upper elements.
+//
+// roi_start_h / roi_start_w are relative to the ROI center; each sample is
+// rotated by (cos_theta, sin_theta) and then translated by roi_center_h /
+// roi_center_w. Samples below -1 or beyond height / width get all-zero
+// offsets and weights.
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int iy_upper,
+    const int ix_upper,
+    T roi_start_h,
+    T roi_start_w,
+    T bin_size_h,
+    T bin_size_w,
+    int roi_bin_grid_h,
+    int roi_bin_grid_w,
+    T roi_center_h,
+    T roi_center_w,
+    T cos_theta,
+    T sin_theta,
+    std::vector<PreCalc<T>>& pre_calc) {
+  int slot = 0; // next entry of pre_calc to fill
+  for (int ph = 0; ph < pooled_height; ph++) {
+    for (int pw = 0; pw < pooled_width; pw++) {
+      for (int iy = 0; iy < iy_upper; iy++) {
+        // Un-rotated sample row, relative to the ROI center.
+        const T yy = roi_start_h + ph * bin_size_h +
+            static_cast<T>(iy + .5f) * bin_size_h /
+                static_cast<T>(roi_bin_grid_h);
+        // Exactly one entry is produced per (iy, ix), accepted or not.
+        for (int ix = 0; ix < ix_upper; ix++, slot++) {
+          const T xx = roi_start_w + pw * bin_size_w +
+              static_cast<T>(ix + .5f) * bin_size_w /
+                  static_cast<T>(roi_bin_grid_w);
+
+          // Rotate the point around the ROI center by theta, then translate
+          // it into feature-map coordinates.
+          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+          PreCalc<T>& entry = pre_calc[slot];
+
+          // Samples too far outside the feature map contribute nothing.
+          if (y < -1.0 || y > height || x < -1.0 || x > width) {
+            entry.pos1 = 0;
+            entry.pos2 = 0;
+            entry.pos3 = 0;
+            entry.pos4 = 0;
+            entry.w1 = 0;
+            entry.w2 = 0;
+            entry.w3 = 0;
+            entry.w4 = 0;
+            continue;
+          }
+
+          // Clamp the remaining negative coordinates onto the first
+          // row/column.
+          if (y < 0) {
+            y = 0;
+          }
+          if (x < 0) {
+            x = 0;
+          }
+
+          int row_lo = (int)y;
+          int col_lo = (int)x;
+
+          // On the last row/column both neighbours collapse onto the border.
+          const bool last_row = row_lo >= height - 1;
+          if (last_row) {
+            row_lo = height - 1;
+            y = (T)row_lo;
+          }
+          const int row_hi = last_row ? row_lo : row_lo + 1;
+
+          const bool last_col = col_lo >= width - 1;
+          if (last_col) {
+            col_lo = width - 1;
+            x = (T)col_lo;
+          }
+          const int col_hi = last_col ? col_lo : col_lo + 1;
+
+          // Fractional distance from the low corner, and its complement.
+          const T frac_y = y - row_lo;
+          const T frac_x = x - col_lo;
+          const T rest_y = 1. - frac_y;
+          const T rest_x = 1. - frac_x;
+
+          // save weights and indices
+          entry.pos1 = row_lo * width + col_lo;
+          entry.pos2 = row_lo * width + col_hi;
+          entry.pos3 = row_hi * width + col_lo;
+          entry.pos4 = row_hi * width + col_hi;
+          entry.w1 = rest_y * rest_x;
+          entry.w2 = rest_y * frac_x;
+          entry.w3 = frac_y * rest_x;
+          entry.w4 = frac_y * frac_x;
+        }
+      }
+    }
+  }
+}
+
+// Computes the bilinear weights and the four neighbour indices for the
+// fractional position (y, x) inside a height x width plane.
+//
+// Outputs (by reference):
+//   w1..w4          weights for (y_low, x_low), (y_low, x_high),
+//                   (y_high, x_low), (y_high, x_high) respectively.
+//   x_low .. y_high integer neighbour coordinates; all set to -1 (with zero
+//                   weights) when the position is below -1 or beyond
+//                   height / width.
+template <typename T>
+void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1,
+    T& w2,
+    T& w3,
+    T& w4,
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // Positions too far outside the plane receive no gradient at all.
+  const bool outside = y < -1.0 || y > height || x < -1.0 || x > width;
+  if (outside) {
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  // Clamp the remaining negative coordinates onto the first row/column.
+  y = (y < 0) ? (T)0 : y;
+  x = (x < 0) ? (T)0 : x;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  // On the last row/column both neighbours collapse onto the border cell.
+  const bool last_row = y_low >= height - 1;
+  if (last_row) {
+    y_low = height - 1;
+    y = (T)y_low;
+  }
+  y_high = last_row ? y_low : y_low + 1;
+
+  const bool last_col = x_low >= width - 1;
+  if (last_col) {
+    x_low = width - 1;
+    x = (T)x_low;
+  }
+  x_high = last_col ? x_low : x_low + 1;
+
+  // Fractional distance from the low corner, and its complement.
+  const T frac_y = y - y_low;
+  const T frac_x = x - x_low;
+  const T rest_y = 1. - frac_y;
+  const T rest_x = 1. - frac_x;
+
+  // Same weights the forward interpolation applies to its four samples.
+  w1 = rest_y * rest_x;
+  w2 = rest_y * frac_x;
+  w3 = frac_y * rest_x;
+  w4 = frac_y * frac_x;
+}
+
+// Accumulates `val` into the element `address` points to. This is a plain,
+// non-atomic read-modify-write.
+template <class T>
+inline void add(T* address, const T& val) {
+  T& target = *address;
+  target += val;
+}
+
+} // namespace
+
+// CPU kernel for the ROIAlignRotated forward pass.
+//
+// For every ROI the sample offsets and bilinear weights are computed once
+// (they do not depend on the channel) and then reused for all channels.
+//
+// nthreads:       total number of output elements; the ROI count is derived
+//                 from it by dividing out channels and the pooled size.
+// input:          features, addressed as
+//                 [(batch * channels + c) * height * width + offset].
+// spatial_scale:  factor applied to the ROI center and size.
+// channels, height, width: dimensions used to address `input`.
+// pooled_height, pooled_width: output grid size per ROI and channel.
+// sampling_ratio: samples per bin along each axis; when <= 0 the count is
+//                 derived per ROI as ceil(roi_size / pooled_size).
+// rois:           6 values per ROI: batch index, center x, center y, width,
+//                 height, angle in degrees.
+// output:         written at
+//                 [((n * channels + c) * pooled_height + ph) * pooled_width
+//                  + pw].
+template <typename T>
+void ROIAlignRotatedForward(
+    const int nthreads,
+    const T* input,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    const T* rois,
+    T* output) {
+  int n_rois = nthreads / channels / pooled_width / pooled_height;
+  // (n, c, ph, pw) is an element in the pooled output
+  // can be parallelized using omp
+  // #pragma omp parallel for num_threads(32)
+  for (int n = 0; n < n_rois; n++) {
+    // Flat offset of the first output element belonging to ROI n.
+    int index_n = n * channels * pooled_width * pooled_height;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    // The angle is stored in degrees; convert to radians.
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    // (count is clamped to 1 so the division below is defined even when the
+    // sampling grid is empty.)
+    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // The grid sizes are passed twice: once as loop bounds (iy_upper /
+    // ix_upper) and once as the divisors used to place the samples.
+    pre_calc_for_bilinear_interpolate(
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_start_h,
+        roi_start_w,
+        bin_size_h,
+        bin_size_w,
+        roi_bin_grid_h,
+        roi_bin_grid_w,
+        roi_center_h,
+        roi_center_w,
+        cos_theta,
+        sin_theta,
+        pre_calc);
+
+    for (int c = 0; c < channels; c++) {
+      int index_n_c = index_n + c * pooled_width * pooled_height;
+      const T* offset_input =
+          input + (roi_batch_ind * channels + c) * height * width;
+      // Restart at the first cached sample; the loops below visit the cache
+      // in the same (ph, pw, iy, ix) order in which it was filled.
+      int pre_calc_index = 0;
+
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          int index = index_n_c + ph * pooled_width + pw;
+
+          T output_val = 0.;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              PreCalc<T> pc = pre_calc[pre_calc_index];
+              output_val += pc.w1 * offset_input[pc.pos1] +
+                  pc.w2 * offset_input[pc.pos2] +
+                  pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
+
+              pre_calc_index += 1;
+            }
+          }
+          // Average over all sample points of the bin.
+          output_val /= count;
+
+          output[index] = output_val;
+        } // for pw
+      } // for ph
+    } // for c
+  } // for n
+}
+
+// CPU kernel for the ROIAlignRotated backward pass w.r.t. the input.
+//
+// Every loop iteration takes one element of the pooled gradient, identified
+// by the flat `index` -> (n, c, ph, pw), and scatters it onto the input
+// cells that the forward pass sampled for that bin.
+//
+// nthreads:       number of pooled-gradient elements to process.
+// grad_output:    pooled gradient, addressed through the four strides as
+//                 [n * n_stride + c * c_stride + ph * h_stride +
+//                  pw * w_stride].
+// spatial_scale:  factor applied to the ROI center and size.
+// channels, height, width: dimensions used to address grad_input.
+// pooled_height, pooled_width: size of the pooled grid per ROI and channel.
+// sampling_ratio: samples per bin along each axis; when <= 0 the count is
+//                 derived per ROI as ceil(roi_size / pooled_size).
+// grad_input:     gradient w.r.t. the input; accumulated into, addressed as
+//                 [(batch * channels + c) * height * width + y * width + x].
+// rois:           6 values per ROI: batch index, center x, center y, width,
+//                 height, angle in degrees.
+template <typename T>
+void ROIAlignRotatedBackward(
+    const int nthreads,
+    const T* grad_output,
+    const T& spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio,
+    T* grad_input,
+    const T* rois,
+    const int n_stride,
+    const int c_stride,
+    const int h_stride,
+    const int w_stride) {
+  for (int index = 0; index < nthreads; index++) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6;
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    // The angle is stored in degrees; convert to radians.
+    T theta = current_roi[5] * M_PI / 180.0;
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    AT_ASSERTM(
+        roi_width >= 0 && roi_height >= 0,
+        "ROIs in ROIAlignRotated do not have non-negative size!");
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // Start of the (batch, channel) plane that receives the gradient.
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    // Pooled gradient value belonging to this (n, c, ph, pw) bin.
+    int output_offset = n * n_stride + c * c_stride;
+    const T* offset_grad_output = grad_output + output_offset;
+    const T grad_output_this_bin =
+        offset_grad_output[ph * h_stride + pw * w_stride];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    // (count is only used inside the loops below, which do not run when the
+    // sampling grid is empty.)
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      // Un-rotated sample row, relative to the ROI center.
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        // Share of the bin gradient owed to each of the four neighbours.
+        T g1 = grad_output_this_bin * w1 / count;
+        T g2 = grad_output_this_bin * w2 / count;
+        T g3 = grad_output_this_bin * w3 / count;
+        T g4 = grad_output_this_bin * w4 / count;
+
+        // Indices are -1 when the sample was rejected as out of range.
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          // atomic add is not needed for now since it is single threaded
+          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
+          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
+          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
+          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // for
+} // ROIAlignRotatedBackward
+
+// CPU entry point for the ROIAlignRotated forward pass.
+//
+// input: feature tensor; dimensions 1..3 are read as channels, height and
+//        width.
+// rois:  rotated boxes; must be a CPU tensor with the same scalar type as
+//        `input` (both are checked here). ROIAlignRotatedForward reads 6
+//        values per box.
+// spatial_scale, pooled_height, pooled_width, sampling_ratio: forwarded
+//        unchanged to ROIAlignRotatedForward.
+//
+// Returns a {num_rois, channels, pooled_height, pooled_width} tensor; it is
+// returned as all zeros without running the kernel when it has no elements.
+at::Tensor ROIAlignRotated_forward_cpu(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  // Name reported in type-check failures; it must identify this operator,
+  // not the non-rotated ROIAlign.
+  at::CheckedFrom c = "ROIAlignRotated_forward_cpu";
+  at::checkAllSameType(c, {input_t, rois_t});
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  at::Tensor output = at::zeros(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+  if (output.numel() == 0) {
+    return output;
+  }
+
+  // Keep the contiguous buffers alive for the duration of the kernel call.
+  at::Tensor input_c = input.contiguous();
+  at::Tensor rois_c = rois.contiguous();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        ROIAlignRotatedForward<scalar_t>(
+            output_size,
+            input_c.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois_c.data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  return output;
+}
+
+// CPU entry point for the ROIAlignRotated backward pass.
+//
+// grad:  upstream gradient. ROIAlignRotatedBackward reads it through the
+//        four strides passed below, so it is indexed as a 4-D tensor.
+// rois:  rotated boxes; must be a CPU tensor with the same scalar type as
+//        `grad` (both are checked here).
+// batch_size, channels, height, width: shape of the returned gradient.
+// spatial_scale, pooled_height, pooled_width, sampling_ratio: forwarded
+//        unchanged to ROIAlignRotatedBackward.
+//
+// Returns a zero-initialised {batch_size, channels, height, width} tensor
+// that the kernel accumulates into. An empty `grad` yields all zeros.
+at::Tensor ROIAlignRotated_backward_cpu(
+    const at::Tensor& grad,
+    const at::Tensor& rois,
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor");
+  AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_backward_cpu";
+  at::checkAllSameType(c, {grad_t, rois_t});
+
+  at::Tensor grad_input =
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    return grad_input;
+  }
+
+  // Materialise the contiguous buffers once and keep them alive for the call.
+  // The strides handed to the kernel must describe the same memory as the
+  // data pointer: reading them from `grad` while passing the pointer of
+  // `grad.contiguous()` addresses the wrong elements whenever `grad` is not
+  // contiguous.
+  at::Tensor grad_c = grad.contiguous();
+  at::Tensor rois_c = rois.contiguous();
+
+  // get stride values to ensure indexing into gradients is correct.
+  int n_stride = grad_c.stride(0);
+  int c_stride = grad_c.stride(1);
+  int h_stride = grad_c.stride(2);
+  int w_stride = grad_c.stride(3);
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+        ROIAlignRotatedBackward<scalar_t>(
+            grad_c.numel(),
+            grad_c.data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois_c.data_ptr<scalar_t>(),
+            n_stride,
+            c_stride,
+            h_stride,
+            w_stride);
+      });
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
new file mode 100644
index 0000000..2581cf3
--- /dev/null
+++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu
@@ -0,0 +1,435 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// Grid-stride loop over i in [0, n). TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Note: this implementation originates from the Caffe2 ROIAlignRotated Op
+// and PyTorch ROIAlign (non-rotated) Op implementations.
+// The key difference between this implementation and those ones is
+// we don't do "legacy offset" in this version, as there aren't many previous
+// works, if any, using the "legacy" ROIAlignRotated Op.
+// This would make the interface a bit cleaner.
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+__device__ T bilinear_interpolate(
+    const T* input, // one height x width plane, indexed as y * width + x
+    const int height,
+    const int width,
+    T y,
+    T x) {
+  // Bilinearly samples `input` at the continuous location (y, x).
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // sample lies outside the accepted range [-1, size]: contributes zero
+    return 0;
+  }
+
+  if (y < 0) {
+    y = 0; // samples in [-1, 0) are clamped onto row 0
+  }
+
+  if (x < 0) {
+    x = 0; // samples in [-1, 0) are clamped onto column 0
+  }
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1; // on/after the last row: use it for both
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1; // on/after the last column: use it for both
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low; // fractional offsets from the low corner
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // read the four neighbours and blend them with the bilinear weights
+  T v1 = input[y_low * width + x_low];
+  T v2 = input[y_low * width + x_high];
+  T v3 = input[y_high * width + x_low];
+  T v4 = input[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height,
+    const int width,
+    T y,
+    T x,
+    T& w1, // out: weight of (y_low, x_low)
+    T& w2, // out: weight of (y_low, x_high)
+    T& w3, // out: weight of (y_high, x_low)
+    T& w4, // out: weight of (y_high, x_high)
+    int& x_low,
+    int& x_high,
+    int& y_low,
+    int& y_high) {
+  // Computes the 4 bilinear weights and neighbour indices for sample (y, x).
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // out of range: zero weights and -1 indices mark the sample as unusable
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y < 0) {
+    y = 0; // samples in [-1, 0) are clamped onto row 0
+  }
+
+  if (x < 0) {
+    x = 0; // samples in [-1, 0) are clamped onto column 0
+  }
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1; // on/after the last row: use it for both
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1; // on/after the last column: use it for both
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low; // fractional offsets from the low corner
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+} // namespace
+
+template <typename T>
+__global__ void RoIAlignRotatedForward(
+    const int nthreads, // number of pooled output elements to produce
+    const T* input, // plane (b, c) starts at (b * channels + c) * height * width
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio, // > 0: samples per bin side; else derived per RoI
+    const T* rois, // 6 values per RoI, see current_roi below
+    T* top_data) { // flat output, one value per (n, c, ph, pw)
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6; // [batch idx, ctr x, ctr y, w, h, angle]
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0; // degrees -> radians
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T val = bilinear_interpolate(offset_input, height, width, y, x);
+        output_val += val;
+      }
+    }
+    output_val /= count; // count is clamped to >= 1 above, so never divides by 0
+
+    top_data[index] = output_val;
+  }
+}
+
+template <typename T>
+__global__ void RoIAlignRotatedBackwardFeature(
+    const int nthreads, // number of pooled output elements to process
+    const T* top_diff, // gradient of the pooled output, flat (n, c, ph, pw)
+    const int num_rois, // not read inside this kernel
+    const T spatial_scale,
+    const int channels,
+    const int height,
+    const int width,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio, // > 0: samples per bin side; else derived per RoI
+    T* bottom_diff, // accumulated into with atomicAdd; never cleared here
+    const T* rois) { // 6 values per RoI, see current_roi below
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* current_roi = rois + n * 6; // [batch idx, ctr x, ctr y, w, h, angle]
+    int roi_batch_ind = current_roi[0];
+
+    // Do not use rounding; this implementation detail is critical
+    // ROIAlignRotated supports align == true, i.e., continuous coordinate
+    // by default, thus the 0.5 offset
+    T offset = (T)0.5;
+    T roi_center_w = current_roi[1] * spatial_scale - offset;
+    T roi_center_h = current_roi[2] * spatial_scale - offset;
+    T roi_width = current_roi[3] * spatial_scale;
+    T roi_height = current_roi[4] * spatial_scale;
+    T theta = current_roi[5] * M_PI / 180.0; // degrees -> radians
+    T cos_theta = cos(theta);
+    T sin_theta = sin(theta);
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff =
+        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0)
+        ? sampling_ratio
+        : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w =
+        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
+    // Appropriate translation needs to be applied after.
+    T roi_start_h = -roi_height / 2.0;
+    T roi_start_w = -roi_width / 2.0;
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
+    {
+      const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+        // Rotate by theta around the center and translate
+        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
+        T x = yy * sin_theta + xx * cos_theta + roi_center_w;
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(
+            height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high);
+
+        T g1 = top_diff_this_bin * w1 / count; // this sample's share per corner
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(
+              offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(
+              offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignRotatedBackwardFeature
+
+at::Tensor ROIAlignRotated_forward_cuda(
+    const at::Tensor& input, // sizes 1..3 are read as channels, height, width
+    const at::Tensor& rois, // one RoI per row along dim 0; same dtype as input
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+  at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
+
+  at::CheckedFrom c = "ROIAlignRotated_forward_cuda";
+  at::checkAllSameGPU(c, {input_t, rois_t});
+  at::checkAllSameType(c, {input_t, rois_t});
+  at::cuda::CUDAGuard device_guard(input.device());
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty(
+      {num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv((long)output_size, 512L), 4096L));
+  dim3 block(512);
+
+  if (output.numel() == 0) { // nothing to pool: skip the launch entirely
+    AT_CUDA_CHECK(cudaGetLastError());
+    return output;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "ROIAlignRotated_forward", [&] {
+        RoIAlignRotatedForward<scalar_t><<<grid, block, 0, stream>>>(
+            output_size,
+            input.contiguous().data_ptr<scalar_t>(),
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            rois.contiguous().data_ptr<scalar_t>(),
+            output.data_ptr<scalar_t>());
+      });
+  // No device-wide sync here: the launch is ordered on `stream` like backward.
+  AT_CUDA_CHECK(cudaGetLastError());
+  return output;
+}
+
+// CUDA backward pass; returns a (batch_size, channels, height, width) tensor.
+at::Tensor ROIAlignRotated_backward_cuda(
+    const at::Tensor& grad, // gradient of the pooled output
+    const at::Tensor& rois, // one RoI per row along dim 0; same dtype as grad
+    const float spatial_scale,
+    const int pooled_height,
+    const int pooled_width,
+    const int batch_size,
+    const int channels,
+    const int height,
+    const int width,
+    const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
+  at::CheckedFrom c = "ROIAlignRotated_backward_cuda";
+  at::checkAllSameGPU(c, {grad_t, rois_t});
+  at::checkAllSameType(c, {grad_t, rois_t});
+  at::cuda::CUDAGuard device_guard(grad.device());
+
+  auto num_rois = rois.size(0);
+  auto grad_input = // zero-filled because the kernel only adds into it
+      at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(at::cuda::ATenCeilDiv((long)grad.numel(), 512L), 4096L));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return grad_input;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(
+      grad.scalar_type(), "ROIAlignRotated_backward", [&] {
+        RoIAlignRotatedBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+            grad.numel(),
+            grad.contiguous().data_ptr<scalar_t>(), // kernel indexes it densely
+            num_rois,
+            spatial_scale,
+            channels,
+            height,
+            width,
+            pooled_height,
+            pooled_width,
+            sampling_ratio,
+            grad_input.data_ptr<scalar_t>(),
+            rois.contiguous().data_ptr<scalar_t>());
+      });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return grad_input;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
new file mode 100644
index 0000000..47b85dc
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h
@@ -0,0 +1,35 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+at::Tensor box_iou_rotated_cpu( // pairwise IoU of rotated boxes on the CPU
+    const at::Tensor& boxes1, // one box per row along dim 0
+    const at::Tensor& boxes2); // result: boxes1.size(0) x boxes2.size(0), float
+
+#ifdef WITH_CUDA // the CUDA entry point only exists in CUDA builds
+at::Tensor box_iou_rotated_cuda( // same contract as the CPU version
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2);
+#endif
+
+// Interface for Python: checks devices, passes contiguous boxes to CPU/CUDA.
+// inline avoids multiple definitions when several .cpp files include this.
+inline at::Tensor box_iou_rotated(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2) {
+  // AT_ASSERTM rather than assert(): the check must survive NDEBUG builds.
+  AT_ASSERTM(boxes1.device().is_cuda() == boxes2.device().is_cuda(),
+      "boxes1 and boxes2 must both be CPU or both be CUDA tensors");
+  if (boxes1.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous());
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous());
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
new file mode 100644
index 0000000..a6aaa81
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp
@@ -0,0 +1,46 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "box_iou_rotated.h"
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+template <typename T>
+void box_iou_rotated_cpu_kernel(
+    const at::Tensor& boxes1,
+    const at::Tensor& boxes2,
+    at::Tensor& ious) {
+  // Writes IoU(boxes1[i], boxes2[j]) to ious[i * num_boxes2 + j] for all i, j.
+  // Each row is passed to single_box_iou_rotated as a raw T pointer, which
+  // reads five consecutive values (x_ctr, y_ctr, w, h, angle), so rows must
+  // be densely stored with dtype T.
+  //
+  // No per-box width/height/area tensors are built here: they were never
+  // read, and single_box_iou_rotated derives the areas it needs itself.
+
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+
+  for (int i = 0; i < num_boxes1; i++) {
+    for (int j = 0; j < num_boxes2; j++) {
+      ious[i * num_boxes2 + j] = single_box_iou_rotated<T>(
+          boxes1[i].data_ptr<T>(), boxes2[j].data_ptr<T>());
+    }
+  }
+}
+
+at::Tensor box_iou_rotated_cpu( // pairwise IoU matrix of two box sets
+    const at::Tensor& boxes1, // one box per row along dim 0
+    const at::Tensor& boxes2) {
+  auto num_boxes1 = boxes1.size(0);
+  auto num_boxes2 = boxes2.size(0);
+  at::Tensor ious = // flat buffer, always float, on boxes1's device
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  box_iou_rotated_cpu_kernel<float>(boxes1, boxes2, ious); // float rows only
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  return ious.reshape(shape);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
new file mode 100644
index 0000000..79448e3
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
@@ -0,0 +1,103 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "box_iou_rotated_utils.h"
+
+namespace detectron2 {
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32; // boxes1 rows handled per block
+const int BLOCK_DIM_Y = 16; // boxes2 rows handled per block
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+    const int n_boxes1,
+    const int n_boxes2,
+    const T* dev_boxes1, // 5 consecutive values per box
+    const T* dev_boxes2, // 5 consecutive values per box
+    T* dev_ious) { // written at row * n_boxes2 + col
+  const int row_start = blockIdx.x * blockDim.x;
+  const int col_start = blockIdx.y * blockDim.y;
+
+  const int row_size = min(n_boxes1 - row_start, blockDim.x); // tail blocks are partial
+  const int col_size = min(n_boxes2 - col_start, blockDim.y);
+
+  __shared__ float block_boxes1[BLOCK_DIM_X * 5]; // this block's boxes1 tile
+  __shared__ float block_boxes2[BLOCK_DIM_Y * 5]; // this block's boxes2 tile
+
+  // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
+  if (threadIdx.x < row_size && threadIdx.y == 0) {
+    block_boxes1[threadIdx.x * 5 + 0] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 0];
+    block_boxes1[threadIdx.x * 5 + 1] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 1];
+    block_boxes1[threadIdx.x * 5 + 2] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 2];
+    block_boxes1[threadIdx.x * 5 + 3] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 3];
+    block_boxes1[threadIdx.x * 5 + 4] =
+        dev_boxes1[(row_start + threadIdx.x) * 5 + 4];
+  }
+
+  if (threadIdx.x < col_size && threadIdx.y == 0) {
+    block_boxes2[threadIdx.x * 5 + 0] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 0];
+    block_boxes2[threadIdx.x * 5 + 1] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 1];
+    block_boxes2[threadIdx.x * 5 + 2] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 2];
+    block_boxes2[threadIdx.x * 5 + 3] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 3];
+    block_boxes2[threadIdx.x * 5 + 4] =
+        dev_boxes2[(col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads(); // both tiles must be fully loaded before any thread reads them
+
+  if (threadIdx.x < row_size && threadIdx.y < col_size) {
+    int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y;
+    dev_ious[offset] = single_box_iou_rotated<T>(
+        block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
+  }
+}
+
+at::Tensor box_iou_rotated_cuda( // pairwise IoU matrix, computed on the GPU
+    const at::Tensor& boxes1, // float CUDA tensor, one box per row along dim 0
+    const at::Tensor& boxes2) {
+  using scalar_t = float;
+  AT_ASSERTM(boxes1.device().is_cuda(), "boxes1 must be a CUDA tensor");
+  AT_ASSERTM(boxes2.device().is_cuda(), "boxes2 must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(boxes1.device());
+
+  int num_boxes1 = boxes1.size(0);
+  int num_boxes2 = boxes2.size(0);
+
+  at::Tensor ious =
+      at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat));
+
+  if (num_boxes1 > 0 && num_boxes2 > 0) { // an empty side means no launch
+    const int blocks_x = at::cuda::ATenCeilDiv(num_boxes1, BLOCK_DIM_X);
+    const int blocks_y = at::cuda::ATenCeilDiv(num_boxes2, BLOCK_DIM_Y);
+
+    dim3 blocks(blocks_x, blocks_y);
+    dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+
+    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+    // The kernel reads 5 consecutive values per box, so pass dense storage.
+    box_iou_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+        num_boxes1,
+        num_boxes2,
+        boxes1.contiguous().data_ptr<scalar_t>(),
+        boxes2.contiguous().data_ptr<scalar_t>(),
+        (scalar_t*)ious.data_ptr<scalar_t>());
+
+    AT_CUDA_CHECK(cudaGetLastError());
+  }
+
+  // reshape from 1d array to 2d array
+  auto shape = std::vector<int64_t>{num_boxes1, num_boxes2};
+  return ious.reshape(shape);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
new file mode 100644
index 0000000..0602b8c
--- /dev/null
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
@@ -0,0 +1,342 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+
+#include <cassert>
+#include <cmath>
+
+#ifdef __CUDACC__
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace detectron2 {
+
+namespace {
+
+template <typename T>
+struct RotatedBox { // rectangle given by its center, size and rotation
+  T x_ctr, y_ctr, w, h, a; // a is in degrees (converted in get_rotated_vertices)
+};
+
+template <typename T>
+struct Point { // 2D point / vector with the few operators the IoU code needs
+  T x, y;
+  HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+  HOST_DEVICE_INLINE Point operator+(const Point& p) const { // component-wise sum
+    return Point(x + p.x, y + p.y);
+  }
+  HOST_DEVICE_INLINE Point& operator+=(const Point& p) { // in-place sum
+    x += p.x;
+    y += p.y;
+    return *this;
+  }
+  HOST_DEVICE_INLINE Point operator-(const Point& p) const { // component-wise difference
+    return Point(x - p.x, y - p.y);
+  }
+  HOST_DEVICE_INLINE Point operator*(const T coeff) const { // scale by a scalar
+    return Point(x * coeff, y * coeff);
+  }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+  return A.x * B.x + A.y * B.y; // dot product; dot_2d(A, A) is |A|^2
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {
+  return A.x * B.y - B.x * A.y; // z-component of A x B; 0 when A and B are parallel
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(
+    const RotatedBox<T>& box,
+    Point<T> (&pts)[4]) { // out: the four corners of `box`
+  // degrees -> radians: M_PI / 180. == 0.01745329251
+  double theta = box.a * 0.01745329251;
+  T cosTheta2 = (T)cos(theta) * 0.5f; // pre-halved: used with full w and h below
+  T sinTheta2 = (T)sin(theta) * 0.5f;
+
+  // y: top --> down; x: left --> right
+  pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
+  pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
+  pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+  pts[2].x = 2 * box.x_ctr - pts[0].x; // pts[2] mirrors pts[0] through the center
+  pts[2].y = 2 * box.y_ctr - pts[0].y;
+  pts[3].x = 2 * box.x_ctr - pts[1].x; // pts[3] mirrors pts[1] through the center
+  pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(
+    const Point<T> (&pts1)[4], // corners of rectangle 1
+    const Point<T> (&pts2)[4], // corners of rectangle 2
+    Point<T> (&intersections)[24]) { // out: candidates; returns how many were written
+  // Line vector
+  // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+  for (int i = 0; i < 4; i++) {
+    vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+    vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+  }
+
+  // Line test - test all line combos for intersection
+  int num = 0; // number of intersections
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      // Solve for 2x2 Ax=b
+      T det = cross_2d<T>(vec2[j], vec1[i]);
+
+      // This takes care of parallel lines
+      if (fabs(det) <= 1e-14) {
+        continue;
+      }
+
+      auto vec12 = pts2[j] - pts1[i];
+
+      T t1 = cross_2d<T>(vec2[j], vec12) / det;
+      T t2 = cross_2d<T>(vec1[i], vec12) / det;
+
+      if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { // hit lies on both edges
+        intersections[num++] = pts1[i] + vec1[i] * t1;
+      }
+    }
+  }
+
+  // Check for vertices of rect1 inside rect2
+  {
+    const auto& AB = vec2[0];
+    const auto& DA = vec2[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      // assume ABCD is the rectangle, and P is the point to be judged
+      // P is inside ABCD iff. P's projection on AB lies within AB
+      // and P's projection on AD lies within AD
+
+      auto AP = pts1[i] - pts2[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA); // AD = -DA, hence the sign flip
+
+      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+          (APdotAD <= ADdotAD)) {
+        intersections[num++] = pts1[i];
+      }
+    }
+  }
+
+  // Reverse the check - check for vertices of rect2 inside rect1
+  {
+    const auto& AB = vec1[0];
+    const auto& DA = vec1[3];
+    auto ABdotAB = dot_2d<T>(AB, AB);
+    auto ADdotAD = dot_2d<T>(DA, DA);
+    for (int i = 0; i < 4; i++) {
+      auto AP = pts2[i] - pts1[0];
+
+      auto APdotAB = dot_2d<T>(AP, AB);
+      auto APdotAD = -dot_2d<T>(AP, DA); // AD = -DA, hence the sign flip
+
+      if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+          (APdotAD <= ADdotAD)) {
+        intersections[num++] = pts2[i];
+      }
+    }
+  }
+
+  return num; // may include duplicates; at most 16 + 4 + 4 = 24 entries
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(
+    const Point<T> (&p)[24], // input points; only the first num_in are read
+    const int& num_in,
+    Point<T> (&q)[24], // out: hull vertices in q[0 .. return value)
+    bool shift_to_zero = false) { // true: leave the hull shifted so start == origin
+  assert(num_in >= 2);
+
+  // Step 1:
+  // Find point with minimum y
+  // if more than 1 points have the same minimum y,
+  // pick the one with the minimum x.
+  int t = 0;
+  for (int i = 1; i < num_in; i++) {
+    if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+      t = i;
+    }
+  }
+  auto& start = p[t]; // starting point
+
+  // Step 2:
+  // Subtract starting point from every points (for sorting in the next step)
+  for (int i = 0; i < num_in; i++) {
+    q[i] = p[i] - start;
+  }
+
+  // Swap the starting point to position 0
+  auto tmp = q[0];
+  q[0] = q[t];
+  q[t] = tmp;
+
+  // Step 3:
+  // Sort point 1 ~ num_in according to their relative cross-product values
+  // (essentially sorting according to angles)
+  // If the angles are the same, sort according to their distance to origin
+#ifdef __CUDACC__
+  // CUDA version
+  // In the future, we can potentially use thrust
+  // for sorting here to improve speed (though not guaranteed)
+  // dist caches |q[i]|^2 and is swapped together with q while sorting.
+  T dist[24];
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+  for (int i = 1; i < num_in - 1; i++) {
+    for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+      if ((crossProduct < -1e-6) ||
+          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+        auto q_tmp = q[i];
+        q[i] = q[j];
+        q[j] = q_tmp;
+        auto dist_tmp = dist[i];
+        dist[i] = dist[j];
+        dist[j] = dist_tmp;
+      }
+    }
+  }
+#else
+  // CPU version
+  std::sort(
+      q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
+        T temp = cross_2d<T>(A, B);
+        if (fabs(temp) < 1e-6) {
+          return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+        } else {
+          return temp > 0;
+        }
+      });
+#endif
+
+  // Step 4:
+  // Make sure there are at least 2 points (that don't overlap with each other)
+  // in the stack. Distances are taken from the sorted q, not a pre-sort cache.
+  int k; // index of the non-overlapped second point
+  for (k = 1; k < num_in; k++) {
+    if (dot_2d<T>(q[k], q[k]) > 1e-8) {
+      break;
+    }
+  }
+  if (k == num_in) {
+    // We reach the end, which means the convex hull is just one point
+    q[0] = p[t];
+    return 1;
+  }
+  q[1] = q[k];
+  int m = 2; // 2 points in the stack
+  // Step 5:
+  // Finally we can start the scanning process.
+  // When a non-convex relationship between the 3 points is found
+  // (either concave shape or duplicated points),
+  // we pop the previous point from the stack
+  // until the 3-point relationship is convex again, or
+  // until the stack only contains two points
+  for (int i = k + 1; i < num_in; i++) {
+    while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
+      m--;
+    }
+    q[m++] = q[i];
+  }
+
+  // Step 6 (Optional):
+  // In general sense we need the original coordinates, so we
+  // need to shift the points back (reverting Step 2)
+  // But if we're only interested in getting the area/perimeter of the shape
+  // We can simply return.
+  if (!shift_to_zero) {
+    for (int i = 0; i < m; i++) {
+      q[i] += start;
+    }
+  }
+
+  return m; // number of hull vertices stored in q
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+  if (m <= 2) { // fewer than 3 vertices enclose no area
+    return 0;
+  }
+
+  T area = 0; // accumulates twice the area of each triangle (q[0], q[i], q[i+1])
+  for (int i = 1; i < m - 1; i++) {
+    area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
+  }
+
+  return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection( // area shared by the two boxes
+    const RotatedBox<T>& box1,
+    const RotatedBox<T>& box2) {
+  // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+  // from get_intersection_points
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+  get_rotated_vertices<T>(box1, pts1);
+  get_rotated_vertices<T>(box2, pts2);
+
+  int num = get_intersection_points<T>(pts1, pts2, intersectPts);
+
+  if (num <= 2) { // at most a point or a segment in common: zero area
+    return 0.0;
+  }
+
+  // Convex Hull to order the intersection points in clockwise order and find
+  // the contour area.
+  int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
+  return polygon_area<T>(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T
+single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) {
+  // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2; // each raw box is [x_ctr, y_ctr, w, h, angle]
+  auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+  auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+  box1.x_ctr = box1_raw[0] - center_shift_x;
+  box1.y_ctr = box1_raw[1] - center_shift_y;
+  box1.w = box1_raw[2];
+  box1.h = box1_raw[3];
+  box1.a = box1_raw[4];
+  box2.x_ctr = box2_raw[0] - center_shift_x;
+  box2.y_ctr = box2_raw[1] - center_shift_y;
+  box2.w = box2_raw[2];
+  box2.h = box2_raw[3];
+  box2.a = box2_raw[4];
+
+  const T area1 = box1.w * box1.h;
+  const T area2 = box2.w * box2.h;
+  if (area1 < 1e-14 || area2 < 1e-14) { // a (near-)empty box: IoU defined as 0
+    return 0.f;
+  }
+
+  const T intersection = rotated_boxes_intersection<T>(box1, box2);
+  const T iou = intersection / (area1 + area2 - intersection); // inter / union
+  return iou;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/cuda_version.cu b/detectron2/layers/csrc/cuda_version.cu
new file mode 100644
index 0000000..e0bd8bb
--- /dev/null
+++ b/detectron2/layers/csrc/cuda_version.cu
@@ -0,0 +1,7 @@
+#include <cuda_runtime_api.h>
+
+namespace detectron2 {
+int get_cudart_version() { // CUDA runtime version this file was compiled against
+  return CUDART_VERSION; // macro from <cuda_runtime_api.h>, fixed at build time
+}
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv.h b/detectron2/layers/csrc/deformable/deform_conv.h
new file mode 100644
index 0000000..c9edd05
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv.h
@@ -0,0 +1,377 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+#ifdef WITH_CUDA
+// CUDA entry points. They are only declared when the extension is built with
+// WITH_CUDA; the inline wrappers later in this header forward to them after
+// checking that the relevant tensors live on the GPU.
+
+// Forward pass of deformable convolution (called by deform_conv_forward).
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+// Backward pass w.r.t. the input and the offsets (called by
+// deform_conv_backward_input).
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step);
+
+// Backward pass w.r.t. the weight (called by deform_conv_backward_filter).
+// `scale` is forwarded unchanged by that wrapper.
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step);
+
+// Forward pass of the modulated (masked) variant (called by
+// modulated_deform_conv_forward). Note the (h, w) argument order here, as
+// opposed to (W, H) in the non-modulated functions above.
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias);
+
+// Backward pass of the modulated variant (called by
+// modulated_deform_conv_backward).
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias);
+
+#endif
+
+// Device dispatcher for the deformable convolution forward pass.
+//
+// Only a CUDA implementation exists: a non-CUDA `input` raises
+// "Not implemented on the CPU", and a CUDA `input` in a build without
+// WITH_CUDA raises "Not compiled with GPU support". Otherwise `weight` and
+// `offset` must be CUDA tensors too, and the result of
+// deform_conv_forward_cuda is returned.
+inline int deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  // Guard clause: everything below is the GPU path.
+  if (!input.type().is_cuda()) {
+    AT_ERROR("Not implemented on the CPU");
+  }
+#ifdef WITH_CUDA
+  AT_CHECK(weight.type().is_cuda(), "weight tensor is not on GPU!");
+  AT_CHECK(offset.type().is_cuda(), "offset tensor is not on GPU!");
+  return deform_conv_forward_cuda(
+      input,
+      weight,
+      offset,
+      output,
+      columns,
+      ones,
+      kW,
+      kH,
+      dW,
+      dH,
+      padW,
+      padH,
+      dilationW,
+      dilationH,
+      group,
+      deformable_group,
+      im2col_step);
+#else
+  AT_ERROR("Not compiled with GPU support");
+#endif
+}
+
+// Device dispatcher for the backward pass w.r.t. input and offsets.
+//
+// Dispatch is decided by the device of `gradOutput`. On the GPU path `input`,
+// `weight` and `offset` must be CUDA tensors as well; the result of
+// deform_conv_backward_input_cuda is returned. Any other configuration raises
+// via AT_ERROR.
+inline int deform_conv_backward_input(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  // Guard clause: there is no CPU implementation.
+  if (!gradOutput.type().is_cuda()) {
+    AT_ERROR("Not implemented on the CPU");
+  }
+#ifdef WITH_CUDA
+  AT_CHECK(input.type().is_cuda(), "input tensor is not on GPU!");
+  AT_CHECK(weight.type().is_cuda(), "weight tensor is not on GPU!");
+  AT_CHECK(offset.type().is_cuda(), "offset tensor is not on GPU!");
+  return deform_conv_backward_input_cuda(
+      input,
+      offset,
+      gradOutput,
+      gradInput,
+      gradOffset,
+      weight,
+      columns,
+      kW,
+      kH,
+      dW,
+      dH,
+      padW,
+      padH,
+      dilationW,
+      dilationH,
+      group,
+      deformable_group,
+      im2col_step);
+#else
+  AT_ERROR("Not compiled with GPU support");
+#endif
+}
+
+// Device dispatcher for the backward pass w.r.t. the convolution weight.
+//
+// Dispatch is decided by the device of `gradOutput`. On the GPU path `input`
+// and `offset` must be CUDA tensors as well; the call is forwarded to
+// deform_conv_backward_parameters_cuda and its result returned. Any other
+// configuration raises via AT_ERROR.
+inline int deform_conv_backward_filter(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  // Guard clause: there is no CPU implementation.
+  if (!gradOutput.type().is_cuda()) {
+    AT_ERROR("Not implemented on the CPU");
+  }
+#ifdef WITH_CUDA
+  AT_CHECK(input.type().is_cuda(), "input tensor is not on GPU!");
+  AT_CHECK(offset.type().is_cuda(), "offset tensor is not on GPU!");
+  return deform_conv_backward_parameters_cuda(
+      input,
+      offset,
+      gradOutput,
+      gradWeight,
+      columns,
+      ones,
+      kW,
+      kH,
+      dW,
+      dH,
+      padW,
+      padH,
+      dilationW,
+      dilationH,
+      group,
+      deformable_group,
+      scale,
+      im2col_step);
+#else
+  AT_ERROR("Not compiled with GPU support");
+#endif
+}
+
+// Device dispatcher for the modulated deformable convolution forward pass.
+//
+// Dispatch is decided by the device of `input`. On the GPU path `weight`,
+// `bias` and `offset` must be CUDA tensors as well; the call is forwarded to
+// modulated_deform_conv_cuda_forward. Any other configuration raises via
+// AT_ERROR.
+inline void modulated_deform_conv_forward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor output,
+    at::Tensor columns,
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  // Guard clause: there is no CPU implementation.
+  if (!input.type().is_cuda()) {
+    AT_ERROR("Not implemented on the CPU");
+  }
+#ifdef WITH_CUDA
+  AT_CHECK(weight.type().is_cuda(), "weight tensor is not on GPU!");
+  AT_CHECK(bias.type().is_cuda(), "bias tensor is not on GPU!");
+  AT_CHECK(offset.type().is_cuda(), "offset tensor is not on GPU!");
+  modulated_deform_conv_cuda_forward(
+      input,
+      weight,
+      bias,
+      ones,
+      offset,
+      mask,
+      output,
+      columns,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_h,
+      pad_w,
+      dilation_h,
+      dilation_w,
+      group,
+      deformable_group,
+      with_bias);
+#else
+  AT_ERROR("Not compiled with GPU support");
+#endif
+}
+
+// Device dispatcher for the modulated deformable convolution backward pass.
+//
+// Dispatch is decided by the device of `grad_output`. On the GPU path
+// `input`, `weight`, `bias` and `offset` must be CUDA tensors as well; the
+// call is forwarded to modulated_deform_conv_cuda_backward. Any other
+// configuration raises via AT_ERROR.
+inline void modulated_deform_conv_backward(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor bias,
+    at::Tensor ones,
+    at::Tensor offset,
+    at::Tensor mask,
+    at::Tensor columns,
+    at::Tensor grad_input,
+    at::Tensor grad_weight,
+    at::Tensor grad_bias,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask,
+    at::Tensor grad_output,
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  // Guard clause: there is no CPU implementation.
+  if (!grad_output.type().is_cuda()) {
+    AT_ERROR("Not implemented on the CPU");
+  }
+#ifdef WITH_CUDA
+  AT_CHECK(input.type().is_cuda(), "input tensor is not on GPU!");
+  AT_CHECK(weight.type().is_cuda(), "weight tensor is not on GPU!");
+  AT_CHECK(bias.type().is_cuda(), "bias tensor is not on GPU!");
+  AT_CHECK(offset.type().is_cuda(), "offset tensor is not on GPU!");
+  modulated_deform_conv_cuda_backward(
+      input,
+      weight,
+      bias,
+      ones,
+      offset,
+      mask,
+      columns,
+      grad_input,
+      grad_weight,
+      grad_bias,
+      grad_offset,
+      grad_mask,
+      grad_output,
+      kernel_h,
+      kernel_w,
+      stride_h,
+      stride_w,
+      pad_h,
+      pad_w,
+      dilation_h,
+      dilation_w,
+      group,
+      deformable_group,
+      with_bias);
+#else
+  AT_ERROR("Not compiled with GPU support");
+#endif
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv_cuda.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
new file mode 100644
index 0000000..aea311b
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu
@@ -0,0 +1,1130 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp
+// Original license: Apache 2.0
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+// Original license: Apache 2.0
+
+#include <torch/types.h>
+
+#include "deform_conv.h"
+
+#include <cmath>
+#include <vector>
+
+namespace detectron2 {
+
+// Forward declarations of helpers that are defined outside this file
+// (presumably in the companion CUDA kernel source -- confirm).
+
+// Called by the forward and weight-gradient passes below with one chunk of
+// `parallel_imgs` images; those callers pass their `columns` buffer as
+// `data_col` and read it afterwards.
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col);
+
+// Called by the input-gradient pass below with the `columns` buffer as
+// `data_col` and a slice of gradInput as `grad_im`.
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+// Called by the input-gradient pass below with the `columns` buffer as
+// `data_col` and a slice of gradOffset as `grad_offset`.
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset);
+
+// Forward declarations of the modulated (masked) helpers; their definitions
+// are outside this file (presumably in the companion CUDA kernel source --
+// confirm). Parameter names in a declaration are documentation only, so the
+// misspelled `kenerl_w` is corrected to `kernel_w` here without affecting the
+// definitions or any caller.
+
+// im2col for the modulated variant; `data_col` is the only non-const tensor.
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col);
+
+// col2im for the modulated variant; `grad_im` is the only non-const tensor.
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im);
+
+// Coordinate/mask gradient helper for the modulated variant; `grad_offset`
+// and `grad_mask` are the non-const tensors.
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask);
+
+// Validates tensor shapes and hyper-parameters shared by the deformable
+// convolution entry points in this file.
+//
+// `input` and `offset` must both be 3-D (C, H, W) or both 4-D (N, C, H, W);
+// `weight` must be a contiguous 4-D tensor whose last two sizes equal
+// (kH, kW). `gradOutput` may be NULL (forward pass), in which case the
+// gradOutput checks are skipped. Every violated condition raises through
+// AT_CHECK; nothing is returned.
+void shape_check(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor* gradOutput,
+    at::Tensor weight,
+    int kH,
+    int kW,
+    int dH,
+    int dW,
+    int padH,
+    int padW,
+    int dilationH,
+    int dilationW,
+    int group,
+    int deformable_group) {
+  // NOTE: AT_CHECK concatenates its trailing arguments into the message; it
+  // does not interpret printf-style placeholders. Values are therefore passed
+  // as separate arguments instead of being referenced through "%d"/"%s".
+  AT_CHECK(
+      weight.ndimension() == 4,
+      "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+      "but got: ",
+      weight.ndimension());
+
+  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  AT_CHECK(
+      kW > 0 && kH > 0,
+      "kernel size should be greater than zero, but got kH: ",
+      kH,
+      " kW: ",
+      kW);
+
+  AT_CHECK(
+      (weight.size(2) == kH && weight.size(3) == kW),
+      "kernel size should be consistent with weight, but got kH: ",
+      kH,
+      " kW: ",
+      kW,
+      " weight.size(2): ",
+      weight.size(2),
+      ", weight.size(3): ",
+      weight.size(3));
+
+  AT_CHECK(
+      dW > 0 && dH > 0,
+      "stride should be greater than zero, but got dH: ",
+      dH,
+      " dW: ",
+      dW);
+
+  AT_CHECK(
+      dilationW > 0 && dilationH > 0,
+      "dilation should be greater than 0, but got dilationH: ",
+      dilationH,
+      " dilationW: ",
+      dilationW);
+
+  // Guards the multiplication / modulo by these values further down.
+  AT_CHECK(
+      group > 0 && deformable_group > 0,
+      "group and deformable_group should be greater than zero, but got "
+      "group: ",
+      group,
+      " deformable_group: ",
+      deformable_group);
+
+  int ndim = input.ndimension();
+  AT_CHECK(
+      ndim == 3 || ndim == 4,
+      "3D or 4D input tensor expected but got: ",
+      ndim);
+  AT_CHECK(
+      offset.ndimension() == ndim,
+      "offset must have the same number of dimensions as input, expected: ",
+      ndim,
+      ", but got: ",
+      offset.ndimension());
+
+  // Indices of the channel / height / width axes; shifted by one when a
+  // leading batch axis is present.
+  const int batchAxes = (ndim == 4) ? 1 : 0;
+  const int dimf = 0 + batchAxes;
+  const int dimh = 1 + batchAxes;
+  const int dimw = 2 + batchAxes;
+
+  long nInputPlane = weight.size(1) * group;
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  AT_CHECK(
+      nInputPlane % deformable_group == 0,
+      "input channels must be divisible by deformable group size");
+
+  AT_CHECK(
+      outputWidth >= 1 && outputHeight >= 1,
+      "Given input size: (",
+      nInputPlane,
+      " x ",
+      inputHeight,
+      " x ",
+      inputWidth,
+      "). Calculated output size: (",
+      nOutputPlane,
+      " x ",
+      outputHeight,
+      " x ",
+      outputWidth,
+      "). Output size is too small");
+
+  // Use the channel axis (dimf), not a hard-coded 1, so that unbatched 3-D
+  // inputs are compared against their channel count rather than their height.
+  AT_CHECK(
+      input.size(dimf) == nInputPlane,
+      "invalid number of input planes, expected: ",
+      nInputPlane,
+      ", but got: ",
+      input.size(dimf));
+
+  AT_CHECK(
+      (inputHeight >= kH && inputWidth >= kW),
+      "input image is smaller than kernel");
+
+  AT_CHECK(
+      (offset.size(dimh) == outputHeight && offset.size(dimw) == outputWidth),
+      "invalid spatial size of offset, expected height: ",
+      outputHeight,
+      " width: ",
+      outputWidth,
+      ", but got height: ",
+      offset.size(dimh),
+      " width: ",
+      offset.size(dimw));
+
+  AT_CHECK(
+      (offset.size(dimf) == deformable_group * 2 * kH * kW),
+      "invalid number of channels of offset");
+
+  if (gradOutput != NULL) {
+    AT_CHECK(
+        gradOutput->size(dimf) == nOutputPlane,
+        "invalid number of gradOutput planes, expected: ",
+        nOutputPlane,
+        ", but got: ",
+        gradOutput->size(dimf));
+
+    AT_CHECK(
+        (gradOutput->size(dimh) == outputHeight &&
+         gradOutput->size(dimw) == outputWidth),
+        "invalid size of gradOutput, expected height: ",
+        outputHeight,
+        " width: ",
+        outputWidth,
+        " , but got height: ",
+        gradOutput->size(dimh),
+        " width: ",
+        gradOutput->size(dimw));
+  }
+}
+
+// Forward pass of deformable convolution on the GPU.
+//
+// The batch is processed in chunks of `im2col_step` images. Each chunk is
+// unfolded by deformable_im2col() into a column buffer and multiplied, group
+// by group, with the flattened weight; the result is finally copied into
+// `output`. The incoming `columns` handle is replaced by a freshly allocated
+// buffer and `ones` is not needed by this implementation (both are kept in
+// the signature for API compatibility).
+//
+// Returns 1 on success; invalid arguments raise through AT_CHECK.
+int deform_conv_forward_cuda(
+    at::Tensor input,
+    at::Tensor weight,
+    at::Tensor offset,
+    at::Tensor output,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  shape_check(
+      input,
+      offset,
+      NULL,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Promote an unbatched sample to a batch of one. This is done out of
+    // place: contiguous() may hand back the caller's own tensor, and an
+    // in-place unsqueeze_ would then change the caller's tensor shape.
+    input = input.unsqueeze(0);
+    offset = offset.unsqueeze(0);
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  // The batch is split into batchSize / im2col_step equal chunks below.
+  AT_CHECK(
+      im2col_step > 0 && batchSize % im2col_step == 0,
+      "batch size (",
+      batchSize,
+      ") must be divisible by a positive im2col_step, but got im2col_step: ",
+      im2col_step);
+
+  // View of the caller's output storage, split into chunks.
+  output = output.view({batchSize / im2col_step,
+                        im2col_step,
+                        nOutputPlane,
+                        outputHeight,
+                        outputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  at::Tensor output_buffer = at::zeros(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step * outputHeight,
+       outputWidth},
+      output.options());
+
+  output_buffer = output_buffer.view({output_buffer.size(0),
+                                      group,
+                                      output_buffer.size(1) / group,
+                                      output_buffer.size(2),
+                                      output_buffer.size(3)});
+
+  // Split the weight into groups once, outside the chunk loop. Re-viewing
+  // `weight` and `columns` themselves on every iteration (without restoring
+  // their shape) made the second chunk fail whenever
+  // batchSize > im2col_step, so local grouped views are used instead.
+  at::Tensor weight_g = weight.view({group,
+                                     weight.size(0) / group,
+                                     weight.size(1),
+                                     weight.size(2),
+                                     weight.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    // Grouped view sharing storage with `columns`, which itself stays 2-D
+    // for the next chunk.
+    at::Tensor columns_g =
+        columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight_g[g].flatten(1), columns_g[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+  }
+
+  // Merge the group axes back, expose the im2col_step axis and move it in
+  // front of the channel axis so the buffer matches the layout of `output`.
+  output_buffer =
+      output_buffer.view({output_buffer.size(0),
+                          output_buffer.size(1) * output_buffer.size(2),
+                          output_buffer.size(3),
+                          output_buffer.size(4)});
+
+  output_buffer = output_buffer.view({batchSize / im2col_step,
+                                      nOutputPlane,
+                                      im2col_step,
+                                      outputHeight,
+                                      outputWidth});
+  output_buffer.transpose_(1, 2);
+  output.copy_(output_buffer);
+
+  // No reshaping back is needed: all tensor parameters are by-value handles,
+  // so re-viewing them here would not be visible to the caller.
+  return 1;
+}
+
+// Backward pass of deformable convolution w.r.t. the input and the offsets.
+//
+// The batch is processed in chunks of `im2col_step` images. For each chunk
+// the column gradient is computed per group as weight^T * gradOutput and then
+// passed to deformable_col2im_coord() (with a slice of `gradOffset`) and
+// deformable_col2im() (with a slice of `gradInput`). The incoming `columns`
+// handle is replaced by a freshly allocated buffer.
+//
+// Returns 1 on success; invalid arguments raise through AT_CHECK.
+int deform_conv_backward_input_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradInput,
+    at::Tensor gradOffset,
+    at::Tensor weight,
+    at::Tensor columns,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    int im2col_step) {
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      weight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Promote unbatched tensors to a batch of one.
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  // The batch is split into batchSize / im2col_step equal chunks below.
+  AT_CHECK(
+      im2col_step > 0 && batchSize % im2col_step == 0,
+      "batch size (",
+      batchSize,
+      ") must be divisible by a positive im2col_step, but got im2col_step: ",
+      im2col_step);
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output:
+  // (chunk, step, O, H, W) -> (chunk, O, step, H, W)
+  gradOutput = gradOutput.view({batchSize / im2col_step,
+                                im2col_step,
+                                nOutputPlane,
+                                outputHeight,
+                                outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view({batchSize / im2col_step,
+                              im2col_step,
+                              nInputPlane,
+                              inputHeight,
+                              inputWidth});
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  gradOffset = gradOffset.view({batchSize / im2col_step,
+                                im2col_step,
+                                deformable_group * 2 * kH * kW,
+                                outputHeight,
+                                outputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  // Grouped views are created once, outside the chunk loop. Re-viewing
+  // `weight` itself on every iteration (without restoring its 4-D shape)
+  // made the second chunk fail whenever batchSize > im2col_step.
+  at::Tensor weight_g = weight.view({group,
+                                     weight.size(0) / group,
+                                     weight.size(1),
+                                     weight.size(2),
+                                     weight.size(3)});
+  at::Tensor gradOutput_g = gradOutput.view({gradOutput.size(0),
+                                             group,
+                                             gradOutput.size(1) / group,
+                                             gradOutput.size(2),
+                                             gradOutput.size(3),
+                                             gradOutput.size(4)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // Grouped view sharing storage with the 2-D `columns` buffer that is
+    // handed to the col2im helpers below.
+    at::Tensor columns_g =
+        columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 0, alpha = 1: overwrite the slice with weight^T * gradOutput.
+      columns_g[g] = columns_g[g].addmm_(
+          weight_g[g].flatten(1).transpose(0, 1),
+          gradOutput_g[elt][g].flatten(1),
+          0.0f,
+          1.0f);
+    }
+
+    deformable_col2im_coord(
+        columns,
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradOffset[elt]);
+
+    deformable_col2im(
+        columns,
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        gradInput[elt]);
+  }
+
+  // No reshaping back is needed: all tensor parameters are by-value handles,
+  // so re-viewing them here would not be visible to the caller. (The former
+  // epilogue also read offset.size(3) after `offset` had been made 3-D, which
+  // raised for unbatched inputs.)
+  return 1;
+}
+
+// Backward pass of deformable convolution w.r.t. the weight.
+//
+// The batch is processed in chunks of `im2col_step` images. Each chunk is
+// unfolded by deformable_im2col() and, per group,
+// scale * gradOutput * columns^T is accumulated in place into `gradWeight`.
+// The incoming `columns` handle is replaced by a freshly allocated buffer and
+// `ones` is not used by this implementation.
+//
+// Returns 1 on success; invalid arguments raise through AT_CHECK.
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input,
+    at::Tensor offset,
+    at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns,
+    at::Tensor ones,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    int dilationW,
+    int dilationH,
+    int group,
+    int deformable_group,
+    float scale,
+    int im2col_step) {
+  shape_check(
+      input,
+      offset,
+      &gradOutput,
+      gradWeight,
+      kH,
+      kW,
+      dH,
+      dW,
+      padH,
+      padW,
+      dilationH,
+      dilationW,
+      group,
+      deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Promote unbatched tensors to a batch of one. `offset` must be promoted
+    // as well, otherwise the batch-size check below compares its channel
+    // count against 1 and always fails.
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  // The batch is split into batchSize / im2col_step equal chunks below.
+  AT_CHECK(
+      im2col_step > 0 && batchSize % im2col_step == 0,
+      "batch size (",
+      batchSize,
+      ") must be divisible by a positive im2col_step, but got im2col_step: ",
+      im2col_step);
+
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // Reorder gradOutput: (chunk, step, O, H, W) -> (chunk, O, step, H, W).
+  gradOutput = gradOutput.view({batchSize / im2col_step,
+                                im2col_step,
+                                nOutputPlane,
+                                outputHeight,
+                                outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  // Materialize the reordered gradient in a buffer allocated with explicit
+  // sizes, so that the views below do not depend on the stride layout that
+  // zeros_like() would choose for a transposed tensor.
+  at::Tensor gradOutputBuffer = at::zeros(
+      {batchSize / im2col_step,
+       nOutputPlane,
+       im2col_step,
+       outputHeight,
+       outputWidth},
+      gradOutput.options());
+  gradOutputBuffer.copy_(gradOutput);
+  // Fold the step axis into the height axis and split channels into groups.
+  gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step,
+                                            group,
+                                            nOutputPlane / group,
+                                            im2col_step * outputHeight,
+                                            outputWidth});
+
+  input = input.view({batchSize / im2col_step,
+                      im2col_step,
+                      nInputPlane,
+                      inputHeight,
+                      inputWidth});
+  offset = offset.view({batchSize / im2col_step,
+                        im2col_step,
+                        deformable_group * 2 * kH * kW,
+                        outputHeight,
+                        outputWidth});
+
+  // Grouped view sharing storage with the caller's gradWeight; the in-place
+  // addmm_ below accumulates directly into it.
+  at::Tensor gradWeight_g = gradWeight.view({group,
+                                             gradWeight.size(0) / group,
+                                             gradWeight.size(1),
+                                             gradWeight.size(2),
+                                             gradWeight.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(
+        input[elt],
+        offset[elt],
+        nInputPlane,
+        inputHeight,
+        inputWidth,
+        kH,
+        kW,
+        padH,
+        padW,
+        dH,
+        dW,
+        dilationH,
+        dilationW,
+        im2col_step,
+        deformable_group,
+        columns);
+
+    // divide into group
+    at::Tensor columns_g =
+        columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 1, alpha = scale: accumulate across chunks.
+      gradWeight_g[g] = gradWeight_g[g]
+                            .flatten(1)
+                            .addmm_(
+                                gradOutputBuffer[elt][g].flatten(1),
+                                columns_g[g].transpose(1, 0),
+                                1.0,
+                                scale)
+                            .view_as(gradWeight_g[g]);
+    }
+  }
+
+  // No reshaping back is needed: all tensor parameters are by-value handles,
+  // so re-viewing them here would not be visible to the caller.
+  return 1;
+}
+// Modulated deformable convolution forward pass; accumulates into `output`.
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input, // read as (batch, channels, height, width)
+    at::Tensor weight, // read as (channels_out, channels / group, kh, kw)
+    at::Tensor bias, // broadcast-added to output when with_bias is set
+    at::Tensor ones, // only re-created locally in this function
+    at::Tensor offset, // sliced per image as offset[b]
+    at::Tensor mask, // sliced per image as mask[b]
+    at::Tensor output, // zeroed, then filled through views of its storage
+    at::Tensor columns, // replaced by a freshly allocated im2col buffer
+    int kernel_h,
+    int kernel_w,
+    const int stride_h,
+    const int stride_w,
+    const int pad_h,
+    const int pad_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int group,
+    const int deformable_group,
+    const bool with_bias) {
+  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  // AT_ERROR streams its arguments together; printf-style %d is not expanded.
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (",
+        kernel_h, " x ", kernel_w,
+        " vs ",
+        kernel_h_, " x ", kernel_w_,
+        ").");
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (",
+        channels, " vs ", channels_kernel * group,
+        ").");
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  // resize output
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, 1 * height_out * width_out},
+      input.options());
+  // split the output channel axis into (group, channels_out / group)
+  output = output.view({output.size(0),
+                        group,
+                        output.size(1) / group,
+                        output.size(2),
+                        output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1, // presumably the callee's batch size (one image per call) - confirm
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    // divide into group
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+    // restore the ungrouped views of weight and columns for the next image
+    weight = weight.view({weight.size(0) * weight.size(1),
+                          weight.size(2),
+                          weight.size(3),
+                          weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+  // merge (group, channels_out / group) back into one channel axis
+  output = output.view({output.size(0),
+                        output.size(1) * output.size(2),
+                        output.size(3),
+                        output.size(4)});
+
+  if (with_bias) {
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+// Modulated deformable convolution backward pass; fills the grad_* tensors.
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input, // read as (batch, channels, height, width)
+    at::Tensor weight, // read as (out_channels, channels / group, kh, kw)
+    at::Tensor bias, // not referenced in this function
+    at::Tensor ones, // re-created below when not 2-D or too small
+    at::Tensor offset, // sliced per image as offset[b]
+    at::Tensor mask, // sliced per image as mask[b]
+    at::Tensor columns, // replaced by a freshly allocated scratch buffer
+    at::Tensor grad_input, // grad_input[b] is handed to the col2im call
+    at::Tensor grad_weight, // accumulated with addmm_ per image and group
+    at::Tensor grad_bias, // accumulated only when with_bias is set
+    at::Tensor grad_offset, // grad_offset[b] goes to the col2im_coord call
+    at::Tensor grad_mask, // grad_mask[b] goes to the col2im_coord call
+    at::Tensor grad_output, // regrouped to (batch, group, C_out / group, H, W)
+    int kernel_h,
+    int kernel_w,
+    int stride_h,
+    int stride_w,
+    int pad_h,
+    int pad_w,
+    int dilation_h,
+    int dilation_w,
+    int group,
+    int deformable_group,
+    const bool with_bias) {
+  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+  // AT_ERROR streams its arguments together; printf-style %d is not expanded.
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR(
+        "Input shape and kernel shape wont match: (",
+        kernel_h, " x ", kernel_w,
+        " vs ",
+        kernel_h_, " x ", kernel_w_,
+        ").");
+  if (channels != channels_kernel * group)
+    AT_ERROR(
+        "Input shape and kernel channels wont match: (",
+        channels, " vs ", channels_kernel * group,
+        ").");
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+  // NOTE(review): an oversized `ones` is kept; confirm addmm_ below still fits.
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros(
+      {channels * kernel_h * kernel_w, height_out * width_out},
+      input.options());
+
+  grad_output = grad_output.view({grad_output.size(0),
+                                  group,
+                                  grad_output.size(1) / group,
+                                  grad_output.size(2),
+                                  grad_output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // divide into group
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group,
+                          weight.size(0) / group,
+                          weight.size(1),
+                          weight.size(2),
+                          weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      columns[g].addmm_(
+          weight[g].flatten(1).transpose(0, 1),
+          grad_output[b][g].flatten(1),
+          0.0f, // beta = 0: overwrite columns[g] instead of accumulating
+          1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1),
+                          weight.size(2),
+                          weight.size(3),
+                          weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cuda(
+        columns,
+        input[b],
+        offset[b],
+        mask[b],
+        1, // presumably the callee's batch size (one image per call) - confirm
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_offset[b],
+        grad_mask[b]);
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cuda(
+        columns,
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        grad_input[b]);
+
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and
+    // group; `columns` is reused here as the im2col buffer for image b
+    modulated_deformable_im2col_cuda(
+        input[b],
+        offset[b],
+        mask[b],
+        1,
+        channels,
+        height,
+        width,
+        height_out,
+        width_out,
+        kernel_h,
+        kernel_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        deformable_group,
+        columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view({group,
+                                    grad_weight.size(0) / group,
+                                    grad_weight.size(1),
+                                    grad_weight.size(2),
+                                    grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+    // restore the ungrouped views before the next image
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                    grad_weight.size(2),
+                                    grad_weight.size(3),
+                                    grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                  grad_output.size(2),
+                                  grad_output.size(3),
+                                  grad_output.size(4)});
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
new file mode 100644
index 0000000..dab3474
--- /dev/null
+++ b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu
@@ -0,0 +1,1288 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+// modified from
+// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+// Original license: Apache 2.0
+// clang-format off
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <THC/THCAtomics.cuh>
+
+using namespace at;
+// for-loop header: i starts at this thread's global index, steps by the total thread count, stops at n.
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// File-local launch configuration used by the host launchers below.
+namespace {
+
+const int kMaxGridNum = 65535;
+const int CUDA_NUM_THREADS = 1024;
+// Ceil-divides N by CUDA_NUM_THREADS and clamps the result to kMaxGridNum.
+inline int GET_BLOCKS(const int N) {
+  const int needed = (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+  return needed < kMaxGridNum ? needed : kMaxGridNum;
+}
+} // namespace
+// Bilinearly interpolates bottom_data at fractional (h, w); unchecked-in corners count as 0.
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(
+    const scalar_t* bottom_data, // start of one 2-D plane
+    const int data_width, // row stride used when indexing bottom_data
+    const int height, // upper bound checked for h_high
+    const int width, // upper bound checked for w_high
+    scalar_t h, // fractional row to sample
+    scalar_t w) { // fractional column to sample
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+  // lh/lw: distance from the low corner; hh/hw: the complementary weights
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+  // NOTE(review): low corners are only checked against 0, high corners only against the upper bound - callers must keep h < height, w < width
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  // weighted sum of the four corner values
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+// Bilinear weight that integer pixel (h, w) receives from the sample point (argmax_h, argmax_w).
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(
+    scalar_t argmax_h, // fractional sample row
+    scalar_t argmax_w, // fractional sample column
+    const int h, // integer pixel row being weighted
+    const int w, // integer pixel column being weighted
+    const int height,
+    const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // sample lies outside (-1, height) x (-1, width): contributes nothing
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0; // stays 0 when (h, w) is none of the four neighbours
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+// Derivative of the bilinear sample of im_data at (argmax_h, argmax_w) w.r.t. h (bp_dir 0) or w (bp_dir 1).
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(
+    scalar_t argmax_h, // fractional sample row
+    scalar_t argmax_w, // fractional sample column
+    const int height,
+    const int width,
+    const scalar_t* im_data, // start of one 2-D plane
+    const int data_width, // row stride used when indexing im_data
+    const int bp_dir) { // 0 or 1; any other value leaves the result at 0
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // sample lies outside (-1, height) x (-1, width): contributes nothing
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  // each in-bounds corner adds its pixel value times a signed fractional weight
+  if (bp_dir == 0) { // low-row corners enter with -, high-row corners with +
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) { // low-column corners enter with -, high-column with +
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+          im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+          im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+// Each work item writes the kernel_h * kernel_w column entries of one (c_im, b_col, h_col, w_col).
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(
+    const int n, // number of work items iterated by CUDA_KERNEL_LOOP
+    const scalar_t* data_im, // indexed as (b_col, c_im, height, width)
+    const scalar_t* data_offset, // indexed as (b_col, deformable group, 2 * kh * kw, height_col, width_col)
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group, // divisor mapping c_im to its deformable group
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) { // indexed as (c_im * kh * kw + tap, b_col, h_col, w_col)
+  CUDA_KERNEL_LOOP(index, n) {
+    // decompose index into (c_im, b_col, h_col, w_col), w_col varying fastest
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr = // offset channel 2 * tap holds the row offset
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr = // offset channel 2 * tap + 1 holds the column offset
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const scalar_t map_h = i * dilation_h + offset_h;
+          // const scalar_t map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = deformable_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val; // samples outside the guarded range are written as 0
+        data_col_ptr += batch_size * height_col * width_col; // advance to the next kernel tap's row
+      }
+    }
+  }
+}
+
+// Each work item scatters one data_col[index] gradient into grad_im with bilinear weights (atomicAdd).
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n, // number of work items iterated by CUDA_KERNEL_LOOP
+    const scalar_t* data_col, // read at the flat position `index`
+    const scalar_t* data_offset, // indexed as (b, deformable group, 2 * kh * kw, height_col, width_col)
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group, // divisor mapping c to its deformable group
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) { // indexed as (b, c, height, width); accumulated into, never cleared here
+  CUDA_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // (c, i, j) identify the channel and kernel tap encoded in index
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index];
+    const int cur_h = (int)cur_inv_h_data; // cast truncates toward zero
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) { // scan a 5 x 5 window; the abs(...) < 1 tests keep only pixels within unit distance
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); // several work items can hit the same pixel
+        }
+      }
+    }
+  }
+}
+
+// Each work item computes grad_offset[index] for one offset element (b, c, h, w).
+template <typename scalar_t>
+__global__ void deformable_col2im_coord_gpu_kernel(
+    const int n, // number of work items iterated by CUDA_KERNEL_LOOP
+    const scalar_t* data_col, // indexed as (column row, b, height_col, width_col)
+    const scalar_t* data_im, // one height * width plane is read per loop iteration
+    const scalar_t* data_offset, // indexed as (b, deformable group, 2 * kh * kw, height_col, width_col)
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group, // used here as a count of column rows per group (bounds col_c)
+    const int batch_size,
+    const int offset_channels, // size of the c axis when decomposing index
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset) { // written at the flat position `index`
+  CUDA_KERNEL_LOOP(index, n) {
+    scalar_t val = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // locate this offset channel's deformable group and the matching base pointers
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0; // counts loop iterations; selects the data_im plane for each one
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; // offset channel within its group
+    // offset_c / 2 picks the kernel tap; stepping by kh * kw revisits that tap in each subsequent plane
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2; // even channel: row offset (0); odd channel: column offset (1)
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2; // -2 makes get_coordinate_weight take its early-return 0 path
+      }
+      const scalar_t weight = get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val; // plain store: each index is written by exactly one work item
+  }
+}
+
+
+namespace detectron2 {
+// Host launcher: runs deformable_im2col_gpu_kernel on data_im's device and current stream.
+void deformable_im2col(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // Select the input's device before querying its current stream.
+  at::cuda::CUDAGuard guard(data_im.device());
+  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  // todo: check parallel_imgs is correctly passed in
+  const int out_h =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  const int out_w =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  const int channels_per_dgroup = channels / deformable_group;
+  const int total = channels * out_h * out_w * parallel_imgs; // one work item per (channel, image, output pixel)
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.type(), "deformable_im2col_gpu", ([&] {
+        const scalar_t* im_ptr = data_im.data_ptr<scalar_t>();
+        const scalar_t* offset_ptr = data_offset.data_ptr<scalar_t>();
+        scalar_t* col_ptr = data_col.data_ptr<scalar_t>();
+
+        deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(total),
+            CUDA_NUM_THREADS,
+            0,
+            cuda_stream>>>(
+            total,
+            im_ptr,
+            offset_ptr,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channels_per_dgroup,
+            parallel_imgs,
+            channels,
+            deformable_group,
+            out_h,
+            out_w,
+            col_ptr);
+      }));
+
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+  printf("error in deformable_im2col: %s\n", cudaGetErrorString(status));
+}
+
+// Host launcher: runs deformable_col2im_gpu_kernel on data_col's device and current stream.
+void deformable_col2im(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  at::cuda::CUDAGuard guard(data_col.device());
+  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
+
+  // todo: make sure parallel_imgs is passed in correctly
+  const int out_h =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  const int out_w =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  const int channels_per_dgroup = channels / deformable_group;
+  const int total = // one work item per element of the column buffer
+      channels * ksize_h * ksize_w * out_h * out_w * parallel_imgs;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "deformable_col2im_gpu", ([&] {
+        const scalar_t* col_ptr = data_col.data_ptr<scalar_t>();
+        const scalar_t* offset_ptr = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_ptr = grad_im.data_ptr<scalar_t>();
+
+        deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(total),
+            CUDA_NUM_THREADS,
+            0,
+            cuda_stream>>>(
+            total,
+            col_ptr,
+            offset_ptr,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channels_per_dgroup,
+            parallel_imgs,
+            deformable_group,
+            out_h,
+            out_w,
+            grad_ptr);
+      }));
+
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+    return;
+  printf("error in deformable_col2im: %s\n", cudaGetErrorString(status));
+}
+
+
+// Host-side launcher for deformable_col2im_coord_gpu_kernel.
+//
+// Launches
+//   height_col * width_col * 2 * ksize_h * ksize_w * deformable_group *
+//   parallel_imgs
+// work items on the current CUDA stream of data_col's device; grad_offset is
+// handed to the kernel as its output pointer.
+//
+// Note that, unlike deformable_col2im, the channel_per_deformable_group value
+// passed to this kernel is counted in column channels
+// (channels * ksize_h * ksize_w / deformable_group).
+//
+// A launch failure is reported through printf, matching the other launchers
+// in this file; nothing is thrown or returned.
+void deformable_col2im_coord(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const int channels,
+    const int height,
+    const int width,
+    const int ksize_h,
+    const int ksize_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int parallel_imgs,
+    const int deformable_group,
+    at::Tensor grad_offset) {
+  int height_col =
+      (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col =
+      (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w *
+      deformable_group * parallel_imgs;
+  int channel_per_deformable_group =
+      channels * ksize_h * ksize_w / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ = grad_offset.data_ptr<scalar_t>();
+
+        deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_col_,
+            data_im_,
+            data_offset_,
+            channels,
+            height,
+            width,
+            ksize_h,
+            ksize_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            parallel_imgs,
+            2 * ksize_h * ksize_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_);
+      }));
+
+  // Previously this launcher was the only one that never looked at the launch
+  // status, so a bad launch configuration went completely unnoticed.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in deformable_col2im_coord: %s\n", cudaGetErrorString(err));
+  }
+}
+
+} // namespace detectron2
+
+
+// Bilinearly interpolates bottom_data at the fractional position (h, w).
+//
+// bottom_data is indexed as row * data_width + column. A neighbour is read
+// only when its low index is >= 0 or its high index is <= height - 1 /
+// width - 1 respectively; a neighbour that fails its test contributes 0.
+// The opposite bounds are not tested here, so the caller is presumably
+// expected to pass -1 < h < height and -1 < w < width.
+template <typename scalar_t>
+__device__ scalar_t dmcn_im2col_bilinear(
+    const scalar_t* bottom_data,
+    const int data_width,
+    const int height,
+    const int width,
+    scalar_t h,
+    scalar_t w) {
+  // Integer corners of the 2x2 neighbourhood around (h, w).
+  const int top = floor(h);
+  const int left = floor(w);
+  const int bottom = top + 1;
+  const int right = left + 1;
+
+  // Which rows / columns of that neighbourhood may be read.
+  const bool top_ok = top >= 0;
+  const bool left_ok = left >= 0;
+  const bool bottom_ok = bottom <= height - 1;
+  const bool right_ok = right <= width - 1;
+
+  scalar_t top_left = 0;
+  if (top_ok && left_ok) {
+    top_left = bottom_data[top * data_width + left];
+  }
+  scalar_t top_right = 0;
+  if (top_ok && right_ok) {
+    top_right = bottom_data[top * data_width + right];
+  }
+  scalar_t bottom_left = 0;
+  if (bottom_ok && left_ok) {
+    bottom_left = bottom_data[bottom * data_width + left];
+  }
+  scalar_t bottom_right = 0;
+  if (bottom_ok && right_ok) {
+    bottom_right = bottom_data[bottom * data_width + right];
+  }
+
+  // Fractional distances to the low corner and their complements.
+  const scalar_t frac_h = h - top;
+  const scalar_t frac_w = w - left;
+  const scalar_t rem_h = 1 - frac_h;
+  const scalar_t rem_w = 1 - frac_w;
+
+  const scalar_t wt_top_left = rem_h * rem_w;
+  const scalar_t wt_top_right = rem_h * frac_w;
+  const scalar_t wt_bottom_left = frac_h * rem_w;
+  const scalar_t wt_bottom_right = frac_h * frac_w;
+
+  return (
+      wt_top_left * top_left + wt_top_right * top_right +
+      wt_bottom_left * bottom_left + wt_bottom_right * bottom_right);
+}
+
+// Returns the bilinear weight that the integer pixel (h, w) receives from a
+// sample taken at the fractional position (argmax_h, argmax_w).
+//
+// The result is 0 when the sample position is at or beyond -1 / height /
+// width, and also when (h, w) is not one of the four integer corners
+// surrounding the sample.
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_gradient_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int h,
+    const int w,
+    const int height,
+    const int width) {
+  const bool sample_outside = argmax_h <= -1 || argmax_h >= height ||
+      argmax_w <= -1 || argmax_w >= width;
+  if (sample_outside) {
+    return 0;
+  }
+
+  const int row_low = floor(argmax_h);
+  const int col_low = floor(argmax_w);
+  const int row_high = row_low + 1;
+  const int col_high = col_low + 1;
+
+  // low and high differ by one, so at most one of the cases below matches.
+  scalar_t weight = 0;
+  if (h == row_low && w == col_low) {
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  } else if (h == row_low && w == col_high) {
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  } else if (h == row_high && w == col_low) {
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  } else if (h == row_high && w == col_high) {
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  }
+  return weight;
+}
+
+// Accumulates a signed, distance-weighted combination of the four im_data
+// values surrounding the fractional position (argmax_h, argmax_w).
+//
+// bp_dir == 0: high-row values minus low-row values, each weighted by its
+//              horizontal distance term.
+// bp_dir == 1: high-column values minus low-column values, each weighted by
+//              its vertical distance term.
+// Any other bp_dir, or a position at or beyond -1 / height / width, gives 0.
+// im_data is indexed as row * data_width + column; corners failing their
+// bounds test are skipped.
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_coordinate_weight(
+    scalar_t argmax_h,
+    scalar_t argmax_w,
+    const int height,
+    const int width,
+    const scalar_t* im_data,
+    const int data_width,
+    const int bp_dir) {
+  const bool sample_outside = argmax_h <= -1 || argmax_h >= height ||
+      argmax_w <= -1 || argmax_w >= width;
+  if (sample_outside) {
+    return 0;
+  }
+
+  const int argmax_h_low = floor(argmax_h);
+  const int argmax_w_low = floor(argmax_w);
+  const int argmax_h_high = argmax_h_low + 1;
+  const int argmax_w_high = argmax_w_low + 1;
+
+  // Readability of each corner of the 2x2 neighbourhood.
+  const bool has_low_low = argmax_h_low >= 0 && argmax_w_low >= 0;
+  const bool has_low_high = argmax_h_low >= 0 && argmax_w_high <= width - 1;
+  const bool has_high_low = argmax_h_high <= height - 1 && argmax_w_low >= 0;
+  const bool has_high_high =
+      argmax_h_high <= height - 1 && argmax_w_high <= width - 1;
+
+  scalar_t weight = 0;
+
+  switch (bp_dir) {
+    case 0:
+      if (has_low_low) {
+        weight += -1 * (argmax_w_low + 1 - argmax_w) *
+            im_data[argmax_h_low * data_width + argmax_w_low];
+      }
+      if (has_low_high) {
+        weight += -1 * (argmax_w - argmax_w_low) *
+            im_data[argmax_h_low * data_width + argmax_w_high];
+      }
+      if (has_high_low) {
+        weight += (argmax_w_low + 1 - argmax_w) *
+            im_data[argmax_h_high * data_width + argmax_w_low];
+      }
+      if (has_high_high) {
+        weight += (argmax_w - argmax_w_low) *
+            im_data[argmax_h_high * data_width + argmax_w_high];
+      }
+      break;
+    case 1:
+      if (has_low_low) {
+        weight += -1 * (argmax_h_low + 1 - argmax_h) *
+            im_data[argmax_h_low * data_width + argmax_w_low];
+      }
+      if (has_low_high) {
+        weight += (argmax_h_low + 1 - argmax_h) *
+            im_data[argmax_h_low * data_width + argmax_w_high];
+      }
+      if (has_high_low) {
+        weight += -1 * (argmax_h - argmax_h_low) *
+            im_data[argmax_h_high * data_width + argmax_w_low];
+      }
+      if (has_high_high) {
+        weight += (argmax_h - argmax_h_low) *
+            im_data[argmax_h_high * data_width + argmax_w_high];
+      }
+      break;
+    default:
+      break;
+  }
+
+  return weight;
+}
+
+// Fills the column buffer data_col for modulated deformable convolution.
+//
+// Each work item (one value of `index` in [0, n)) handles one
+// (input channel, batch element, output row, output column) tuple and writes
+// kernel_h * kernel_w values: for every kernel tap it bilinearly samples the
+// input plane at the tap position shifted by the learned offset, multiplies
+// the sample by the tap's mask value and stores the product.
+//
+// CUDA_KERNEL_LOOP is defined outside this view; it is presumably the usual
+// loop that hands each thread its indices in [0, n) -- confirm at the macro.
+template <typename scalar_t>
+__global__ void modulated_deformable_im2col_gpu_kernel(
+    const int n,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int num_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Decompose the linear index into (c_im, b_col, h_col, w_col), with w_col
+    // varying fastest.
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    // First column-buffer channel belonging to input channel c_im.
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    // Top-left input coordinate of the (undeformed) receptive field.
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    // Output element for the first kernel tap; advanced once per tap below.
+    scalar_t* data_col_ptr = data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) *
+    // height + h_in) * width + w_in;
+    // Start of the height x width input plane for (b_col, c_im).
+    const scalar_t* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    // Start of the offset block for (b_col, deformable group): two channels
+    // (h then w) per kernel tap, each of size height_col x width_col.
+    const scalar_t* data_offset_ptr = data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    // Start of the mask block for (b_col, deformable group): one channel per
+    // kernel tap.
+    const scalar_t* data_mask_ptr = data_mask +
+        (b_col * deformable_group + deformable_group_index) * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        // Positions, relative to the block starts above, of this tap's
+        // h-offset, w-offset and mask at output location (h_col, w_col).
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const int data_mask_hw_ptr =
+            ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        // Fractional sampling position in the input plane.
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+        // Positions in (-1, height) x (-1, width) are sampled; anything else
+        // leaves val at 0.
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          // const float map_h = i * dilation_h + offset_h;
+          // const float map_w = j * dilation_w + offset_w;
+          // const int cur_height = height - h_in;
+          // const int cur_width = width - w_in;
+          // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height,
+          // cur_width, map_h, map_w);
+          val = dmcn_im2col_bilinear(
+              data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val * mask;
+        // Step to the next column-buffer channel (next kernel tap).
+        data_col_ptr += batch_size * height_col * width_col;
+        // data_col_ptr += height_col * width_col;
+      }
+    }
+  }
+}
+
+// Scatters column-buffer values back into grad_im for modulated deformable
+// convolution.
+//
+// Each work item (one value of `index` in [0, n)) handles one element of
+// data_col, identified by (channel c, kernel tap (i, j), batch element b,
+// output position (h_out, w_out)). The element is multiplied by its mask
+// value and distributed, with the weights from dmcn_get_gradient_weight, to
+// the integer input pixels lying within a distance of 1 (per axis) of the
+// deformed sampling position. Contributions are accumulated with atomicAdd.
+//
+// CUDA_KERNEL_LOOP is defined outside this view; it is presumably the usual
+// loop that hands each thread its indices in [0, n) -- confirm at the macro.
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_im) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // Decompose the linear index into (c, i, j, b, h_out, w_out), with w_out
+    // varying fastest.
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    // Top-left input coordinate of the (undeformed) receptive field.
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    // Starts of the offset block (two channels per tap) and the mask block
+    // (one channel per tap) for (b, deformable group).
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+    // Positions, relative to those starts, of this tap's h-offset, w-offset
+    // and mask at (h_out, w_out).
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const int data_mask_hw_ptr =
+        ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+    // Fractional position in the input plane that this column element was
+    // sampled from.
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index] * mask;
+    // The casts truncate toward zero; the +/-2 search window below is then
+    // filtered by the |distance| < 1 tests.
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        // Keep only in-bounds pixels closer than 1 to the sampling position
+        // along both axes.
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          // Element (b, c, cur_h + dy, cur_w + dx) of grad_im.
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = dmcn_get_gradient_weight(
+              cur_inv_h_data,
+              cur_inv_w_data,
+              cur_h + dy,
+              cur_w + dx,
+              height,
+              width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+// Computes grad_offset and grad_mask for modulated deformable convolution.
+//
+// Each work item (one value of `index` in [0, n)) handles one element of
+// grad_offset, identified by (batch element b, offset channel c, output
+// position (h, w)). It loops over the column-buffer channels that use this
+// offset and accumulates
+//   val  += dmcn_get_coordinate_weight(...) * data_col * mask
+//   mval += data_col * (bilinear sample of the input)
+// then writes val to grad_offset[index]. mval is written to grad_mask only by
+// the work items whose offset channel is even within its deformable group.
+//
+// channel_per_deformable_group is used here as a count of column-buffer
+// channels; the launcher in this file passes
+// channels * kernel_h * kernel_w / deformable_group.
+//
+// CUDA_KERNEL_LOOP is defined outside this view; it is presumably the usual
+// loop that hands each thread its indices in [0, n) -- confirm at the macro.
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(
+    const int n,
+    const scalar_t* data_col,
+    const scalar_t* data_im,
+    const scalar_t* data_offset,
+    const scalar_t* data_mask,
+    const int channels,
+    const int height,
+    const int width,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size,
+    const int offset_channels,
+    const int deformable_group,
+    const int height_col,
+    const int width_col,
+    scalar_t* grad_offset,
+    scalar_t* grad_mask) {
+  CUDA_KERNEL_LOOP(index, n) {
+    // val accumulates the offset gradient, mval the mask gradient.
+    scalar_t val = 0, mval = 0;
+    // Decompose the linear index into (b, c, h, w), with w varying fastest.
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    // Counts loop iterations; selects the input plane inside the group.
+    int cnt = 0;
+    // First column-buffer channel of this deformable group.
+    const scalar_t* data_col_ptr = data_col +
+        deformable_group_index * channel_per_deformable_group * batch_size *
+            width_col * height_col;
+    // First input plane of this deformable group for batch element b.
+    const scalar_t* data_im_ptr = data_im +
+        (b * deformable_group + deformable_group_index) *
+            channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t* data_offset_ptr = data_offset +
+        (b * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+    const scalar_t* data_mask_ptr = data_mask +
+        (b * deformable_group + deformable_group_index) * kernel_h * kernel_w *
+            height_col * width_col;
+
+    // Offset channel relative to the start of its deformable group.
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    // Visit every column channel of the group that shares this kernel tap
+    // (offset_c / 2), i.e. one per input channel of the group.
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      // Even offset channels hold the h offset (bp_dir 0), odd ones the w
+      // offset (bp_dir 1); see the data_offset_*_ptr indices below.
+      const int bp_dir = offset_c % 2;
+
+      // Recover the kernel tap (i, j) and output position from col_pos.
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const int data_mask_hw_ptr =
+          (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+      // Fractional sampling position in the input plane.
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        // Out of range: -2 makes dmcn_get_coordinate_weight return 0 below,
+        // and nothing is added to mval.
+        inv_h = inv_w = -2;
+      } else {
+        mval += data_col_ptr[col_pos] *
+            dmcn_im2col_bilinear(
+                    data_im_ptr + cnt * height * width,
+                    width,
+                    height,
+                    width,
+                    inv_h,
+                    inv_w);
+      }
+      const scalar_t weight = dmcn_get_coordinate_weight(
+          inv_h,
+          inv_w,
+          height,
+          width,
+          data_im_ptr + cnt * height * width,
+          width,
+          bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+    grad_offset[index] = val;
+    // The h and w offset channels of a tap accumulate the same mval, so only
+    // the even one stores it, at (b, group, tap offset_c / 2, h, w).
+    if (offset_c % 2 == 0)
+      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
+      // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
+      // height_col + h) * width_col + w], mask_req, mval);
+      grad_mask
+          [(((b * deformable_group + deformable_group_index) * kernel_h *
+                 kernel_w +
+             offset_c / 2) *
+                height_col +
+            h) *
+               width_col +
+           w] = mval;
+  }
+}
+
+
+namespace detectron2 {
+
+// Host-side launcher for modulated_deformable_im2col_gpu_kernel.
+//
+// Launches channels * batch_size * height_col * width_col work items on the
+// current CUDA stream of data_im's device; data_col is handed to the kernel
+// as its output pointer. height_col / width_col are taken from the caller
+// rather than recomputed here. A launch failure is only reported through
+// printf; nothing is thrown or returned.
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor data_col) {
+  // Number of input channels sharing one set of offsets / masks.
+  const int channel_per_deformable_group = channels / deformable_group;
+  // One work item per (channel, batch element, output row, output column).
+  const int num_kernels = channels * batch_size * height_col * width_col;
+
+  at::cuda::CUDAGuard device_guard(data_im.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
+        const scalar_t* data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t* data_col_ = data_col.data_ptr<scalar_t>();
+
+        modulated_deformable_im2col_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            data_im_,
+            data_offset_,
+            data_mask_,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            channels,
+            deformable_group,
+            height_col,
+            width_col,
+            data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf(
+        "error in modulated_deformable_im2col_cuda: %s\n",
+        cudaGetErrorString(err));
+  }
+}
+
+// Host-side launcher for modulated_deformable_col2im_gpu_kernel.
+//
+// Launches one work item per column-buffer element
+// (channels * kernel_h * kernel_w * batch_size * height_col * width_col) on
+// the current CUDA stream of data_col's device; grad_im is handed to the
+// kernel as its output pointer. A launch failure is only reported through
+// printf; nothing is thrown or returned.
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_im) {
+  const int num_kernels =
+      channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+  const int channel_per_deformable_group = channels / deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
+        const scalar_t* col_ptr = data_col.data_ptr<scalar_t>();
+        const scalar_t* offset_ptr = data_offset.data_ptr<scalar_t>();
+        const scalar_t* mask_ptr = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_im_ptr = grad_im.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            col_ptr,
+            offset_ptr,
+            mask_ptr,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_im_ptr);
+      }));
+
+  // Report (but do not propagate) any error left by the launch.
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess) {
+    return;
+  }
+  printf(
+      "error in modulated_deformable_col2im_cuda: %s\n",
+      cudaGetErrorString(launch_status));
+}
+
+// Host-side launcher for modulated_deformable_col2im_coord_gpu_kernel.
+//
+// Launches one work item per offset-gradient element
+// (batch_size * height_col * width_col * 2 * kernel_h * kernel_w *
+// deformable_group) on the current CUDA stream of data_col's device.
+// grad_offset and grad_mask are handed to the kernel as output pointers.
+// The channel_per_deformable_group passed to the kernel is counted in
+// column-buffer channels. A launch failure is only reported through printf;
+// nothing is thrown or returned.
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col,
+    const at::Tensor data_im,
+    const at::Tensor data_offset,
+    const at::Tensor data_mask,
+    const int batch_size,
+    const int channels,
+    const int height_im,
+    const int width_im,
+    const int height_col,
+    const int width_col,
+    const int kernel_h,
+    const int kernel_w,
+    const int pad_h,
+    const int pad_w,
+    const int stride_h,
+    const int stride_w,
+    const int dilation_h,
+    const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset,
+    at::Tensor grad_mask) {
+  const int channel_per_deformable_group =
+      channels * kernel_h * kernel_w / deformable_group;
+  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
+      kernel_w * deformable_group;
+
+  at::cuda::CUDAGuard device_guard(data_col.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
+        const scalar_t* col_ptr = data_col.data_ptr<scalar_t>();
+        const scalar_t* im_ptr = data_im.data_ptr<scalar_t>();
+        const scalar_t* offset_ptr = data_offset.data_ptr<scalar_t>();
+        const scalar_t* mask_ptr = data_mask.data_ptr<scalar_t>();
+        scalar_t* grad_offset_ptr = grad_offset.data_ptr<scalar_t>();
+        scalar_t* grad_mask_ptr = grad_mask.data_ptr<scalar_t>();
+
+        modulated_deformable_col2im_coord_gpu_kernel<<<
+            GET_BLOCKS(num_kernels),
+            CUDA_NUM_THREADS,
+            0,
+            stream>>>(
+            num_kernels,
+            col_ptr,
+            im_ptr,
+            offset_ptr,
+            mask_ptr,
+            channels,
+            height_im,
+            width_im,
+            kernel_h,
+            kernel_w,
+            pad_h,
+            pad_w,
+            stride_h,
+            stride_w,
+            dilation_h,
+            dilation_w,
+            channel_per_deformable_group,
+            batch_size,
+            // Total number of offset channels (the kernel's offset_channels).
+            2 * kernel_h * kernel_w * deformable_group,
+            deformable_group,
+            height_col,
+            width_col,
+            grad_offset_ptr,
+            grad_mask_ptr);
+      }));
+
+  // Report (but do not propagate) any error left by the launch.
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status == cudaSuccess) {
+    return;
+  }
+  printf(
+      "error in modulated_deformable_col2im_coord_cuda: %s\n",
+      cudaGetErrorString(launch_status));
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated.h b/detectron2/layers/csrc/nms_rotated/nms_rotated.h
new file mode 100644
index 0000000..f0dd981
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated.h
@@ -0,0 +1,38 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace detectron2 {
+
+// CPU implementation of rotated-box NMS (defined in nms_rotated_cpu.cpp).
+// Returns an int64 tensor holding the indices into dets of the kept boxes.
+at::Tensor nms_rotated_cpu(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold);
+
+#ifdef WITH_CUDA
+// CUDA implementation of rotated-box NMS (defined in nms_rotated_cuda.cu).
+// Only declared when the extension is built with WITH_CUDA.
+at::Tensor nms_rotated_cuda(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold);
+#endif
+
+// Interface for Python
+// inline is needed to prevent multiple function definitions when this header is
+// included by different cpps
+//
+// Dispatches rotated-box NMS to the CUDA or CPU implementation according to
+// the device of dets.
+//
+// Raises a c10 error when dets and scores are not both on CUDA or both off
+// CUDA, or when dets is a CUDA tensor but the extension was built without
+// WITH_CUDA.
+inline at::Tensor nms_rotated(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  // A plain assert() disappears under NDEBUG (the usual extension build), so
+  // use the always-on ATen check instead.
+  AT_ASSERTM(
+      dets.device().is_cuda() == scores.device().is_cuda(),
+      "dets and scores must both be CUDA tensors or both be CPU tensors");
+  if (dets.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return nms_rotated_cuda(dets, scores, iou_threshold);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+
+  return nms_rotated_cpu(dets, scores, iou_threshold);
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
new file mode 100644
index 0000000..2850d4a
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
@@ -0,0 +1,73 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+#include "nms_rotated.h"
+
+namespace detectron2 {
+
+// Greedy non-maximum suppression for rotated boxes on the CPU.
+//
+// Adapted from torchvision's nms_cpu_kernel; the per-pair IoU is delegated to
+// single_box_iou_rotated from box_iou_rotated_utils.h, which keeps this
+// function short.
+//
+// Boxes are visited in order of descending score. Each box that has not been
+// suppressed is kept, and every later box whose IoU with it is
+// >= iou_threshold is suppressed. Returns the kept indices (into dets) as an
+// int64 tensor, in visiting order.
+//
+// NOTE(review): the comparison here is `>=`, while nms_rotated_cuda_kernel
+// uses `>` -- confirm whether the difference is intended.
+template <typename scalar_t>
+at::Tensor nms_rotated_cpu_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
+  AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
+  AT_ASSERTM(
+      dets.type() == scores.type(), "dets should have the same type as scores");
+
+  const auto index_options = dets.options().dtype(at::kLong);
+  if (dets.numel() == 0) {
+    return at::empty({0}, index_options);
+  }
+
+  const auto ndets = dets.size(0);
+  at::Tensor order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  at::Tensor keep_t = at::zeros({ndets}, index_options);
+  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+
+  int64_t* order = order_t.data_ptr<int64_t>();
+  int64_t* keep = keep_t.data_ptr<int64_t>();
+  uint8_t* suppressed = suppressed_t.data_ptr<uint8_t>();
+
+  int64_t num_to_keep = 0;
+
+  for (int64_t rank_i = 0; rank_i < ndets; ++rank_i) {
+    const int64_t i = order[rank_i];
+    if (suppressed[i] == 1) {
+      continue;
+    }
+
+    keep[num_to_keep] = i;
+    ++num_to_keep;
+
+    // Only lower-scored boxes can be suppressed by box i.
+    for (int64_t rank_j = rank_i + 1; rank_j < ndets; ++rank_j) {
+      const int64_t j = order[rank_j];
+      if (suppressed[j] != 1) {
+        const auto iou = single_box_iou_rotated<scalar_t>(
+            dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
+        if (iou >= iou_threshold) {
+          suppressed[j] = 1;
+        }
+      }
+    }
+  }
+  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+}
+
+// CPU entry point for rotated-box NMS: dispatches on the floating-point dtype
+// of dets (AT_DISPATCH_FLOATING_TYPES) and forwards to nms_rotated_cpu_kernel.
+at::Tensor nms_rotated_cpu(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    const float iou_threshold) {
+  // Placeholder that the dispatched lambda overwrites.
+  at::Tensor kept = at::empty({0}, dets.options());
+
+  AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
+    kept = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+  });
+
+  return kept;
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
new file mode 100644
index 0000000..9e02203
--- /dev/null
+++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
@@ -0,0 +1,136 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include "../box_iou_rotated/box_iou_rotated_utils.h"
+
+using namespace detectron2;
+
+namespace {
+// Number of bits in one unsigned long long mask word. It is used both as the
+// CUDA block size and as the number of boxes covered by one mask word.
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+}
+
+// Builds the pairwise suppression bit mask for rotated-box NMS.
+//
+// Block (blockIdx.x, blockIdx.y) compares the boxes of row tile blockIdx.y
+// against the boxes of column tile blockIdx.x, each tile holding up to
+// threadsPerBlock boxes. Thread t of the block owns row box
+// threadsPerBlock * row_start + t and writes one mask word to
+//   dev_mask[cur_box_idx * col_blocks + col_start]
+// in which bit i is set when the IoU with column box i of the tile is
+// > iou_threshold.
+//
+// dev_boxes is read as n_boxes consecutive groups of 5 values.
+template <typename T>
+__global__ void nms_rotated_cuda_kernel(
+    const int n_boxes,
+    const float iou_threshold,
+    const T* dev_boxes,
+    unsigned long long* dev_mask) {
+  // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  // The last tile along each axis may hold fewer than threadsPerBlock boxes.
+  const int row_size =
+      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Compared to nms_cuda_kernel, where each box is represented with 4 values
+  // (x1, y1, x2, y2), each rotated box is represented with 5 values
+  // (x_center, y_center, width, height, angle_degrees) here.
+  // Stage the column tile in shared memory: thread t copies column box t.
+  __shared__ T block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  // All copies into block_boxes must be complete before any thread reads it.
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const T* cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    // On a diagonal tile, compare only against boxes after this one, so a box
+    // is never compared with itself or with an earlier box of the same tile.
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      // Instead of devIoU used by original horizontal nms, here
+      // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
+      if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
+          iou_threshold) {
+        t |= 1ULL << i;
+      }
+    }
+    // Number of mask words per box row.
+    const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+namespace detectron2 {
+
+// Rotated-box NMS on the GPU.
+//
+// Sorts the boxes by descending score, lets nms_rotated_cuda_kernel build a
+// pairwise suppression bit mask on the device, then walks that mask on the
+// host to pick the kept boxes. Returns an int64 tensor, on the device of the
+// inputs, holding the indices into dets of the kept boxes in descending
+// score order.
+//
+// Raises a c10 error if dets or scores is not a CUDA tensor, or if the kernel
+// launch fails.
+at::Tensor nms_rotated_cuda(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    float iou_threshold) {
+  // using scalar_t = float;
+  AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
+  AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
+  at::cuda::CUDAGuard device_guard(dets.device());
+
+  // With no boxes the grid below would be 0 x 0, which is not a valid launch
+  // configuration. Return early, as nms_rotated_cpu_kernel does.
+  if (dets.numel() == 0) {
+    return at::empty({0}, dets.options().dtype(at::kLong));
+  }
+
+  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+  auto dets_sorted = dets.index_select(0, order_t);
+
+  int dets_num = dets.size(0);
+
+  // Number of 64-bit mask words needed to cover one row of boxes.
+  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);
+
+  // One row of col_blocks mask words per box; the size is computed in 64 bits.
+  at::Tensor mask = at::empty(
+      {static_cast<int64_t>(dets_num) * col_blocks},
+      dets.options().dtype(at::kLong));
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(threadsPerBlock);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      dets_sorted.type(), "nms_rotated_kernel_cuda", [&] {
+        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+            dets_num,
+            iou_threshold,
+            dets_sorted.data_ptr<scalar_t>(),
+            reinterpret_cast<unsigned long long*>(mask.data_ptr<int64_t>()));
+      });
+  // Check the launch before the mask is copied back and consumed.
+  AT_CUDA_CHECK(cudaGetLastError());
+
+  at::Tensor mask_cpu = mask.to(at::kCPU);
+  unsigned long long* mask_host =
+      reinterpret_cast<unsigned long long*>(mask_cpu.data_ptr<int64_t>());
+
+  // remv[k] holds the "already suppressed" bits for boxes 64k .. 64k + 63.
+  std::vector<unsigned long long> remv(col_blocks, 0ULL);
+
+  at::Tensor keep =
+      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+  int64_t* keep_out = keep.data_ptr<int64_t>();
+
+  int num_to_keep = 0;
+  for (int i = 0; i < dets_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      // Merge everything box i suppresses into the running mask. Words
+      // before nblock only concern boxes that were already visited.
+      unsigned long long* p =
+          mask_host + static_cast<int64_t>(i) * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+
+  // keep holds positions in the sorted order; map them back through order_t.
+  return order_t.index(
+      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+           .to(order_t.device(), keep.scalar_type())});
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/csrc/vision.cpp b/detectron2/layers/csrc/vision.cpp
new file mode 100644
index 0000000..83ea78f
--- /dev/null
+++ b/detectron2/layers/csrc/vision.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+#include <torch/extension.h>
+#include "ROIAlign/ROIAlign.h"
+#include "ROIAlignRotated/ROIAlignRotated.h"
+#include "box_iou_rotated/box_iou_rotated.h"
+#include "deformable/deform_conv.h"
+#include "nms_rotated/nms_rotated.h"
+
+namespace detectron2 {
+
+#ifdef WITH_CUDA
+extern int get_cudart_version();
+#endif
+
+// Formats the integer reported by get_cudart_version() as a dotted version
+// string (e.g. 10010 becomes "10.1"), or returns "not available" when this
+// file is compiled without WITH_CUDA.
+std::string get_cuda_version() {
+#ifndef WITH_CUDA
+  return std::string("not available");
+#else
+  // Formatting mirrors
+  // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
+  const int version = get_cudart_version();
+  const int major = version / 1000;
+  const int minor = version / 10 % 100;
+  const int patch = version % 10;
+
+  std::ostringstream out;
+  out << major << "." << minor;
+  // The patch component is only printed when it is non-zero.
+  if (patch != 0) {
+    out << "." << patch;
+  }
+  return out.str();
+#endif
+}
+
+// Builds a short description of the compiler used for this translation unit
+// from its predefined macros (e.g. "GCC 7.4"). Modeled on
+// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
+std::string get_compiler_version() {
+  std::ostringstream description;
+
+#if defined(__GNUC__) && !defined(__clang__)
+  // Only reported as GCC when not compiling with clang.
+  description << "GCC " << __GNUC__ << "." << __GNUC_MINOR__;
+#endif
+
+#if defined(__clang_major__)
+  description << "clang " << __clang_major__ << "." << __clang_minor__ << "."
+              << __clang_patchlevel__;
+#endif
+
+#if defined(_MSC_VER)
+  description << "MSVC " << _MSC_FULL_VER;
+#endif
+
+  return description.str();
+}
+
+// Python bindings for the extension module. Each m.def() call exposes one C++
+// function under the Python name given as the first argument; the third
+// argument is the docstring attached to the Python function.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Build/environment introspection helpers defined in this file.
+  m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
+  m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
+
+  // Declared in box_iou_rotated/box_iou_rotated.h.
+  m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes");
+
+  // Deformable convolution ops (plain and modulated), forward and backward.
+  m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
+  m.def(
+      "deform_conv_backward_input",
+      &deform_conv_backward_input,
+      "deform_conv_backward_input");
+  m.def(
+      "deform_conv_backward_filter",
+      &deform_conv_backward_filter,
+      "deform_conv_backward_filter");
+  m.def(
+      "modulated_deform_conv_forward",
+      &modulated_deform_conv_forward,
+      "modulated_deform_conv_forward");
+  m.def(
+      "modulated_deform_conv_backward",
+      &modulated_deform_conv_backward,
+      "modulated_deform_conv_backward");
+
+  m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes");
+
+  // Note: the Python names are snake_case while the C++ symbols are not.
+  m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
+  m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
+
+  m.def(
+      "roi_align_rotated_forward",
+      &ROIAlignRotated_forward,
+      "Forward pass for Rotated ROI-Align Operator");
+  m.def(
+      "roi_align_rotated_backward",
+      &ROIAlignRotated_backward,
+      "Backward pass for Rotated ROI-Align Operator");
+}
+
+} // namespace detectron2
diff --git a/detectron2/layers/deform_conv.py b/detectron2/layers/deform_conv.py
new file mode 100644
index 0000000..d5ced72
--- /dev/null
+++ b/detectron2/layers/deform_conv.py
@@ -0,0 +1,494 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from functools import lru_cache
+import torch
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+from .wrappers import _NewEmptyTensorOp
+
+
+class _DeformConv(Function):
+    """
+    Autograd function for deformable convolution, backed by the ``_C`` extension ops.
+
+    Only CUDA tensors are supported; CPU tensors raise ``NotImplementedError``.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        weight,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        im2col_step=64,
+    ):
+        """
+        Run the forward deformable convolution.
+
+        Args:
+            input (Tensor): 4D input tensor; must be a CUDA tensor.
+            offset (Tensor): sampling offsets, forwarded as-is to the ``_C`` op
+                (its layout is defined by that op and is not visible here).
+            weight (Tensor): convolution weight; dims 2 and 3 are the spatial kernel sizes.
+            stride, padding, dilation (int or pair): expanded to pairs with ``_pair``.
+            groups (int), deformable_groups (int): forwarded unchanged to the ``_C`` op.
+            im2col_step (int): preferred im2col step size; the value actually used is
+                chosen by :meth:`_cal_im2col_step` so that it divides the batch size.
+
+        Returns:
+            Tensor: the output, with the shape computed by :meth:`_output_size`.
+
+        Raises:
+            ValueError: if ``input`` is not 4D or the output would have a non-positive size.
+            NotImplementedError: if ``input`` is not a CUDA tensor.
+        """
+        if input is not None and input.dim() != 4:
+            raise ValueError(
+                "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
+            )
+        ctx.stride = _pair(stride)
+        ctx.padding = _pair(padding)
+        ctx.dilation = _pair(dilation)
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.im2col_step = im2col_step
+
+        ctx.save_for_backward(input, offset, weight)
+
+        output = input.new_empty(
+            _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
+        )
+
+        # Scratch buffers handed to the extension ops and reused in backward.
+        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+        if not input.is_cuda:
+            raise NotImplementedError
+
+        cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+        assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+        # The extension takes width before height for kernel/stride/padding/dilation.
+        _C.deform_conv_forward(
+            input,
+            weight,
+            offset,
+            output,
+            ctx.bufs_[0],
+            ctx.bufs_[1],
+            weight.size(3),
+            weight.size(2),
+            ctx.stride[1],
+            ctx.stride[0],
+            ctx.padding[1],
+            ctx.padding[0],
+            ctx.dilation[1],
+            ctx.dilation[0],
+            ctx.groups,
+            ctx.deformable_groups,
+            cur_im2col_step,
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        """
+        Compute gradients w.r.t. ``input``, ``offset`` and ``weight``.
+
+        Returns:
+            tuple: one entry per ``forward`` argument; gradients that were not requested
+            and all non-tensor arguments are ``None``.
+
+        Raises:
+            NotImplementedError: if ``grad_output`` is not a CUDA tensor.
+        """
+        input, offset, weight = ctx.saved_tensors
+
+        grad_input = grad_offset = grad_weight = None
+
+        if not grad_output.is_cuda:
+            raise NotImplementedError
+
+        cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+        assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+        # A single extension call produces both the input and the offset gradient.
+        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+            grad_input = torch.zeros_like(input)
+            grad_offset = torch.zeros_like(offset)
+            _C.deform_conv_backward_input(
+                input,
+                offset,
+                grad_output,
+                grad_input,
+                grad_offset,
+                weight,
+                ctx.bufs_[0],
+                weight.size(3),
+                weight.size(2),
+                ctx.stride[1],
+                ctx.stride[0],
+                ctx.padding[1],
+                ctx.padding[0],
+                ctx.dilation[1],
+                ctx.dilation[0],
+                ctx.groups,
+                ctx.deformable_groups,
+                cur_im2col_step,
+            )
+
+        if ctx.needs_input_grad[2]:
+            grad_weight = torch.zeros_like(weight)
+            _C.deform_conv_backward_filter(
+                input,
+                offset,
+                grad_output,
+                grad_weight,
+                ctx.bufs_[0],
+                ctx.bufs_[1],
+                weight.size(3),
+                weight.size(2),
+                ctx.stride[1],
+                ctx.stride[0],
+                ctx.padding[1],
+                ctx.padding[0],
+                ctx.dilation[1],
+                ctx.dilation[0],
+                ctx.groups,
+                ctx.deformable_groups,
+                1,  # NOTE(review): presumably a gradient scale factor -- confirm in the C++ op
+                cur_im2col_step,
+            )
+
+        return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
+
+    @staticmethod
+    def _output_size(input, weight, padding, dilation, stride):
+        """
+        Compute the output shape ``(N, C_out, *spatial)`` of the convolution.
+
+        Raises:
+            ValueError: if any dimension of the resulting shape is not positive.
+        """
+        channels = weight.size(0)
+        output_size = (input.size(0), channels)
+        for d in range(input.dim() - 2):
+            in_size = input.size(d + 2)
+            pad = padding[d]
+            # Extent covered by the dilated kernel along this dimension.
+            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+            stride_ = stride[d]
+            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
+        if not all(map(lambda s: s > 0, output_size)):
+            raise ValueError(
+                "convolution input is too small (output would be {})".format(
+                    "x".join(map(str, output_size))
+                )
+            )
+        return output_size
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _cal_im2col_step(input_size, default_size):
+        """
+        Calculate a proper im2col step size: it must be a divisor of input_size and not
+        larger than default_size. The step should also be as large as possible to be more
+        efficient, so we choose the largest divisor of input_size that does not exceed
+        default_size.
+
+        :param input_size: input batch size.
+        :param default_size: default preferred im2col step size.
+        :return: the largest proper step size.
+        """
+        if input_size <= default_size:
+            return input_size
+        best_step = 1
+        # Walk the small divisors in increasing order; their cofactors decrease, so the
+        # first cofactor that fits is the largest admissible divisor. The upper bound is
+        # default_size + 1 so that default_size itself is considered as a candidate.
+        for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size + 1)):
+            if input_size % step == 0:
+                if input_size // step <= default_size:
+                    return input_size // step
+                best_step = step
+
+        return best_step
+
+
+class _ModulatedDeformConv(Function):
+    """
+    Autograd function for modulated deformable convolution, backed by the ``_C``
+    extension ops. Only CUDA tensors are supported.
+
+    ``stride``, ``padding`` and ``dilation`` are used as scalars here: the same value is
+    passed twice to the extension ops and used in arithmetic in :meth:`_infer_shape`.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        input,
+        offset,
+        mask,
+        weight,
+        bias=None,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+    ):
+        """
+        Run the forward modulated deformable convolution.
+
+        Returns:
+            Tensor: the output, with the shape computed by :meth:`_infer_shape`.
+
+        Raises:
+            NotImplementedError: if ``input`` is not a CUDA tensor.
+        """
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.dilation = dilation
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.with_bias = bias is not None
+        if not ctx.with_bias:
+            # The extension op always receives a bias tensor; with_bias tells it whether to use it.
+            bias = input.new_empty(1)  # fake tensor
+        if not input.is_cuda:
+            raise NotImplementedError
+        # Tensors are only saved when some gradient may be needed later.
+        if (
+            weight.requires_grad
+            or mask.requires_grad
+            or offset.requires_grad
+            or input.requires_grad
+        ):
+            ctx.save_for_backward(input, offset, mask, weight, bias)
+        output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
+        # Scratch buffers handed to the extension ops and reused in backward.
+        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+        _C.modulated_deform_conv_forward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            output,
+            ctx._bufs[1],
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        """
+        Compute gradients for ``input``, ``offset``, ``mask``, ``weight`` and ``bias``.
+
+        Returns:
+            tuple: one entry per ``forward`` argument; the bias gradient is ``None`` when
+            no bias was given, and all non-tensor arguments get ``None``.
+
+        Raises:
+            NotImplementedError: if ``grad_output`` is not a CUDA tensor.
+        """
+        if not grad_output.is_cuda:
+            raise NotImplementedError
+        input, offset, mask, weight, bias = ctx.saved_tensors
+        # All gradients are allocated up front and filled in by a single extension call.
+        grad_input = torch.zeros_like(input)
+        grad_offset = torch.zeros_like(offset)
+        grad_mask = torch.zeros_like(mask)
+        grad_weight = torch.zeros_like(weight)
+        grad_bias = torch.zeros_like(bias)
+        _C.modulated_deform_conv_backward(
+            input,
+            weight,
+            bias,
+            ctx._bufs[0],
+            offset,
+            mask,
+            ctx._bufs[1],
+            grad_input,
+            grad_weight,
+            grad_bias,
+            grad_offset,
+            grad_mask,
+            grad_output,
+            weight.shape[2],
+            weight.shape[3],
+            ctx.stride,
+            ctx.stride,
+            ctx.padding,
+            ctx.padding,
+            ctx.dilation,
+            ctx.dilation,
+            ctx.groups,
+            ctx.deformable_groups,
+            ctx.with_bias,
+        )
+        if not ctx.with_bias:
+            # The bias was a placeholder created in forward, so it has no gradient.
+            grad_bias = None
+
+        return (
+            grad_input,
+            grad_offset,
+            grad_mask,
+            grad_weight,
+            grad_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+    @staticmethod
+    def _infer_shape(ctx, input, weight):
+        """
+        Compute the output shape ``(N, C_out, H_out, W_out)`` from the input and weight
+        shapes and the scalar stride/padding/dilation stored on ``ctx``.
+        """
+        n = input.size(0)
+        channels_out = weight.size(0)
+        height, width = input.shape[2:4]
+        kernel_h, kernel_w = weight.shape[2:4]
+        height_out = (
+            height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
+        ) // ctx.stride + 1
+        width_out = (
+            width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
+        ) // ctx.stride + 1
+        return n, channels_out, height_out, width_out
+
+
+# Functional entry points: aliases for the ``apply`` method of the autograd Functions above.
+deform_conv = _DeformConv.apply
+modulated_deform_conv = _ModulatedDeformConv.apply
+
+
+class DeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Deformable convolution layer (bias is not supported).
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer applied to the output
+            activation (callable(Tensor) -> Tensor): a callable activation function
+                applied after the normalization
+        """
+        super().__init__()
+
+        assert not bias
+        assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
+            in_channels, groups
+        )
+        assert (
+            out_channels % groups == 0
+        ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)
+
+        # Plain hyper-parameters.
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+
+        # Spatial hyper-parameters are always stored as (h, w) pairs.
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+
+        # Optional post-processing applied in forward().
+        self.norm = norm
+        self.activation = activation
+
+        weight_shape = (out_channels, in_channels // self.groups) + tuple(self.kernel_size)
+        self.weight = nn.Parameter(torch.Tensor(*weight_shape))
+        self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+
+    def forward(self, x, offset):
+        if x.numel() == 0:
+            # An empty input still yields a tensor of the shape a regular convolution
+            # would produce, so that downstream shape checks keep working.
+            spatial = []
+            for size, pad, dil, kernel, step in zip(
+                x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+            ):
+                dilated_kernel = dil * (kernel - 1) + 1
+                spatial.append((size + 2 * pad - dilated_kernel) // step + 1)
+            return _NewEmptyTensorOp.apply(x, [x.shape[0], self.weight.shape[0]] + spatial)
+
+        out = deform_conv(
+            x,
+            offset,
+            self.weight,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        # Normalization first, then activation; either may be absent.
+        for layer in (self.norm, self.activation):
+            if layer is not None:
+                out = layer(out)
+        return out
+
+    def extra_repr(self):
+        names = (
+            "in_channels",
+            "out_channels",
+            "kernel_size",
+            "stride",
+            "padding",
+            "dilation",
+            "groups",
+            "deformable_groups",
+        )
+        parts = [name + "=" + str(getattr(self, name)) for name in names]
+        parts.append("bias=False")
+        return ", ".join(parts)
+
+
+class ModulatedDeformConv(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        deformable_groups=1,
+        bias=True,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Modulated deformable convolution.
+
+        Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+        Args:
+            deformable_groups (int): number of groups used in deformable convolution.
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+        """
+        super(ModulatedDeformConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        # stride/padding/dilation are stored exactly as given (not expanded to pairs)
+        # because they are forwarded unchanged to `modulated_deform_conv`.
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.with_bias = bias
+        self.norm = norm
+        self.activation = activation
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def _empty_output_shape(self, x):
+        """Return the [N, C_out, H_out, W_out] shape a convolution of empty ``x`` would have."""
+        # stride/padding/dilation may be plain ints here, so expand them to pairs
+        # before zipping them with the two spatial dimensions.
+        spatial = [
+            (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+            for i, p, di, k, s in zip(
+                x.shape[-2:],
+                _pair(self.padding),
+                _pair(self.dilation),
+                self.kernel_size,
+                _pair(self.stride),
+            )
+        ]
+        return [x.shape[0], self.weight.shape[0]] + spatial
+
+    def forward(self, x, offset, mask):
+        """
+        Apply the modulated deformable convolution, then the optional norm and activation.
+
+        For an empty ``x`` (zero elements) no convolution is run; an empty tensor with
+        the output shape a regular convolution would have is returned instead.
+        """
+        if x.numel() == 0:
+            return _NewEmptyTensorOp.apply(x, self._empty_output_shape(x))
+
+        x = modulated_deform_conv(
+            x,
+            offset,
+            mask,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.deformable_groups,
+        )
+        if self.norm is not None:
+            x = self.norm(x)
+        if self.activation is not None:
+            x = self.activation(x)
+        return x
+
+    def extra_repr(self):
+        tmpstr = "in_channels=" + str(self.in_channels)
+        tmpstr += ", out_channels=" + str(self.out_channels)
+        tmpstr += ", kernel_size=" + str(self.kernel_size)
+        tmpstr += ", stride=" + str(self.stride)
+        tmpstr += ", padding=" + str(self.padding)
+        tmpstr += ", dilation=" + str(self.dilation)
+        tmpstr += ", groups=" + str(self.groups)
+        tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+        tmpstr += ", bias=" + str(self.with_bias)
+        return tmpstr
diff --git a/detectron2/layers/mask_ops.py b/detectron2/layers/mask_ops.py
new file mode 100644
index 0000000..9dfa118
--- /dev/null
+++ b/detectron2/layers/mask_ops.py
@@ -0,0 +1,241 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import torch
+from PIL import Image
+from torch.nn import functional as F
+
+__all__ = ["paste_masks_in_image"]
+
+
+# Bytes per float element; used together with GPU_MEM_LIMIT to decide how many
+# chunks the masks are split into when pasting on a non-CPU device.
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
+GPU_MEM_LIMIT = 1024 ** 3  # 1 GB memory limit
+
+
+def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
+    """
+    Resample a batch of masks into image coordinates with ``grid_sample``.
+
+    Args:
+        masks: N, 1, H, W
+        boxes: N, 4
+        img_h, img_w (int):
+        skip_empty (bool): only paste masks within the region that
+            tightly bounds all boxes, and return the result for this region only.
+            An important optimization for CPU.
+
+    Returns:
+        A tuple ``(pasted, region)``:
+        if skip_empty == False, ``pasted`` has shape (N, img_h, img_w) and ``region`` is ``()``;
+        if skip_empty == True, ``pasted`` has shape (N, h', w') and ``region`` is the pair
+        of slice objects (rows, columns) for the corresponding region.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks.
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale dataset.
+    device = masks.device
+    if not skip_empty:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    else:
+        # Bounding region of all boxes, grown by one pixel and clamped to the image.
+        top_left = boxes.min(dim=0).values.floor()[:2] - 1
+        x0_int, y0_int = torch.clamp(top_left, min=0).to(dtype=torch.int32)
+        x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    num_masks = masks.shape[0]
+
+    # Pixel centers of the target region, mapped per box into [-1, 1] coordinates.
+    ys = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+    xs = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+    ys = (ys - y0) / (y1 - y0) * 2 - 1
+    xs = (xs - x0) / (x1 - x0) * 2 - 1
+    # xs, ys have shapes (N, w), (N, h)
+
+    out_h, out_w = ys.size(1), xs.size(1)
+    grid_x = xs[:, None, :].expand(num_masks, out_h, out_w)
+    grid_y = ys[:, :, None].expand(num_masks, out_h, out_w)
+    sampling_grid = torch.stack([grid_x, grid_y], dim=3)
+
+    pasted = F.grid_sample(masks.to(dtype=torch.float32), sampling_grid, align_corners=False)
+
+    region = (slice(y0_int, y1_int), slice(x0_int, x1_int)) if skip_empty else ()
+    return pasted[:, 0], region
+
+
+def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5):
+    """
+    Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
+    The location, height, and width for pasting each mask is determined by their
+    corresponding bounding boxes in boxes.
+
+    Args:
+        masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
+            detected object instances in the image and Hmask, Wmask are the mask height and
+            mask width of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
+        boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond
+            to the same object instance.
+        image_shape (tuple or list): height, width
+        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+            binary masks. A negative value skips binarization and returns the soft
+            masks scaled to [0, 255] instead (for visualization and debugging).
+
+    Returns:
+        img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+        number of detected object instances and Himage, Wimage are the image height
+        and width. img_masks[i] is a binary mask for object instance i.
+        The dtype is bool when threshold >= 0 and uint8 otherwise; when there are
+        no masks, an empty uint8 tensor is returned.
+    """
+    assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
+    # Unpack first so that any 2-element sequence (tuple, list, torch.Size) is accepted
+    # on every code path, including the empty-input early return below.
+    img_h, img_w = image_shape
+
+    N = len(masks)
+    if N == 0:
+        return masks.new_empty((0, img_h, img_w), dtype=torch.uint8)
+
+    boxes = boxes.tensor
+    device = boxes.device
+    assert len(boxes) == N, boxes.shape
+
+    # The actual implementation split the input into chunks,
+    # and paste them chunk by chunk.
+    if device.type == "cpu":
+        # CPU is most efficient when they are pasted one by one with skip_empty=True
+        # so that it performs minimal number of operations.
+        num_chunks = N
+    else:
+        # GPU benefits from parallelism for larger chunks, but may have memory issue
+        num_chunks = int(np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+        assert (
+            num_chunks <= N
+        ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
+    chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+    img_masks = torch.zeros(
+        N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
+    )
+    for inds in chunks:
+        masks_chunk, spatial_inds = _do_paste_mask(
+            masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
+        )
+
+        if threshold >= 0:
+            masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+        else:
+            # for visualization and debugging
+            masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+        # spatial_inds is empty when the whole image was sampled, otherwise it selects
+        # the sub-region that was actually pasted.
+        img_masks[(inds,) + spatial_inds] = masks_chunk
+    return img_masks
+
+
+# The function below is the original paste function (from Detectron1), which has
+# larger quantization error.
+# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
+
+
+def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
+    """
+    Paste a single mask in an image.
+    This is a per-box implementation of :func:`paste_masks_in_image`.
+    This function has larger quantization error due to incorrect pixel
+    modeling and is not used any more.
+
+    Args:
+        mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
+            object instance. Values are in [0, 1].
+        box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
+            of the object instance.
+        img_h, img_w (int): Image height and width.
+        threshold (float): Mask binarization threshold in [0, 1]. A negative value
+            skips binarization and returns the resized mask scaled by 255 instead.
+
+    Returns:
+        im_mask (Tensor):
+            The resized and binarized object mask pasted into the original
+            image plane (a uint8 tensor of shape (img_h, img_w)).
+    """
+    # Conversion from continuous box coordinates to discrete pixel coordinates
+    # via truncation (cast to int32). This determines which pixels to paste the
+    # mask onto.
+    box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
+    # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
+    # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
+    # pixels (not x1 - x0 pixels).
+    samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
+    samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height
+
+    # Resample the mask from its original grid to the new samples_w x samples_h grid
+    mask = Image.fromarray(mask.cpu().numpy())
+    mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
+    mask = np.array(mask, copy=False)
+
+    if threshold >= 0:
+        mask = np.array(mask > threshold, dtype=np.uint8)
+        mask = torch.from_numpy(mask)
+    else:
+        # for visualization and debugging, we also
+        # allow it to return an unmodified mask
+        mask = torch.from_numpy(mask * 255).to(torch.uint8)
+
+    im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
+    # Clip the (inclusive) box to the image bounds; the upper bounds are exclusive.
+    x_0 = max(box[0], 0)
+    x_1 = min(box[2] + 1, img_w)
+    y_0 = max(box[1], 0)
+    y_1 = min(box[3] + 1, img_h)
+
+    # Copy only the part of the resized mask that falls inside the image: the source
+    # indices are the clipped image coordinates shifted by the box origin.
+    im_mask[y_0:y_1, x_0:x_1] = mask[
+        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+    ]
+    return im_mask
+
+
+# Our pixel modeling requires extrapolation for any continuous
+# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
+# we would like this extrapolation to be an interpolation between boundary values and zero,
+# instead of using absolute zero or boundary values.
+# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
+# masks, scale = pad_masks(masks[:, 0, :, :], 1)
+# boxes = scale_boxes(boxes.tensor, scale)
+
+
+def pad_masks(masks, padding):
+    """
+    Zero-pad a batch of square masks on all sides.
+
+    Args:
+        masks (tensor): A tensor of shape (B, M, M) representing B masks.
+        padding (int): Number of cells to pad on all sides. May be 0, in which
+            case the returned masks are a copy of the input and the scale is 1.0.
+
+    Returns:
+        The padded masks and the scale factor of the padding size / original size.
+    """
+    B = masks.shape[0]
+    M = masks.shape[-1]
+    pad2 = 2 * padding
+    scale = float(M + pad2) / M
+    padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
+    # Use explicit end indices: a `padding:-padding` slice would be empty for padding == 0.
+    padded_masks[:, padding : padding + M, padding : padding + M] = masks
+    return padded_masks, scale
+
+
+def scale_boxes(boxes, scale):
+    """
+    Scale each box about its own center.
+
+    Args:
+        boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
+            coords representing the corners x0, y0, x1, y1,
+        scale (float): The box scaling factor.
+
+    Returns:
+        Scaled boxes.
+    """
+    x0, y0, x1, y1 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
+
+    # Half extents and centers of the original boxes.
+    half_w = (x1 - x0) * 0.5
+    half_h = (y1 - y0) * 0.5
+    center_x = (x1 + x0) * 0.5
+    center_y = (y1 + y0) * 0.5
+
+    # Only the extents are scaled; the centers stay where they are.
+    half_w *= scale
+    half_h *= scale
+
+    result = torch.zeros_like(boxes)
+    result[:, 0] = center_x - half_w
+    result[:, 1] = center_y - half_h
+    result[:, 2] = center_x + half_w
+    result[:, 3] = center_y + half_h
+    return result
diff --git a/detectron2/layers/nms.py b/detectron2/layers/nms.py
new file mode 100644
index 0000000..726a963
--- /dev/null
+++ b/detectron2/layers/nms.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torchvision.ops import boxes as box_ops
+from torchvision.ops import nms  # BC-compat
+
+
+def batched_nms(boxes, scores, idxs, iou_threshold):
+    """
+    Same contract as torchvision.ops.boxes.batched_nms, but for large inputs
+    (>= 40000 boxes) NMS is run separately per category instead.
+    """
+    assert boxes.shape[-1] == 4
+    # TODO may need better strategy.
+    # Investigate after having a fully-cuda NMS op.
+    if len(boxes) < 40000:
+        return box_ops.batched_nms(boxes, scores, idxs, iou_threshold)
+
+    kept = scores.new_zeros(scores.size(), dtype=torch.bool)
+    for category in torch.unique(idxs).cpu().tolist():
+        members = (idxs == category).nonzero().view(-1)
+        kept[members[nms(boxes[members], scores[members], iou_threshold)]] = True
+    survivors = kept.nonzero().view(-1)
+    # Return the surviving indices ordered by decreasing score.
+    return survivors[scores[survivors].argsort(descending=True)]
+
+
+# Note: this function (nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+def nms_rotated(boxes, scores, iou_threshold):
+    """
+    Performs non-maximum suppression (NMS) on the rotated boxes according
+    to their intersection-over-union (IoU).
+
+    Rotated NMS iteratively removes lower scoring rotated boxes which have an
+    IoU greater than iou_threshold with another (higher scoring) rotated box.
+
+    Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
+    RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
+    can be representing completely different objects in certain tasks, e.g., OCR.
+
+    As for the question of whether rotated-NMS should treat them as faraway boxes
+    even though their IOU is 1, it depends on the application and/or ground truth annotation.
+
+    As an extreme example, consider a single character v and the square box around it.
+
+    If the angle is 0 degrees, the object (text) would be read as 'v';
+
+    If the angle is 90 degrees, the object (text) would become '>';
+
+    If the angle is 180 degrees, the object (text) would become '^';
+
+    If the angle is 270/-90 degrees, the object (text) would become '<'
+
+    All of these cases have IoU of 1 to each other, and rotated NMS that only
+    uses IoU as criterion would only keep one of them with the highest score -
+    which, practically, still makes sense in most cases because typically
+    only one of these orientations is the correct one. Also, it does not matter
+    as much if the box is only used to classify the object (instead of transcribing
+    them with a sequential OCR recognition model) later.
+
+    On the other hand, when we use IoU to filter proposals that are close to the
+    ground truth during training, we should definitely take the angle into account if
+    we know the ground truth is labeled with the strictly correct orientation (as in,
+    upside-down words are annotated with -180 degrees even though they can be covered
+    with a 0/90/-90 degree box, etc.)
+
+    The way the original dataset is annotated also matters. For example, if the dataset
+    is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
+    we can estimate a minimum rotated bounding box to this polygon, but there's no way
+    we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
+    rotated boxes, with angles differing by 90 degrees from each other, covering exactly the
+    same region). In that case we have to just use IoU to determine the box
+    proximity (as many detection benchmarks (even for text) do) unless there're other
+    assumptions we can make (like width is always larger than height, or the object is not
+    rotated by more than 90 degrees CCW/CW, etc.)
+
+    In summary, not considering angles in rotated NMS seems to be a good option for now,
+    but we should be aware of its implications.
+
+    Args:
+        boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
+           (x_center, y_center, width, height, angle_degrees) format.
+        scores (Tensor[N]): Scores for each one of the rotated boxes
+        iou_threshold (float): Discards all overlapping rotated boxes with IoU > iou_threshold
+
+    Returns:
+        keep (Tensor): int64 tensor with the indices of the elements that have been kept
+        by Rotated NMS, sorted in decreasing order of scores
+    """
+    from detectron2 import _C  # imported at call time rather than at module import
+
+    return _C.nms_rotated(boxes, scores, iou_threshold)
+
+
+# Note: this function (batched_nms_rotated) might be moved into
+# torchvision/ops/boxes.py in the future
+def batched_nms_rotated(boxes, scores, idxs, iou_threshold):
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value corresponds to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Args:
+        boxes (Tensor[N, 5]):
+           boxes where NMS will be performed. They
+           are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
+        scores (Tensor[N]):
+           scores for each one of the boxes
+        idxs (Tensor[N]):
+           indices of the categories for each one of the boxes.
+        iou_threshold (float):
+           discards all overlapping boxes
+           with IoU > iou_threshold
+
+    Returns:
+        Tensor:
+            int64 tensor with the indices of the elements that have been kept
+            by NMS, sorted in decreasing order of scores
+    """
+    assert boxes.shape[-1] == 5
+
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+    # Strategy: in order to perform NMS independently per class,
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+
+    # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
+    # which won't handle negative coordinates correctly. Here min_coordinate is used
+    # as well; both bounds must use the *larger* box side, since a rotated box can
+    # extend by at least max(width, height) / 2 from its center along either axis.
+    max_coordinate = (
+        torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).max()
+    min_coordinate = (
+        torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
+    ).min()
+    offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
+    boxes_for_nms = boxes.clone()  # avoid modifying the original values in boxes
+    boxes_for_nms[:, :2] += offsets[:, None]
+    keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
+    return keep
diff --git a/detectron2/layers/roi_align.py b/detectron2/layers/roi_align.py
new file mode 100644
index 0000000..328bbab
--- /dev/null
+++ b/detectron2/layers/roi_align.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+
+class _ROIAlign(Function):
+    @staticmethod
+    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio, aligned):
+        # Normalize before indexing so that an int output_size works as well as an
+        # (h, w) pair; the normalized pair is also what backward reads back.
+        output_size = _pair(output_size)
+        ctx.save_for_backward(roi)
+        ctx.output_size = output_size
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.input_shape = input.size()
+        ctx.aligned = aligned
+        return _C.roi_align_forward(
+            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned
+        )
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        rois, = ctx.saved_tensors
+        output_size = ctx.output_size
+        bs, ch, h, w = ctx.input_shape
+        grad_input = _C.roi_align_backward(
+            grad_output,
+            rois,
+            ctx.spatial_scale,
+            output_size[0],
+            output_size[1],
+            bs,
+            ch,
+            h,
+            w,
+            ctx.sampling_ratio,
+            ctx.aligned,
+        )
+        return grad_input, None, None, None, None, None
+
+
+roi_align = _ROIAlign.apply
+
+
+class ROIAlign(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
+        """
+        Args:
+            output_size (tuple): h, w of the output
+            spatial_scale (float): factor by which the input boxes are scaled
+            sampling_ratio (int): number of input samples taken per output
+                sample; 0 samples densely.
+            aligned (bool): False selects the legacy implementation in
+                Detectron, True aligns the results more precisely (see Note).
+
+        Note:
+            What aligned=True means:
+
+            In our pixel model, a continuous coordinate c has the two neighboring
+            pixel indices floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has
+            neighbors with discrete indices [0] and [1] (which are sampled from the
+            underlying signal at continuous coordinates 0.5 and 1.5). The original
+            roi_align (aligned=False) omits the 0.5 subtraction when computing the
+            neighbor indices, so its bilinear interpolation uses pixels that are
+            slightly misaligned relative to our pixel model.
+
+            With `aligned=True`,
+            we first appropriately scale the ROI and then shift it by -0.5
+            prior to calling roi_align. This produces the correct neighbors; see
+            detectron2/tests/test_roi_align.py for verification.
+
+            In practice this does not change the model's performance when
+            ROIAlign is used together with conv layers.
+        """
+        super().__init__()
+        self.aligned = aligned
+        self.sampling_ratio = sampling_ratio
+        self.spatial_scale = spatial_scale
+        self.output_size = output_size
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
+        """
+        assert rois.dim() == 2 and rois.size(1) == 5
+        # The module only stores the configuration; the work happens in `roi_align`.
+        settings = (self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned)
+        return roi_align(input, rois, *settings)
+
+    def __repr__(self):
+        parts = [
+            "output_size=" + str(self.output_size),
+            ", spatial_scale=" + str(self.spatial_scale),
+            ", sampling_ratio=" + str(self.sampling_ratio),
+            ", aligned=" + str(self.aligned),
+        ]
+        return self.__class__.__name__ + "(" + "".join(parts) + ")"
diff --git a/detectron2/layers/roi_align_rotated.py b/detectron2/layers/roi_align_rotated.py
new file mode 100644
index 0000000..57381a9
--- /dev/null
+++ b/detectron2/layers/roi_align_rotated.py
@@ -0,0 +1,88 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from detectron2 import _C
+
+
+class _ROIAlignRotated(Function):
+    @staticmethod
+    def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
+        # Normalize before indexing so that an int output_size works, not only (h, w).
+        output_size = _pair(output_size)
+        ctx.save_for_backward(roi)
+        ctx.output_size = output_size
+        ctx.spatial_scale = spatial_scale
+        ctx.sampling_ratio = sampling_ratio
+        ctx.input_shape = input.size()
+        return _C.roi_align_rotated_forward(
+            input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
+        )
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        rois, = ctx.saved_tensors
+        output_size = ctx.output_size
+        bs, ch, h, w = ctx.input_shape
+        grad_input = _C.roi_align_rotated_backward(
+            grad_output,
+            rois,
+            ctx.spatial_scale,
+            output_size[0],
+            output_size[1],
+            bs,
+            ch,
+            h,
+            w,
+            ctx.sampling_ratio,
+        )
+        # One gradient per forward input (5 of them); only `input` is differentiable.
+        return grad_input, None, None, None, None
+
+
+roi_align_rotated = _ROIAlignRotated.apply
+
+
+class ROIAlignRotated(nn.Module):
+    def __init__(self, output_size, spatial_scale, sampling_ratio):
+        """
+        Args:
+            output_size (tuple): h, w of the output
+            spatial_scale (float): factor by which the input boxes are scaled
+            sampling_ratio (int): number of input samples taken per output
+                sample; 0 samples densely.
+
+        Note:
+            ROIAlignRotated supports continuous coordinates by default:
+            in our pixel model, a continuous coordinate c has the two neighboring
+            pixel indices floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has
+            neighbors with discrete indices [0] and [1] (which are sampled from the
+            underlying signal at continuous coordinates 0.5 and 1.5).
+        """
+        super().__init__()
+        self.sampling_ratio = sampling_ratio
+        self.spatial_scale = spatial_scale
+        self.output_size = output_size
+
+    def forward(self, input, rois):
+        """
+        Args:
+            input: NCHW images
+            rois: Bx6 boxes. First column is the index into N.
+                The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
+        """
+        assert rois.dim() == 2 and rois.size(1) == 6
+        # The module only stores the configuration; the work happens in `roi_align_rotated`.
+        settings = (self.output_size, self.spatial_scale, self.sampling_ratio)
+        return roi_align_rotated(input, rois, *settings)
+
+    def __repr__(self):
+        parts = [
+            "output_size=" + str(self.output_size),
+            ", spatial_scale=" + str(self.spatial_scale),
+            ", sampling_ratio=" + str(self.sampling_ratio),
+        ]
+        return self.__class__.__name__ + "(" + "".join(parts) + ")"
diff --git a/detectron2/layers/rotated_boxes.py b/detectron2/layers/rotated_boxes.py
new file mode 100644
index 0000000..2b212c9
--- /dev/null
+++ b/detectron2/layers/rotated_boxes.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# import torch
+from detectron2 import _C
+
+
+def pairwise_iou_rotated(boxes1, boxes2):
+    """
+    Return the pairwise intersection-over-union (Jaccard index) of two sets of rotated boxes.
+
+    Both sets of boxes are expected to be in
+    (x_center, y_center, width, height, angle) format.
+
+    Args:
+        boxes1 (Tensor[N, 5])
+        boxes2 (Tensor[M, 5])
+
+    Returns:
+        iou (Tensor[N, M]): the NxM matrix containing the pairwise
+            IoU values for every element in boxes1 and boxes2
+    """
+    # Thin wrapper: the computation is delegated to detectron2._C.box_iou_rotated.
+    return _C.box_iou_rotated(boxes1, boxes2)
diff --git a/detectron2/layers/shape_spec.py b/detectron2/layers/shape_spec.py
new file mode 100644
index 0000000..ed7f0d0
--- /dev/null
+++ b/detectron2/layers/shape_spec.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from collections import namedtuple
+
+
+class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
+    """
+    A simple structure that contains basic shape specification about a tensor.
+    It is often used as the auxiliary inputs/outputs of models,
+    so that shapes can be inferred across pytorch modules.
+
+    Attributes (keyword-only in the constructor; each defaults to None):
+        channels:
+        height:
+        width:
+        stride:
+    """
+
+    def __new__(cls, *, channels=None, height=None, width=None, stride=None):
+        return super().__new__(cls, channels, height, width, stride)
diff --git a/detectron2/layers/wrappers.py b/detectron2/layers/wrappers.py
new file mode 100644
index 0000000..42272cb
--- /dev/null
+++ b/detectron2/layers/wrappers.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Wrappers around some nn functions, mainly to support empty tensors.
+
+Ideally, add support directly in PyTorch to empty tensors in those functions.
+
+These can be removed once https://github.com/pytorch/pytorch/issues/12013
+is implemented
+"""
+
+import math
+import torch
+from torch.nn.modules.utils import _ntuple
+
+
+def cat(tensors, dim=0):
+    """
+    Concatenate like torch.cat, but skip the copy when the list holds a single tensor.
+    """
+    assert isinstance(tensors, (list, tuple))
+    # Note: the single-tensor fast path returns the input object itself, not a new tensor.
+    only_one = len(tensors) == 1
+    return tensors[0] if only_one else torch.cat(tensors, dim)
+
+
+class _NewEmptyTensorOp(torch.autograd.Function):
+    # Autograd-aware `new_empty`: backward applies the same op to produce an
+    # uninitialized gradient with the shape of the forward input.
+    @staticmethod
+    def forward(ctx, x, new_shape):
+        ctx.shape = x.shape  # remembered so backward can size the gradient
+        return x.new_empty(new_shape)
+
+    @staticmethod
+    def backward(ctx, grad):
+        return _NewEmptyTensorOp.apply(grad, ctx.shape), None
+
+
+class Conv2d(torch.nn.Conv2d):
+    """
+    :class:`torch.nn.Conv2d` with an optional norm layer and activation, and zero-size support.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
+
+        Args:
+            norm (nn.Module, optional): a normalization layer
+            activation (callable(Tensor) -> Tensor): a callable activation function
+
+        The norm layer is applied before the activation. For zero-size inputs,
+        neither of them is applied.
+        """
+        # Take our extra keywords out before delegating the rest to torch.nn.Conv2d.
+        norm, activation = kwargs.pop("norm", None), kwargs.pop("activation", None)
+        super().__init__(*args, **kwargs)
+
+        self.norm = norm
+        self.activation = activation
+
+    def forward(self, x):
+        # Regular path: convolution, then the optional norm and activation.
+        if x.numel() > 0:
+            out = super().forward(x)
+            if self.norm is not None:
+                out = self.norm(out)
+            if self.activation is not None:
+                out = self.activation(out)
+            return out
+
+        # When input is empty, we want to return a empty tensor with "correct" shape,
+        # So that the following operations will not panic
+        # if they check for the shape of the tensor.
+        # This computes the height and width of the output tensor
+        spatial = [
+            (size + 2 * pad - (dil * (kernel - 1) + 1)) // stride + 1
+            for size, pad, dil, kernel, stride in zip(
+                x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+            )
+        ]
+        empty = _NewEmptyTensorOp.apply(x, [x.shape[0], self.weight.shape[0]] + spatial)
+        if not self.training:
+            return empty
+
+        # https://github.com/pytorch/pytorch/issues/12013
+        assert not isinstance(
+            self.norm, torch.nn.SyncBatchNorm
+        ), "SyncBatchNorm does not support empty inputs!"
+        # This is to make DDP happy.
+        # DDP expects all workers to have gradient w.r.t the same set of parameters.
+        _dummy = sum(p.view(-1)[0] for p in self.parameters()) * 0.0
+        return empty + _dummy
+
+
+class ConvTranspose2d(torch.nn.ConvTranspose2d):
+    """
+    :class:`torch.nn.ConvTranspose2d` that also accepts zero-size input tensors.
+    """
+
+    def forward(self, x):
+        if x.numel() > 0:
+            return super().forward(x)
+
+        # Empty input: compute the spatial size the transposed convolution would produce.
+        geometry = zip(
+            x.shape[-2:],
+            self.padding,
+            self.dilation,
+            self.kernel_size,
+            self.stride,
+            self.output_padding,
+        )
+        spatial = [
+            (size - 1) * stride - 2 * pad + (dil * (kernel - 1) + 1) + out_pad
+            for size, pad, dil, kernel, stride, out_pad in geometry
+        ]
+        # This is to make DDP happy.
+        # DDP expects all workers to have gradient w.r.t the same set of parameters.
+        _dummy = sum(p.view(-1)[0] for p in self.parameters()) * 0.0
+        return _NewEmptyTensorOp.apply(x, [x.shape[0], self.out_channels] + spatial) + _dummy
+
+
+class BatchNorm2d(torch.nn.BatchNorm2d):
+    """
+    :class:`torch.nn.BatchNorm2d` that also accepts zero-size input tensors.
+    """
+
+    def forward(self, x):
+        if x.numel() == 0:
+            # Nothing to normalize: return an empty tensor of the same shape,
+            # produced through the autograd-aware empty-tensor op.
+            return _NewEmptyTensorOp.apply(x, x.shape)
+        return super().forward(x)
+
+
+def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None):
+    """
+    A wrapper around :func:`torch.nn.functional.interpolate` to support zero-size tensor.
+
+    For an empty `input`, only the output shape is computed (the last two dimensions
+    are resized, as for 2D interpolation) and an empty tensor of that shape is returned.
+    """
+    if input.numel() > 0:
+        return torch.nn.functional.interpolate(
+            input, size, scale_factor, mode, align_corners=align_corners
+        )
+
+    def _check_size_scale_factor(dim):
+        if size is None and scale_factor is None:
+            raise ValueError("either size or scale_factor should be defined")
+        if size is not None and scale_factor is not None:
+            raise ValueError("only one of size or scale_factor should be defined")
+        if isinstance(scale_factor, tuple) and len(scale_factor) != dim:
+            raise ValueError(
+                "scale_factor shape must match input shape. "
+                "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor))
+            )
+
+    def _output_size(dim):
+        _check_size_scale_factor(dim)
+        if size is not None:
+            # `size` may be a single int, as torch.nn.functional.interpolate allows.
+            return _ntuple(dim)(size)
+        scale_factors = _ntuple(dim)(scale_factor)
+        # math.floor might return float in py2.7
+        return [int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)]
+
+    output_shape = tuple(_output_size(2))
+    output_shape = input.shape[:-2] + output_shape
+    return _NewEmptyTensorOp.apply(input, output_shape)
diff --git a/detectron2/model_zoo/__init__.py b/detectron2/model_zoo/__init__.py
new file mode 100644
index 0000000..de21580
--- /dev/null
+++ b/detectron2/model_zoo/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Model Zoo API for Detectron2: a collection of functions to create common model architectures and
+optionally load pre-trained weights as released in
+`MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md>`_.
+"""
+from .model_zoo import get, ModelZooUrls
diff --git a/detectron2/model_zoo/model_zoo.py b/detectron2/model_zoo/model_zoo.py
new file mode 100644
index 0000000..45cf70f
--- /dev/null
+++ b/detectron2/model_zoo/model_zoo.py
@@ -0,0 +1,114 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+import pkg_resources
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.modeling import build_model
+
+
+class ModelZooUrls(object):
+    """
+    Mapping from names to officially released Detectron2 pre-trained models.
+    """
+
+    S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
+
+    # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
+    CONFIG_PATH_TO_URL_SUFFIX = {
+        # COCO Detection with Faster R-CNN
+        "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl",
+        "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl",
+        "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl",
+        "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl",
+        "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl",
+        "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl",
+        "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl",
+        "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl",
+        # COCO Detection with Retina-Net
+        "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "137593951/model_final_b796dc.pkl",
+        "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "137849486/model_final_4cafe0.pkl",
+        "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "138363263/model_final_59f53c.pkl",
+        # COCO Detection with RPN and Fast R-CNN
+        "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl",
+        "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl",
+        "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl",
+        # COCO Instance Segmentation Baselines with Mask R-CNN
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl",
+        "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl",  # noqa
+        # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": "137261548/model_final_04e291.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl",
+        "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl",
+        "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl",
+        # COCO Panoptic Segmentation Baselines with Panoptic FPN
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl",
+        "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl",
+        # LVIS Instance Segmentation Baselines with Mask R-CNN
+        "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl",
+        "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl",
+        "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl",  # noqa
+        # Cityscapes & Pascal VOC Baselines
+        "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl",
+        "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl",
+        # Other Settings
+        "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl",
+        "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "143915318/model_final_220cfb.pkl",
+        "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl",
+        "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl",
+        "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl",
+        "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl",  # noqa
+    }
+
+    @classmethod
+    def get(cls, config_path):
+        suffix = cls.CONFIG_PATH_TO_URL_SUFFIX.get(config_path)
+        if suffix is None:  # unknown config: report the requested path (`name` was unbound here)
+            raise RuntimeError("{} not available in Model Zoo!".format(config_path))
+        return cls.S3_PREFIX + config_path.replace(".yaml", "") + "/" + suffix
+
+
+def get(config_path, trained: bool = False):
+    """
+    Get a model specified by relative path under Detectron2's official ``configs`` directory.
+
+    Args:
+        config_path (str): config file name relative to the ``configs`` directory.
+        trained (bool): Whether to initialize with the trained model zoo weights. If False, the
+            initialization weights specified in the config file's ``MODEL.WEIGHTS`` key are used
+            instead; this will typically (though not always) initialize a subset of weights using
+            an ImageNet pre-trained model, while randomly initializing the other weights.
+
+    Raises:
+        RuntimeError: if no such config file ships with the package.
+
+    Example:
+
+    .. code-block:: python
+
+        from detectron2 import model_zoo
+        model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
+    """
+    relative_path = os.path.join("configs", config_path)
+    cfg_file = pkg_resources.resource_filename("detectron2.model_zoo", relative_path)
+    if not os.path.exists(cfg_file):
+        raise RuntimeError("{} not available in Model Zoo!".format(config_path))
+    cfg = get_cfg()
+    cfg.merge_from_file(cfg_file)
+    if trained:
+        cfg.MODEL.WEIGHTS = ModelZooUrls.get(config_path)
+    model = build_model(cfg)
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    return model
diff --git a/detectron2/modeling/__init__.py b/detectron2/modeling/__init__.py
new file mode 100644
index 0000000..4202d6a
--- /dev/null
+++ b/detectron2/modeling/__init__.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from detectron2.layers import ShapeSpec
+
+from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
+from .backbone import (
+    BACKBONE_REGISTRY,
+    FPN,
+    Backbone,
+    ResNet,
+    ResNetBlockBase,
+    build_backbone,
+    build_resnet_backbone,
+    make_stage,
+)
+from .meta_arch import (
+    META_ARCH_REGISTRY,
+    SEM_SEG_HEADS_REGISTRY,
+    GeneralizedRCNN,
+    PanopticFPN,
+    ProposalNetwork,
+    RetinaNet,
+    SemanticSegmentor,
+    build_model,
+    build_sem_seg_head,
+)
+from .postprocessing import detector_postprocess
+from .proposal_generator import (
+    PROPOSAL_GENERATOR_REGISTRY,
+    build_proposal_generator,
+    RPN_HEAD_REGISTRY,
+    build_rpn_head,
+)
+from .roi_heads import (
+    ROI_BOX_HEAD_REGISTRY,
+    ROI_HEADS_REGISTRY,
+    ROI_KEYPOINT_HEAD_REGISTRY,
+    ROI_MASK_HEAD_REGISTRY,
+    ROIHeads,
+    StandardROIHeads,
+    build_box_head,
+    build_keypoint_head,
+    build_mask_head,
+    build_roi_heads,
+)
+from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
+
+_EXCLUDE = {"torch", "ShapeSpec"}  # imported names that are deliberately left out of __all__
+__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
+
+assert (  # import-time check: comparing tensors must yield a torch.bool tensor
+    torch.Tensor([1]) == torch.Tensor([2])
+).dtype == torch.bool, "Your Pytorch is too old. Please update to contain https://github.com/pytorch/pytorch/pull/21113"
diff --git a/detectron2/modeling/anchor_generator.py b/detectron2/modeling/anchor_generator.py
new file mode 100644
index 0000000..61c636e
--- /dev/null
+++ b/detectron2/modeling/anchor_generator.py
@@ -0,0 +1,352 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import math
+from typing import List
+import torch
+from torch import nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, RotatedBoxes
+from detectron2.utils.registry import Registry
+
+ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
+"""
+Registry for modules that create object detection anchors for feature maps.
+"""
+
+
+class BufferList(nn.Module):
+    """
+    A list-like container of buffers, the buffer counterpart of nn.ParameterList.
+    """
+
+    def __init__(self, buffers=None):
+        super().__init__()
+        if buffers is not None:
+            self.extend(buffers)
+
+    def extend(self, buffers):
+        start = len(self)
+        for index, tensor in enumerate(buffers, start):
+            self.register_buffer(str(index), tensor)
+        return self
+
+    def __len__(self):
+        return len(self._buffers)
+
+    def __iter__(self):
+        return iter(self._buffers.values())
+
+
+def _create_grid_offsets(size, stride, device):
+    """Return flattened (x, y) offsets, one per cell of a (height, width) grid."""
+    rows, cols = size
+    xs = torch.arange(0, cols * stride, step=stride, dtype=torch.float32, device=device)
+    ys = torch.arange(0, rows * stride, step=stride, dtype=torch.float32, device=device)
+    # meshgrid receives (ys, xs), so both outputs have shape (rows, cols) and
+    # flatten row by row.
+    grid_y, grid_x = torch.meshgrid(ys, xs)
+    flat_x, flat_y = grid_x.reshape(-1), grid_y.reshape(-1)
+    return flat_x, flat_y
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class DefaultAnchorGenerator(nn.Module):
+    """
+    For a set of image sizes and feature maps, computes a set of anchors.
+    """
+
+    def __init__(self, cfg, input_shape: List[ShapeSpec]):
+        super().__init__()
+        # fmt: off
+        sizes         = cfg.MODEL.ANCHOR_GENERATOR.SIZES
+        aspect_ratios = cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS
+        self.strides  = [x.stride for x in input_shape]
+        # fmt: on
+        """
+        sizes (list[list[int]]): sizes[i] is the list of anchor sizes to use
+            for the i-th feature map. If len(sizes) == 1, then the same list of
+            anchor sizes, given by sizes[0], is used for all feature maps. Anchor
+            sizes are given in absolute lengths in units of the input image;
+            they do not dynamically scale if the input image size changes.
+        aspect_ratios (list[list[float]]): aspect_ratios[i] is the list of
+            anchor aspect ratios to use for the i-th feature map. If
+            len(aspect_ratios) == 1, then the same list of anchor aspect ratios,
+            given by aspect_ratios[0], is used for all feature maps.
+        strides (list[int]): stride of each input feature.
+        """
+
+        self.num_features = len(self.strides)
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
+
+    def _calculate_anchors(self, sizes, aspect_ratios):
+        # If one size (or aspect ratio) is specified and there are multiple feature
+        # maps, "broadcast" it over all feature maps. Build new lists rather than
+        # using `*=`, which would modify the lists owned by the config in place.
+        if len(sizes) == 1:
+            sizes = sizes * self.num_features
+        if len(aspect_ratios) == 1:
+            aspect_ratios = aspect_ratios * self.num_features
+        assert self.num_features == len(sizes)
+        assert self.num_features == len(aspect_ratios)
+
+        cell_anchors = [
+            self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
+        ]
+
+        return BufferList(cell_anchors)
+
+    @property
+    def box_dim(self):
+        """
+        Returns:
+            int: the dimension of each anchor box.
+        """
+        return 4
+
+    @property
+    def num_cell_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios and 5 sizes, the number of anchors is 15.
+                (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
+
+                In standard RPN models, `num_cell_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def grid_anchors(self, grid_sizes):
+        anchors = []
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
+            shift_x, shift_y = _create_grid_offsets(size, stride, base_anchors.device)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
+            # Add every per-cell base anchor to every grid shift: (cells, 1, 4) + (1, A, 4).
+            anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
+
+        return anchors
+
+    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
+        """
+        Generate a tensor storing anchor boxes, which are continuous geometric rectangles
+        centered on one feature map point sample. We can later build the set of anchors
+        for the entire feature map by tiling these tensors; see :meth:`grid_anchors`.
+
+        Args:
+            sizes (tuple[float]): Absolute size of the anchors in the units of the input
+                image (the input received by the network, after undergoing necessary scaling).
+                The absolute size is given as the side length of a box.
+            aspect_ratios (tuple[float]): Aspect ratios of the boxes computed as box
+                height / width.
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
+                in XYXY format.
+        """
+
+        # This is different from the anchor generator defined in the original Faster R-CNN
+        # code or Detectron. They yield the same AP, however the old version defines cell
+        # anchors in a less natural way with a shift relative to the feature grid and
+        # quantization that results in slightly different sizes for different aspect ratios.
+        # See also https://github.com/facebookresearch/Detectron/issues/227
+
+        anchors = []
+        for size in sizes:
+            area = size ** 2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
+                anchors.append([x0, y0, x1, y1])
+        return torch.tensor(anchors)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes.
+                The Boxes contains anchors of this image on the specific feature level.
+        """
+        num_images = len(features[0])
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
+
+        anchors_in_image = []
+        for anchors_per_feature_map in anchors_over_all_feature_maps:
+            boxes = Boxes(anchors_per_feature_map)
+            anchors_in_image.append(boxes)
+
+        anchors = [copy.deepcopy(anchors_in_image) for _ in range(num_images)]
+        return anchors
+
+
+@ANCHOR_GENERATOR_REGISTRY.register()
+class RotatedAnchorGenerator(nn.Module):
+    """
+    The anchor generator used by Rotated RPN (RRPN).
+    """
+
+    def __init__(self, cfg, input_shape: List[ShapeSpec]):
+        super().__init__()
+        # fmt: off
+        sizes         = cfg.MODEL.ANCHOR_GENERATOR.SIZES
+        aspect_ratios = cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS
+        angles        = cfg.MODEL.ANCHOR_GENERATOR.ANGLES
+        self.strides  = [x.stride for x in input_shape]
+        # fmt: on
+
+        self.num_features = len(self.strides)
+        self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles, self.strides)
+
+    def _calculate_anchors(self, sizes, aspect_ratios, angles, feature_strides):
+        """
+        Args:
+            sizes (list[list[int]]): sizes[i] is the list of anchor sizes to use
+                for the i-th feature map. If len(sizes) == 1, then the same list of
+                anchor sizes, given by sizes[0], is used for all feature maps. Anchor
+                sizes are given in absolute lengths in units of the input image;
+                they do not dynamically scale if the input image size changes.
+            aspect_ratios (list[list[float]]): aspect_ratios[i] is the list of
+                anchor aspect ratios to use for the i-th feature map. If
+                len(aspect_ratios) == 1, then the same list of anchor aspect ratios,
+                given by aspect_ratios[0], is used for all feature maps.
+            angles (list[list[float]]): angles[i] is the list of
+                anchor angles to use for the i-th feature map. If
+                len(angles) == 1, then the same list of anchor angles,
+                given by angles[0], is used for all feature maps.
+            feature_strides (list[number]): list of feature map strides (with respect
+                to the input image) for each input feature map.
+        """
+
+        # If one size (or aspect ratio/angle) is specified and there are multiple
+        # feature maps, "broadcast" it over all feature maps. New lists are built
+        # instead of using `*=`, which would modify the config's lists in place.
+
+        if len(sizes) == 1:
+            sizes = sizes * self.num_features
+        if len(aspect_ratios) == 1:
+            aspect_ratios = aspect_ratios * self.num_features
+        if len(angles) == 1:
+            angles = angles * self.num_features
+        assert self.num_features == len(sizes)
+        assert self.num_features == len(aspect_ratios)
+        assert self.num_features == len(angles)
+
+        cell_anchors = [
+            self.generate_cell_anchors(size, aspect_ratio, angle).float()
+            for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
+        ]
+
+        return BufferList(cell_anchors)
+
+    @property
+    def box_dim(self):
+        """
+        Returns:
+            int: the dimension of each anchor box.
+        """
+        return 5
+
+    @property
+    def num_cell_anchors(self):
+        """
+        Returns:
+            list[int]: Each int is the number of anchors at every pixel
+                location, on that feature map.
+                For example, if at every pixel we use anchors of 3 aspect
+                ratios, 2 sizes and 5 angles, the number of anchors is 30.
+                (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
+                and ANCHOR_GENERATOR.ANGLES in config)
+
+                In standard RRPN models, `num_cell_anchors` on every feature map is the same.
+        """
+        return [len(cell_anchors) for cell_anchors in self.cell_anchors]
+
+    def grid_anchors(self, grid_sizes):
+        anchors = []
+        for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
+            shift_x, shift_y = _create_grid_offsets(size, stride, base_anchors.device)
+            zeros = torch.zeros_like(shift_x)
+            shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
+            # Only the center (x_ctr, y_ctr) is shifted; w, h and angle get a zero offset.
+            anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
+
+        return anchors
+
+    def generate_cell_anchors(
+        self,
+        sizes=(32, 64, 128, 256, 512),
+        aspect_ratios=(0.5, 1, 2),
+        angles=(-90, -60, -30, 0, 30, 60, 90),
+    ):
+        """
+        Generate a tensor storing anchor boxes, which are continuous geometric rectangles
+        centered on one feature map point sample. We can later build the set of anchors
+        for the entire feature map by tiling these tensors; see :meth:`grid_anchors`.
+
+        Args:
+            sizes (tuple[float]): Absolute size of the anchors in the units of the input
+                image (the input received by the network, after undergoing necessary scaling).
+                The absolute size is given as the side length of a box.
+            aspect_ratios (tuple[float]): Aspect ratios of the boxes computed as box
+                height / width.
+            angles (tuple[float]): Angles of boxes indicating how many degrees
+                the boxes are rotated counter-clockwise.
+
+        Returns:
+            Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
+                storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
+        """
+        anchors = []
+        for size in sizes:
+            area = size ** 2.0
+            for aspect_ratio in aspect_ratios:
+                # s * s = w * h
+                # a = h / w
+                # ... some algebra ...
+                # w = sqrt(s * s / a)
+                # h = a * w
+                w = math.sqrt(area / aspect_ratio)
+                h = aspect_ratio * w
+                anchors.extend([0, 0, w, h, a] for a in angles)
+
+        return torch.tensor(anchors)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of backbone feature maps on which to generate anchors.
+
+        Returns:
+            list[list[RotatedBoxes]]:
+                a list of #image elements. Each is a list of #feature level RotatedBoxes.
+                The RotatedBoxes contains anchors of this image on the specific feature level.
+        """
+        num_images = len(features[0])
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
+
+        anchors_in_image = []
+        for anchors_per_feature_map in anchors_over_all_feature_maps:
+            boxes = RotatedBoxes(anchors_per_feature_map)
+            anchors_in_image.append(boxes)
+
+        anchors = [copy.deepcopy(anchors_in_image) for _ in range(num_images)]
+        return anchors
+
+
+def build_anchor_generator(cfg, input_shape):
+    """
+    Build the anchor generator registered under `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
+    """
+    generator_cls = ANCHOR_GENERATOR_REGISTRY.get(cfg.MODEL.ANCHOR_GENERATOR.NAME)
+    return generator_cls(cfg, input_shape)
diff --git a/detectron2/modeling/backbone/__init__.py b/detectron2/modeling/backbone/__init__.py
new file mode 100644
index 0000000..bd737d9
--- /dev/null
+++ b/detectron2/modeling/backbone/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import build_backbone, BACKBONE_REGISTRY  # noqa F401 isort:skip
+
+from .backbone import Backbone
+from .fpn import FPN
+from .resnet import ResNet, ResNetBlockBase, build_resnet_backbone, make_stage
+
+# TODO can expose more resnet blocks after careful consideration
diff --git a/detectron2/modeling/backbone/backbone.py b/detectron2/modeling/backbone/backbone.py
new file mode 100644
index 0000000..a530efe
--- /dev/null
+++ b/detectron2/modeling/backbone/backbone.py
@@ -0,0 +1,70 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from abc import ABCMeta, abstractmethod
+import torch.nn as nn
+
+from detectron2.layers import ShapeSpec
+
+__all__ = ["Backbone"]
+
+
+class Backbone(nn.Module, metaclass=ABCMeta):
+    """
+    Abstract interface that every network backbone implements.
+    """
+
+    def __init__(self):
+        """
+        Subclasses are free to define their own `__init__` arguments.
+        """
+        super().__init__()
+
+    @abstractmethod
+    def forward(self):
+        """
+        Must be overridden by subclasses, keeping the return type below.
+
+        Returns:
+            dict[str: Tensor]: mapping from feature name (e.g., "res2") to tensor
+        """
+        pass
+
+    @property
+    def size_divisibility(self):
+        """
+        The integer that the input height and width must be divisible by, or 0
+        when the backbone imposes no such requirement (the default here).
+        Encoder / decoder type networks with lateral connections (e.g., FPN)
+        typically need this, because feature maps must match in dimension
+        between the "bottom up" and "top down" paths.
+        """
+        return 0
+
+    def output_shape(self):
+        """
+        Backward-compatible default built from the `_out_feature_*` attributes.
+        Returns: dict[str->ShapeSpec]
+        """
+        shapes = {}
+        for name in self._out_features:
+            shapes[name] = ShapeSpec(
+                channels=self._out_feature_channels[name],
+                stride=self._out_feature_strides[name],
+            )
+        return shapes
+
+    # deprecated accessors for the `_out_feature_*` attributes
+
+    @property
+    def out_features(self):
+        """deprecated"""
+        return self._out_features
+
+    @property
+    def out_feature_strides(self):
+        """deprecated"""
+        return {name: self._out_feature_strides[name] for name in self._out_features}
+
+    @property
+    def out_feature_channels(self):
+        """deprecated"""
+        return {name: self._out_feature_channels[name] for name in self._out_features}
diff --git a/detectron2/modeling/backbone/build.py b/detectron2/modeling/backbone/build.py
new file mode 100644
index 0000000..3d2ecae
--- /dev/null
+++ b/detectron2/modeling/backbone/build.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.layers import ShapeSpec
+from detectron2.utils.registry import Registry
+
+from .backbone import Backbone
+
+BACKBONE_REGISTRY = Registry("BACKBONE")  # entries are looked up by name in build_backbone()
+BACKBONE_REGISTRY.__doc__ = """
+Registry for backbones, which extract feature maps from images
+
+The registered object must be a callable that accepts two arguments:
+
+1. A :class:`detectron2.config.CfgNode`
+2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
+
+It must returns an instance of :class:`Backbone`.
+"""
+
+
+def build_backbone(cfg, input_shape=None):
+    """
+    Construct the backbone registered under `cfg.MODEL.BACKBONE.NAME`.
+
+    Returns:
+        an instance of :class:`Backbone`
+    """
+    if input_shape is None:
+        # Default to one input channel per entry of cfg.MODEL.PIXEL_MEAN.
+        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
+    builder = BACKBONE_REGISTRY.get(cfg.MODEL.BACKBONE.NAME)
+    backbone = builder(cfg, input_shape)
+    assert isinstance(backbone, Backbone)
+    return backbone
diff --git a/detectron2/modeling/backbone/fpn.py b/detectron2/modeling/backbone/fpn.py
new file mode 100644
index 0000000..5d75ea3
--- /dev/null
+++ b/detectron2/modeling/backbone/fpn.py
@@ -0,0 +1,244 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import fvcore.nn.weight_init as weight_init
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+from .resnet import build_resnet_backbone
+
+__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
+
+
+class FPN(Backbone):
+    """
+    This module implements Feature Pyramid Network.
+    It creates pyramid features built on top of some input feature maps.
+    """
+
+    def __init__(
+        self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"
+    ):
+        """
+        Args:
+            bottom_up (Backbone): module representing the bottom up subnetwork.
+                Must be a subclass of :class:`Backbone`. The multi-scale feature
+                maps generated by the bottom up network, and listed in `in_features`,
+                are used to generate FPN levels.
+            in_features (list[str]): names of the input feature maps coming
+                from the backbone to which FPN is attached. For example, if the
+                backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
+                of these may be used; order must be from high to low resolution.
+            out_channels (int): number of channels in the output feature maps.
+            norm (str): the normalization to use.
+            top_block (nn.Module or None): if provided, an extra operation will
+                be performed on the output of the last (smallest resolution)
+                FPN output, and the result will extend the result list. The top_block
+                further downsamples the feature map. It must have an attribute
+                "num_levels", meaning the number of extra FPN levels added by
+                this block, and "in_feature", which is a string representing
+                its input feature (e.g., p5).
+            fuse_type (str): types for fusing the top down features and the lateral
+                ones. It can be "sum" (default), which sums up element-wise; or "avg",
+                which takes the element-wise mean of the two.
+        """
+        super(FPN, self).__init__()
+        assert isinstance(bottom_up, Backbone)
+
+        # Feature map strides and channels from the bottom up network (e.g. ResNet)
+        in_strides = [bottom_up.out_feature_strides[f] for f in in_features]
+        in_channels = [bottom_up.out_feature_channels[f] for f in in_features]
+
+        _assert_strides_are_log2_contiguous(in_strides)
+        lateral_convs = []
+        output_convs = []
+        # The convs carry a bias term only when no normalization is configured.
+        use_bias = norm == ""
+        for idx, in_channels in enumerate(in_channels):  # rebinds `in_channels` to an int
+            lateral_norm = get_norm(norm, out_channels)
+            output_norm = get_norm(norm, out_channels)
+
+            lateral_conv = Conv2d(
+                in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
+            )
+            output_conv = Conv2d(
+                out_channels,
+                out_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+            )
+            weight_init.c2_xavier_fill(lateral_conv)
+            weight_init.c2_xavier_fill(output_conv)
+            stage = int(math.log2(in_strides[idx]))
+            self.add_module("fpn_lateral{}".format(stage), lateral_conv)
+            self.add_module("fpn_output{}".format(stage), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+        self.top_block = top_block
+        self.in_features = in_features
+        self.bottom_up = bottom_up
+        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
+        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in in_strides}
+        # top block output feature maps; `stage` still holds the last loop iteration's value.
+        if self.top_block is not None:
+            for s in range(stage, stage + self.top_block.num_levels):
+                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
+
+        self._out_features = list(self._out_feature_strides.keys())
+        self._out_feature_channels = {k: out_channels for k in self._out_features}
+        self._size_divisibility = in_strides[-1]  # largest stride among the input features
+        assert fuse_type in {"avg", "sum"}
+        self._fuse_type = fuse_type
+
+    @property
+    def size_divisibility(self):
+        return self._size_divisibility
+
+    def forward(self, x):
+        """
+        Args:
+            x: input forwarded as-is to `self.bottom_up`, whose output must map each name
+                in `self.in_features` (e.g., "res5") to a feature map tensor.
+
+        Returns:
+            dict[str: Tensor]:
+                mapping from feature map name to FPN feature map tensor
+                in high to low resolution order. Returned feature names follow the FPN
+                paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
+                ["p2", "p3", ..., "p6"].
+        """
+        # Reverse feature maps into top-down order (from low to high resolution)
+        bottom_up_features = self.bottom_up(x)
+        x = [bottom_up_features[f] for f in self.in_features[::-1]]
+        results = []
+        prev_features = self.lateral_convs[0](x[0])
+        results.append(self.output_convs[0](prev_features))
+        for features, lateral_conv, output_conv in zip(
+            x[1:], self.lateral_convs[1:], self.output_convs[1:]
+        ):
+            top_down_features = F.interpolate(prev_features, scale_factor=2, mode="nearest")
+            lateral_features = lateral_conv(features)
+            prev_features = lateral_features + top_down_features
+            if self._fuse_type == "avg":
+                prev_features /= 2
+            results.insert(0, output_conv(prev_features))  # prepend: finer levels go first
+
+        if self.top_block is not None:
+            top_block_in_feature = bottom_up_features.get(self.top_block.in_feature, None)
+            if top_block_in_feature is None:  # not a bottom-up output, so use an FPN output
+                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
+            results.extend(self.top_block(top_block_in_feature))
+        assert len(self._out_features) == len(results)
+        return dict(zip(self._out_features, results))
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+
+def _assert_strides_are_log2_contiguous(strides):
+    """
+    Assert that each stride is exactly twice its preceding stride, i.e. "contiguous in log2".
+    """
+    for previous, current in zip(strides, strides[1:]):
+        assert current == 2 * previous, "Strides {} {} are not log2 contiguous".format(
+            current, previous
+        )
+
+
+class LastLevelMaxPool(nn.Module):
+    """
+    Top block of the original FPN: downsamples P5 into an extra P6 feature.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.num_levels = 1
+        self.in_feature = "p5"
+
+    def forward(self, x):
+        p6 = F.max_pool2d(x, kernel_size=1, stride=2, padding=0)
+        return [p6]
+
+
+class LastLevelP6P7(nn.Module):
+    """
+    Top block used in RetinaNet: derives the two extra levels, P6 and P7,
+    from the C5 feature.
+    """
+
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.num_levels = 2
+        self.in_feature = "res5"
+        self.p6 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
+        self.p7 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
+        weight_init.c2_xavier_fill(self.p6)
+        weight_init.c2_xavier_fill(self.p7)
+
+    def forward(self, c5):
+        level6 = self.p6(c5)
+        level7 = self.p7(F.relu(level6))
+        return [level6, level7]
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Build a ResNet bottom-up network and wrap it in an FPN with a max-pool top block.
+
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    fpn_cfg = cfg.MODEL.FPN
+    return FPN(
+        bottom_up=bottom_up,
+        in_features=fpn_cfg.IN_FEATURES,
+        out_channels=fpn_cfg.OUT_CHANNELS,
+        norm=fpn_cfg.NORM,
+        top_block=LastLevelMaxPool(),
+        fuse_type=fpn_cfg.FUSE_TYPE,
+    )
+
+
+@BACKBONE_REGISTRY.register()
+def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
+    """
+    Build a ResNet bottom-up network and wrap it in an FPN with a P6/P7 top block fed by "res5".
+
+    Args:
+        cfg: a detectron2 CfgNode
+
+    Returns:
+        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+    """
+    bottom_up = build_resnet_backbone(cfg, input_shape)
+    fpn_cfg = cfg.MODEL.FPN
+    top_block = LastLevelP6P7(bottom_up.out_feature_channels["res5"], fpn_cfg.OUT_CHANNELS)
+    return FPN(
+        bottom_up=bottom_up,
+        in_features=fpn_cfg.IN_FEATURES,
+        out_channels=fpn_cfg.OUT_CHANNELS,
+        norm=fpn_cfg.NORM,
+        top_block=top_block,
+        fuse_type=fpn_cfg.FUSE_TYPE,
+    )
diff --git a/detectron2/modeling/backbone/resnet.py b/detectron2/modeling/backbone/resnet.py
new file mode 100644
index 0000000..af6c6ea
--- /dev/null
+++ b/detectron2/modeling/backbone/resnet.py
@@ -0,0 +1,479 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import (
+    Conv2d,
+    DeformConv,
+    FrozenBatchNorm2d,
+    ModulatedDeformConv,
+    ShapeSpec,
+    get_norm,
+)
+
+from .backbone import Backbone
+from .build import BACKBONE_REGISTRY
+
+__all__ = [
+    "ResNetBlockBase",
+    "BottleneckBlock",
+    "DeformBottleneckBlock",
+    "BasicStem",
+    "ResNet",
+    "make_stage",
+    "build_resnet_backbone",
+]
+
+
+class ResNetBlockBase(nn.Module):
+    def __init__(self, in_channels, out_channels, stride):
+        """
+        The `__init__` method of any subclass should also contain these arguments.
+
+        Args:
+            in_channels (int):
+            out_channels (int):
+            stride (int):
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = stride
+
+    def freeze(self):
+        for p in self.parameters():
+            p.requires_grad = False
+        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
+        return self
+
+
+class BottleneckBlock(ResNetBlockBase):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+    ):
+        """
+        Args:
+            norm (str or callable): a callable that takes the number of
+                channels and return a `nn.Module`, or a pre-defined string
+                (one of {"FrozenBN", "BN", "GN"}).
+            stride_in_1x1 (bool): when stride==2, whether to put stride in the
+                first 1x1 convolution or the bottleneck 3x3 convolution.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        # The original MSRA ResNet models have stride in the first 1x1 conv
+        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
+        # stride in the 3x3 conv
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv2 = Conv2d(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        # Zero-initialize the last normalization in each residual branch,
+        # so that at the beginning, the residual branch starts with zeros,
+        # and each residual block behaves like an identity.
+        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+        # "For BN layers, the learnable scaling coefficient γ is initialized
+        # to be 1, except for each residual block's last BN
+        # where γ is initialized to be 0."
+
+        # nn.init.constant_(self.conv3.norm.weight, 0)
+        # TODO this somehow hurts performance when training GN models from scratch.
+        # Add it as an option when we need to use this code to train a backbone.
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        out = self.conv2(out)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+class DeformBottleneckBlock(ResNetBlockBase):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        dilation=1,
+        deform_modulated=False,
+        deform_num_groups=1,
+    ):
+        """
+        Similar to :class:`BottleneckBlock`, but with deformable conv in the 3x3 convolution.
+        """
+        super().__init__(in_channels, out_channels, stride)
+        self.deform_modulated = deform_modulated
+
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        if deform_modulated:
+            deform_conv_op = ModulatedDeformConv
+            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
+            offset_channels = 27
+        else:
+            deform_conv_op = DeformConv
+            offset_channels = 18
+
+        self.conv2_offset = Conv2d(
+            bottleneck_channels,
+            offset_channels * deform_num_groups,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            dilation=dilation,
+        )
+        self.conv2 = deform_conv_op(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            padding=1 * dilation,
+            bias=False,
+            groups=num_groups,
+            dilation=dilation,
+            deformable_groups=deform_num_groups,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+        nn.init.constant_(self.conv2_offset.weight, 0)
+        nn.init.constant_(self.conv2_offset.bias, 0)
+
+    def forward(self, x):
+        out = self.conv1(x)
+        out = F.relu_(out)
+
+        if self.deform_modulated:
+            offset_mask = self.conv2_offset(out)
+            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
+            offset = torch.cat((offset_x, offset_y), dim=1)
+            mask = mask.sigmoid()
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = F.relu_(out)
+
+        out = self.conv3(out)
+
+        if self.shortcut is not None:
+            shortcut = self.shortcut(x)
+        else:
+            shortcut = x
+
+        out += shortcut
+        out = F.relu_(out)
+        return out
+
+
+def make_stage(block_class, num_blocks, first_stride, **kwargs):
+    """
+    Build one ResNet stage by instantiating `block_class` `num_blocks` times.
+    Args:
+        block_class (class): a subclass of ResNetBlockBase
+        num_blocks (int): number of blocks in the stage.
+        first_stride (int): `stride` passed to the first block; later blocks get 1.
+        kwargs: other arguments passed to every block constructor. After each block
+            is built, `in_channels` is overwritten with `out_channels`.
+
+    Returns:
+        list[nn.Module]: the constructed blocks, in order.
+    """
+    stage = []
+    for block_idx in range(num_blocks):
+        stage.append(block_class(stride=1 if block_idx else first_stride, **kwargs))
+        kwargs["in_channels"] = kwargs["out_channels"]
+    return stage
+
+
+class BasicStem(nn.Module):
+    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
+        """
+        Args:
+            norm (str or callable): a callable that takes the number of
+                channels and return a `nn.Module`, or a pre-defined string
+                (one of {"FrozenBN", "BN", "GN"}).
+        """
+        super().__init__()
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+        weight_init.c2_msra_fill(self.conv1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+    @property
+    def out_channels(self):
+        return self.conv1.out_channels
+
+    @property
+    def stride(self):
+        return 4  # = stride 2 conv -> stride 2 max pool
+
+
+class ResNet(Backbone):
+    def __init__(self, stem, stages, num_classes=None, out_features=None):
+        """
+        Args:
+            stem (nn.Module): a stem module
+            stages (list[list[ResNetBlockBase]]): several (typically 4) stages,
+                each contains multiple :class:`ResNetBlockBase`.
+            num_classes (None or int): if None, will not perform classification.
+            out_features (list[str]): name of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
+                If None, will return the output of the last layer.
+        """
+        super(ResNet, self).__init__()
+        self.stem = stem
+        self.num_classes = num_classes
+
+        current_stride = self.stem.stride
+        self._out_feature_strides = {"stem": current_stride}
+        self._out_feature_channels = {"stem": self.stem.out_channels}
+
+        self.stages_and_names = []
+        for i, blocks in enumerate(stages):
+            for block in blocks:
+                assert isinstance(block, ResNetBlockBase), block
+                curr_channels = block.out_channels
+            stage = nn.Sequential(*blocks)
+            name = "res" + str(i + 2)
+            self.add_module(name, stage)
+            self.stages_and_names.append((stage, name))
+            self._out_feature_strides[name] = current_stride = int(
+                current_stride * np.prod([k.stride for k in blocks])
+            )
+            self._out_feature_channels[name] = blocks[-1].out_channels
+
+        if num_classes is not None:
+            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+            self.linear = nn.Linear(curr_channels, num_classes)
+
+            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
+            # "The 1000-way fully-connected layer is initialized by
+            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
+            nn.init.normal_(self.linear.weight, std=0.01)
+            name = "linear"
+
+        if out_features is None:
+            out_features = [name]
+        self._out_features = out_features
+        assert len(self._out_features)
+        children = [x[0] for x in self.named_children()]
+        for out_feature in self._out_features:
+            assert out_feature in children, "Available children: {}".format(", ".join(children))
+
+    def forward(self, x):
+        """Run the stem and every stage; return a dict holding the requested output features."""
+        outputs = {}
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs["stem"] = x
+        for stage, name in self.stages_and_names:
+            x = stage(x)
+            if name in self._out_features:
+                outputs[name] = x
+        if self.num_classes is not None:
+            x = self.linear(torch.flatten(self.avgpool(x), 1))  # (N, C, 1, 1) -> (N, C)
+            if "linear" in self._out_features:
+                outputs["linear"] = x
+        return outputs
+
+    def output_shape(self):
+        return {
+            name: ShapeSpec(
+                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+            )
+            for name in self._out_features
+        }
+
+
+@BACKBONE_REGISTRY.register()
+def build_resnet_backbone(cfg, input_shape):
+    """
+    Create a ResNet instance from config.
+
+    Returns:
+        ResNet: a :class:`ResNet` instance.
+    """
+    # need registration of new blocks/stems?
+    norm = cfg.MODEL.RESNETS.NORM
+    stem = BasicStem(
+        in_channels=input_shape.channels,
+        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
+        norm=norm,
+    )
+    freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
+
+    if freeze_at >= 1:
+        for p in stem.parameters():
+            p.requires_grad = False
+        stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem)
+
+    # fmt: off
+    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
+    depth               = cfg.MODEL.RESNETS.DEPTH
+    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
+    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+    bottleneck_channels = num_groups * width_per_group
+    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
+    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
+    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
+    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
+    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
+    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
+    # fmt: on
+    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
+
+    num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth]
+
+    stages = []
+
+    # Avoid creating variables without gradients
+    # It consumes extra memory and may cause allreduce to fail
+    out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        dilation = res5_dilation if stage_idx == 5 else 1
+        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
+        stage_kargs = {
+            "num_blocks": num_blocks_per_stage[idx],
+            "first_stride": first_stride,
+            "in_channels": in_channels,
+            "bottleneck_channels": bottleneck_channels,
+            "out_channels": out_channels,
+            "num_groups": num_groups,
+            "norm": norm,
+            "stride_in_1x1": stride_in_1x1,
+            "dilation": dilation,
+        }
+        if deform_on_per_stage[idx]:
+            stage_kargs["block_class"] = DeformBottleneckBlock
+            stage_kargs["deform_modulated"] = deform_modulated
+            stage_kargs["deform_num_groups"] = deform_num_groups
+        else:
+            stage_kargs["block_class"] = BottleneckBlock
+        blocks = make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+
+        if freeze_at >= stage_idx:
+            for block in blocks:
+                block.freeze()
+        stages.append(blocks)
+    return ResNet(stem, stages, out_features=out_features)
diff --git a/detectron2/modeling/box_regression.py b/detectron2/modeling/box_regression.py
new file mode 100644
index 0000000..4cfc369
--- /dev/null
+++ b/detectron2/modeling/box_regression.py
@@ -0,0 +1,217 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import torch
+
+# Value for clamping large dw and dh predictions. The heuristic is that we clamp
+# such that dw and dh are no larger than what would transform a 16px box into a
+# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
+_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
+
+
+__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated"]
+
+
+class Box2BoxTransform(object):
+    """
+    The box-to-box transform defined in R-CNN. The transformation is parameterized
+    by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
+    by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
+    """
+
+    def __init__(self, weights, scale_clamp=_DEFAULT_SCALE_CLAMP):
+        """
+        Args:
+            weights (4-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
+                such that the deltas have unit variance; now they are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): source boxes, e.g., object proposals
+            target_boxes (Tensor): target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_widths = src_boxes[:, 2] - src_boxes[:, 0]
+        src_heights = src_boxes[:, 3] - src_boxes[:, 1]
+        src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
+        src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
+
+        target_widths = target_boxes[:, 2] - target_boxes[:, 0]
+        target_heights = target_boxes[:, 3] - target_boxes[:, 1]
+        target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
+        target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
+
+        wx, wy, ww, wh = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+
+        deltas = torch.stack((dx, dy, dw, dh), dim=1)
+        assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
+                deltas[i] represents k potentially different class-specific
+                box transformations for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 4)
+        """
+        assert torch.isfinite(deltas).all().item(), "Box regression deltas become infinite or NaN!"
+        boxes = boxes.to(deltas.dtype)
+
+        widths = boxes[:, 2] - boxes[:, 0]
+        heights = boxes[:, 3] - boxes[:, 1]
+        ctr_x = boxes[:, 0] + 0.5 * widths
+        ctr_y = boxes[:, 1] + 0.5 * heights
+
+        wx, wy, ww, wh = self.weights
+        dx = deltas[:, 0::4] / wx
+        dy = deltas[:, 1::4] / wy
+        dw = deltas[:, 2::4] / ww
+        dh = deltas[:, 3::4] / wh
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
+        pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
+        pred_w = torch.exp(dw) * widths[:, None]
+        pred_h = torch.exp(dh) * heights[:, None]
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
+        pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
+        pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
+        pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2
+        return pred_boxes
+
+
+class Box2BoxTransformRotated(object):
+    """
+    The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
+    by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
+    by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
+    and rotate a box's angle by da (radians).
+    Note: angles of deltas are in radians while angles of boxes are in degrees.
+    """
+
+    def __init__(self, weights, scale_clamp=_DEFAULT_SCALE_CLAMP):
+        """
+        Args:
+            weights (5-element tuple): Scaling factors that are applied to the
+                (dx, dy, dw, dh, da) deltas. These are treated as
+                hyperparameters of the system.
+            scale_clamp (float): When predicting deltas, the predicted box scaling
+                factors (dw and dh) are clamped such that they are <= scale_clamp.
+        """
+        self.weights = weights
+        self.scale_clamp = scale_clamp
+
+    def get_deltas(self, src_boxes, target_boxes):
+        """
+        Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
+        to transform the `src_boxes` into the `target_boxes`. That is, the relation
+        ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
+        any delta is too large and is clamped).
+
+        Args:
+            src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
+            target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
+                boxes.
+        """
+        assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
+        assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
+
+        src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
+
+        target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
+            target_boxes, dim=1
+        )
+
+        wx, wy, ww, wh, wa = self.weights
+        dx = wx * (target_ctr_x - src_ctr_x) / src_widths
+        dy = wy * (target_ctr_y - src_ctr_y) / src_heights
+        dw = ww * torch.log(target_widths / src_widths)
+        dh = wh * torch.log(target_heights / src_heights)
+        # Angles of deltas are in radians while angles of boxes are in degrees.
+        # the conversion to radians serve as a way to normalize the values
+        da = target_angles - src_angles
+        while len(torch.where(da < -180.0)[0]) > 0:
+            da[torch.where(da < -180.0)] += 360.0
+        while len(torch.where(da > 180.0)[0]) > 0:
+            da[torch.where(da > 180.0)] -= 360.0
+        da *= wa * math.pi / 180.0
+
+        deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
+        assert (
+            (src_widths > 0).all().item()
+        ), "Input boxes to Box2BoxTransformRotated are not valid!"
+        return deltas
+
+    def apply_deltas(self, deltas, boxes):
+        """
+        Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`; `deltas` is left unmodified.
+
+        Args:
+            deltas (Tensor): transformation deltas of shape (N, 5).
+                deltas[i] represents box transformation for the single box boxes[i].
+            boxes (Tensor): boxes to transform, of shape (N, 5)
+
+        Returns:
+            Tensor: the transformed boxes, of shape (N, 5).
+        """
+        assert deltas.shape[1] == 5 and boxes.shape[1] == 5
+        assert torch.isfinite(deltas).all().item(), "Box regression deltas become infinite or NaN!"
+
+        boxes = boxes.to(deltas.dtype)
+        ctr_x, ctr_y, widths, heights, angles = torch.unbind(boxes, dim=1)
+        wx, wy, ww, wh, wa = self.weights
+        dx, dy, dw, dh, da = torch.unbind(deltas, dim=1)
+        # Not in-place: unbind() returns views, so div_() would modify the caller's `deltas`.
+        dx = dx / wx
+        dy = dy / wy
+        dw = dw / ww
+        dh = dh / wh
+        da = da / wa
+
+        # Prevent sending too large values into torch.exp()
+        dw = torch.clamp(dw, max=self.scale_clamp)
+        dh = torch.clamp(dh, max=self.scale_clamp)
+
+        pred_boxes = torch.zeros_like(deltas)
+        pred_boxes[:, 0] = dx * widths + ctr_x  # x_ctr
+        pred_boxes[:, 1] = dy * heights + ctr_y  # y_ctr
+        pred_boxes[:, 2] = torch.exp(dw) * widths  # width
+        pred_boxes[:, 3] = torch.exp(dh) * heights  # height
+
+        # Following original RRPN implementation,
+        # angles of deltas are in radians while angles of boxes are in degrees.
+        pred_angle = da * 180.0 / math.pi + angles
+        # Wrap the predicted angle back into [-180, 180] degrees.
+        while len(torch.where(pred_angle < -180.0)[0]) > 0:
+            pred_angle[torch.where(pred_angle < -180.0)] += 360.0
+        while len(torch.where(pred_angle > 180.0)[0]) > 0:
+            pred_angle[torch.where(pred_angle > 180.0)] -= 360.0
+        pred_boxes[:, 4] = pred_angle
+        return pred_boxes
diff --git a/detectron2/modeling/matcher.py b/detectron2/modeling/matcher.py
new file mode 100644
index 0000000..adfcce9
--- /dev/null
+++ b/detectron2/modeling/matcher.py
@@ -0,0 +1,132 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+
+class Matcher(object):
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth
+    element. Each predicted element will have exactly zero or one matches; each
+    ground-truth element may be matched to zero or more predicted elements.
+
+    The matching is determined by the MxN match_quality_matrix, that characterizes
+    how well each (ground-truth, prediction)-pair match each other. For example,
+    if the elements are boxes, this matrix may contain box intersection-over-union
+    overlap values.
+
+    The matcher returns (a) a vector of length N containing the index of the
+    ground-truth element m in [0, M) that matches to prediction n in [0, N).
+    (b) a vector of length N containing the labels for each prediction.
+    """
+
+    def __init__(self, thresholds, labels, allow_low_quality_matches=False):
+        """
+        Args:
+            thresholds (list): a list of thresholds used to stratify predictions
+                into levels.
+            labels (list): a list of values to label predictions belonging at
+                each level. A label can be one of {-1, 0, 1} signifying
+                {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (bool): if True, produce additional matches
+                for predictions with maximum match quality lower than high_threshold.
+                See set_low_quality_matches_ for more details.
+
+            For example,
+                thresholds = [0.3, 0.5]
+                labels = [0, -1, 1]
+                All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training.
+                All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
+                thus will be ignored.
+                All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        # Add -inf and +inf to first and last position in thresholds
+        thresholds = thresholds[:]
+        assert thresholds[0] > 0
+        thresholds.insert(0, -float("inf"))
+        thresholds.append(float("inf"))
+        assert all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:]))
+        assert all(l in [-1, 0, 1] for l in labels)
+        assert len(labels) == len(thresholds) - 1
+        self.thresholds = thresholds
+        self.labels = labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    def __call__(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+                pairwise quality between M ground-truth elements and N predicted
+                elements. All elements must be >= 0 (due to the use of `torch.nonzero`
+                for selecting indices in :meth:`set_low_quality_matches_`).
+
+        Returns:
+            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+                ground-truth index in [0, M)
+            match_labels (Tensor[int8]): a vector of length N, where match_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), 0, dtype=torch.int64
+            )
+            # When no gt boxes exist, we define IOU = 0 and therefore set labels
+            # to `self.labels[0]`, which usually defaults to background class 0
+            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+
+        assert torch.all(match_quality_matrix >= 0)
+
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+        for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+        return matches, match_labels
+
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches.
+        Specifically, for each ground-truth G find the set of predictions that have
+        maximum overlap with it (including ties); for each prediction in that set, if
+        it is unmatched, then match it to the ground-truth G.
+
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of the
+        Faster R-CNN paper: https://arxiv.org/pdf/1506.01497v3.pdf.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the matches qualities must be positive due to the use of
+        # `torch.nonzero`.
+        gt_pred_pairs_of_highest_quality = torch.nonzero(
+            match_quality_matrix == highest_quality_foreach_gt[:, None]
+        )
+        # Example gt_pred_pairs_of_highest_quality:
+        #   tensor([[    0, 39796],
+        #           [    1, 32055],
+        #           [    1, 32070],
+        #           [    2, 39190],
+        #           [    2, 40255],
+        #           [    3, 40390],
+        #           [    3, 41455],
+        #           [    4, 45470],
+        #           [    5, 45325],
+        #           [    5, 46390]])
+        # Each row is a (gt index, prediction index)
+        # Note how gt items 1, 2, 3, and 5 each have two ties
+
+        pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1]
+        match_labels[pred_inds_to_update] = 1
diff --git a/detectron2/modeling/meta_arch/__init__.py b/detectron2/modeling/meta_arch/__init__.py
new file mode 100644
index 0000000..96ef9b5
--- /dev/null
+++ b/detectron2/modeling/meta_arch/__init__.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from .build import META_ARCH_REGISTRY, build_model  # isort:skip
+
+from .panoptic_fpn import PanopticFPN
+
+# import all the meta_arch, so they will be registered
+from .rcnn import GeneralizedRCNN, ProposalNetwork
+from .retinanet import RetinaNet
+from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
diff --git a/detectron2/modeling/meta_arch/build.py b/detectron2/modeling/meta_arch/build.py
new file mode 100644
index 0000000..34bd4b7
--- /dev/null
+++ b/detectron2/modeling/meta_arch/build.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.utils.registry import Registry
+
+META_ARCH_REGISTRY = Registry("META_ARCH")  # noqa F401 isort:skip
+META_ARCH_REGISTRY.__doc__ = """
+Registry for meta-architectures, i.e. the whole model.
+
+The registered object will be called with `obj(cfg)`
+and expected to return a `nn.Module` object.
+"""  # the text above is assigned as the docstring of this registry instance
+
+
+def build_model(cfg):
+    """
+    Build the whole model, defined by `cfg.MODEL.META_ARCHITECTURE`.
+    """
+    meta_arch = cfg.MODEL.META_ARCHITECTURE  # key used to look the model up in the registry
+    return META_ARCH_REGISTRY.get(meta_arch)(cfg)  # call the registered object with cfg
diff --git a/detectron2/modeling/meta_arch/panoptic_fpn.py b/detectron2/modeling/meta_arch/panoptic_fpn.py
new file mode 100644
index 0000000..fc47904
--- /dev/null
+++ b/detectron2/modeling/meta_arch/panoptic_fpn.py
@@ -0,0 +1,216 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+from torch import nn
+
+from detectron2.structures import ImageList
+
+from ..backbone import build_backbone
+from ..postprocessing import detector_postprocess, sem_seg_postprocess
+from ..proposal_generator import build_proposal_generator
+from ..roi_heads import build_roi_heads
+from .build import META_ARCH_REGISTRY
+from .semantic_seg import build_sem_seg_head
+
+__all__ = ["PanopticFPN"]
+
+
+@META_ARCH_REGISTRY.register()
+class PanopticFPN(nn.Module):
+    """
+    Main class for Panoptic FPN architectures (see https://arxiv.org/abs/1901.02446).
+    """
+
+    def __init__(self, cfg):
+        """Build all sub-networks from `cfg` and move the module to `cfg.MODEL.DEVICE`."""
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
+
+        # options when combining instance & semantic outputs
+        self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
+        self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
+        self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
+        self.combine_instances_confidence_threshold = (
+            cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH
+        )
+
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+        self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
+        self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
+
+        # view(-1, 1, 1): one value per channel, broadcast against (C, H, W) images.
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+        For now, each item in the list is a dict that contains:
+            image: Tensor, image in (C, H, W) format.
+            instances: Instances
+            sem_seg: semantic segmentation ground truth.
+            Other information that's included in the original dicts, such as:
+                "height", "width" (int): the output resolution of the model, used in inference.
+                    Default to the matching entry of `images.image_sizes` when absent.
+
+        Returns:
+            In training mode, a dict with the semantic, detector and proposal losses.
+            Otherwise list[dict]: each dict is the results for one image, with keys:
+                "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
+                    See the return value of
+                    :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        features = self.backbone(images.tensor)
+
+        if "proposals" in batched_inputs[0]:  # precomputed proposals supplied with the inputs
+            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+            proposal_losses = {}
+
+        if "sem_seg" in batched_inputs[0]:
+            gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
+            gt_sem_seg = ImageList.from_tensors(
+                gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+            ).tensor
+        else:
+            gt_sem_seg = None
+        sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        if self.proposal_generator:  # overrides any precomputed proposals read above
+            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        detector_results, detector_losses = self.roi_heads(
+            images, features, proposals, gt_instances
+        )
+
+        if self.training:
+            losses = {}
+            losses.update(sem_seg_losses)
+            losses.update({k: v * self.instance_loss_weight for k, v in detector_losses.items()})
+            losses.update(proposal_losses)
+            return losses
+
+        processed_results = []
+        for sem_seg_result, detector_result, input_per_image, image_size in zip(
+            sem_seg_results, detector_results, batched_inputs, images.image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])  # never pass None downstream
+            width = input_per_image.get("width", image_size[1])
+            sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
+            detector_r = detector_postprocess(detector_result, height, width)
+
+            processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
+
+            if self.combine_on:
+                panoptic_r = combine_semantic_and_instance_outputs(
+                    detector_r,
+                    sem_seg_r.argmax(dim=0),
+                    self.combine_overlap_threshold,
+                    self.combine_stuff_area_limit,
+                    self.combine_instances_confidence_threshold,
+                )
+                processed_results[-1]["panoptic_seg"] = panoptic_r
+        return processed_results
+
+
+def combine_semantic_and_instance_outputs(
+    instance_results,
+    semantic_results,
+    overlap_threshold,
+    stuff_area_limit,
+    instances_confidence_threshold,
+):
+    """
+    Implement a simple combining logic following "combine_semantic_and_instance_predictions.py"
+    in panopticapi to produce panoptic segmentation outputs.
+
+    Args:
+        instance_results: output of :func:`detector_postprocess`.
+        semantic_results: an (H, W) tensor, each is the contiguous semantic category id
+        overlap_threshold: skip an instance if a larger fraction of its mask is already taken.
+        stuff_area_limit, instances_confidence_threshold: minimum stuff area / instance score.
+
+    Returns:
+        panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+        segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+            Each dict has "id", "category_id", "isthing", plus "score"/"instance_id" or "area".
+    """
+    panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)  # 0 = unassigned
+
+    # sort instance outputs by scores
+    sorted_inds = torch.argsort(-instance_results.scores)  # negated => descending order
+
+    current_segment_id = 0  # incremented before each use, so assigned ids start at 1
+    segments_info = []
+
+    instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
+
+    # Add instances one-by-one, check for overlaps with existing ones
+    for inst_id in sorted_inds:
+        score = instance_results.scores[inst_id].item()
+        if score < instances_confidence_threshold:
+            break  # sorted by score, so every remaining instance is below the threshold too
+        mask = instance_masks[inst_id]  # H,W
+        mask_area = mask.sum().item()
+
+        if mask_area == 0:
+            continue
+
+        intersect = (mask > 0) & (panoptic_seg > 0)
+        intersect_area = intersect.sum().item()
+
+        if intersect_area * 1.0 / mask_area > overlap_threshold:
+            continue  # too much of this mask is covered by segments placed earlier
+
+        if intersect_area > 0:
+            mask = mask & (panoptic_seg == 0)  # keep only the pixels that are still free
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": True,
+                "score": score,
+                "category_id": instance_results.pred_classes[inst_id].item(),
+                "instance_id": inst_id.item(),
+            }
+        )
+
+    # Add semantic results to remaining empty areas
+    semantic_labels = torch.unique(semantic_results).cpu().tolist()
+    for semantic_label in semantic_labels:
+        if semantic_label == 0:  # 0 is a special "thing" class
+            continue
+        mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
+        mask_area = mask.sum().item()
+        if mask_area < stuff_area_limit:
+            continue  # the still-free part of this semantic region is too small to keep
+
+        current_segment_id += 1
+        panoptic_seg[mask] = current_segment_id
+        segments_info.append(
+            {
+                "id": current_segment_id,
+                "isthing": False,
+                "category_id": semantic_label,
+                "area": mask_area,
+            }
+        )
+
+    return panoptic_seg, segments_info
diff --git a/detectron2/modeling/meta_arch/rcnn.py b/detectron2/modeling/meta_arch/rcnn.py
new file mode 100644
index 0000000..5fefe64
--- /dev/null
+++ b/detectron2/modeling/meta_arch/rcnn.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import torch
+from torch import nn
+
+from detectron2.structures import ImageList
+from detectron2.utils.logger import log_first_n
+
+from ..backbone import build_backbone
+from ..postprocessing import detector_postprocess
+from ..proposal_generator import build_proposal_generator
+from ..roi_heads import build_roi_heads
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["GeneralizedRCNN", "ProposalNetwork"]
+
+
+@META_ARCH_REGISTRY.register()
+class GeneralizedRCNN(nn.Module):
+    """
+    Generalized R-CNN. Any model that contains the following three components:
+    1. Per-image feature extraction (aka backbone)
+    2. Region proposal generation
+    3. Per-region feature extraction and prediction
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.device = torch.device(cfg.MODEL.DEVICE)
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+        self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
+
+        assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)  # one mean/std per channel
+        num_channels = len(cfg.MODEL.PIXEL_MEAN)
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(num_channels, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(num_channels, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances (optional): groundtruth :class:`Instances`
+                * proposals (optional): :class:`Instances`, precomputed proposals.
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                    See :meth:`inference` for how they are used.
+
+        Returns:
+            In training: a dict merging the detector losses and the proposal losses.
+            In inference: list[dict], each dict is the output for one input image.
+                The dict contains one key "instances" whose value is a :class:`Instances`.
+                The :class:`Instances` object has the following keys:
+                    "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
+        """
+        if not self.training:
+            return self.inference(batched_inputs)
+
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:  # legacy key, still accepted with a warning
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+
+        if self.proposal_generator:
+            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        else:
+            assert "proposals" in batched_inputs[0]  # no generator: inputs must carry proposals
+            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+            proposal_losses = {}
+
+        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
+
+        losses = {}
+        losses.update(detector_losses)
+        losses.update(proposal_losses)
+        return losses
+
+    def inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
+        """
+        Run inference on the given inputs.
+
+        Args:
+            batched_inputs (list[dict]): same as in :meth:`forward`
+            detected_instances (None or list[Instances]): if not None, it
+                contains an `Instances` object per image. The `Instances`
+                object contains "pred_boxes" and "pred_classes" which are
+                known boxes in the image.
+                The inference will then skip the detection of bounding boxes,
+                and only predict other per-ROI outputs.
+            do_postprocess (bool): whether to apply post-processing on the outputs.
+
+        Returns:
+            same as in :meth:`forward`; the raw `roi_heads` results if `do_postprocess` is False.
+        """
+        assert not self.training
+
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if detected_instances is None:
+            if self.proposal_generator:
+                proposals, _ = self.proposal_generator(images, features, None)
+            else:
+                assert "proposals" in batched_inputs[0]
+                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+
+            results, _ = self.roi_heads(images, features, proposals, None)
+        else:
+            detected_instances = [x.to(self.device) for x in detected_instances]
+            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
+
+        if do_postprocess:
+            processed_results = []
+            for results_per_image, input_per_image, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])  # default: image_sizes entry
+                width = input_per_image.get("width", image_size[1])
+                r = detector_postprocess(results_per_image, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+        else:
+            return results
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+
+@META_ARCH_REGISTRY.register()
+class ProposalNetwork(nn.Module):  # a backbone plus a proposal generator; no ROI heads
+    def __init__(self, cfg):
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.backbone = build_backbone(cfg)
+        self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
+
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            Same as in :class:`GeneralizedRCNN.forward`
+
+        Returns:
+            In training, the proposal losses. Otherwise list[dict], one dict per input
+                image. The dict contains one key "proposals" whose value is a
+                :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:  # legacy key, still accepted with a warning
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+        # In training, the proposals are not useful at all but we generate them anyway.
+        # This makes RPN-only models about 5% slower.
+        if self.training:
+            return proposal_losses
+
+        processed_results = []
+        for results_per_image, input_per_image, image_size in zip(
+            proposals, batched_inputs, images.image_sizes
+        ):
+            height = input_per_image.get("height", image_size[0])  # default: image_sizes entry
+            width = input_per_image.get("width", image_size[1])
+            r = detector_postprocess(results_per_image, height, width)
+            processed_results.append({"proposals": r})
+        return processed_results
diff --git a/detectron2/modeling/meta_arch/retinanet.py b/detectron2/modeling/meta_arch/retinanet.py
new file mode 100644
index 0000000..2e9a159
--- /dev/null
+++ b/detectron2/modeling/meta_arch/retinanet.py
@@ -0,0 +1,430 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import math
+from typing import List
+import torch
+from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss
+from torch import nn
+
+from detectron2.layers import ShapeSpec, batched_nms, cat
+from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
+from detectron2.utils.logger import log_first_n
+
+from ..anchor_generator import build_anchor_generator
+from ..backbone import build_backbone
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..postprocessing import detector_postprocess
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["RetinaNet"]
+
+
+def permute_to_N_HWA_K(tensor, K):
+    """
+    Transpose/reshape a tensor from (N, (A x K), H, W) to (N, (HxWxA), K)
+    """
+    assert tensor.dim() == 4, tensor.shape  # the shape is shown if the assertion fails
+    N, _, H, W = tensor.shape
+    tensor = tensor.view(N, -1, K, H, W)  # split the channel axis: (N, A, K, H, W)
+    tensor = tensor.permute(0, 3, 4, 1, 2)  # reorder to (N, H, W, A, K)
+    tensor = tensor.reshape(N, -1, K)  # Size=(N,HWA,K)
+    return tensor
+
+
+def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80):
+    """
+    Rearrange the tensor layout from the network output, i.e.:
+    list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi)
+    to per-image predictions, i.e.:
+    Tensor: of shape (N x sum(Hi x Wi x A), K)
+    """
+    # for each feature level, permute the outputs to make them be in the
+    # same format as the labels. Note that the labels are computed for
+    # all feature levels concatenated, so we keep the same representation
+    # for the objectness and the box_delta
+    box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls]
+    box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta]
+    # concatenate along dim 1 (the Hi x Wi x A axis of each feature level), to
+    # take into account the way the labels were generated (with all feature maps
+    # being concatenated as well), then merge the image and anchor axes
+    box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes)
+    box_delta = cat(box_delta_flattened, dim=1).view(-1, 4)
+    return box_cls, box_delta
+
+
+@META_ARCH_REGISTRY.register()
+class RetinaNet(nn.Module):
+    """
+    Implement RetinaNet (https://arxiv.org/abs/1708.02002).
+    """
+
+    def __init__(self, cfg):
+        """Build backbone, head, anchor generator and matcher from `cfg`; move to its device."""
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        # fmt: off
+        self.num_classes              = cfg.MODEL.RETINANET.NUM_CLASSES
+        self.in_features              = cfg.MODEL.RETINANET.IN_FEATURES
+        # Loss parameters:
+        self.focal_loss_alpha         = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
+        self.focal_loss_gamma         = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
+        self.smooth_l1_loss_beta      = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
+        # Inference parameters:
+        self.score_threshold          = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
+        self.topk_candidates          = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
+        self.nms_threshold            = cfg.MODEL.RETINANET.NMS_THRESH_TEST
+        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
+        # fmt: on
+
+        self.backbone = build_backbone(cfg)
+
+        backbone_shape = self.backbone.output_shape()
+        feature_shapes = [backbone_shape[f] for f in self.in_features]  # only the levels used
+        self.head = RetinaNetHead(cfg, feature_shapes)
+        self.anchor_generator = build_anchor_generator(cfg, feature_shapes)
+
+        # Matching and loss
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        self.matcher = Matcher(
+            cfg.MODEL.RETINANET.IOU_THRESHOLDS,
+            cfg.MODEL.RETINANET.IOU_LABELS,
+            allow_low_quality_matches=True,
+        )
+        # view(-1, 1, 1): one value per channel, broadcast against (C, H, W) images.
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+                For now, each item in the list is a dict that contains:
+
+                * image: Tensor, image in (C, H, W) format.
+                * instances: Instances
+
+                Other information that's included in the original dicts, such as:
+
+                * "height", "width" (int): the output resolution of the model, used in inference.
+                    Default to the matching entry of `images.image_sizes` when absent.
+        Returns:
+            In training, dict[str: Tensor]: mapping from a named loss to a tensor storing the
+                loss. In inference, list[dict]: one dict per image with the key "instances".
+        """
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:  # legacy key, still accepted with a warning
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+        features = [features[f] for f in self.in_features]  # keep only the configured levels
+        box_cls, box_delta = self.head(features)
+        anchors = self.anchor_generator(features)
+
+        if self.training:
+            gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances)
+            return self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta)
+        else:
+            results = self.inference(box_cls, box_delta, anchors, images)
+            processed_results = []
+            for results_per_image, input_per_image, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+                r = detector_postprocess(results_per_image, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+
+    def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits, pred_anchor_deltas):
+        """
+        Args:
+            For `gt_classes` and `gt_anchors_deltas` parameters, see
+                :meth:`RetinaNet.get_ground_truth`.
+            Their shapes are (N, R) and (N, R, 4), respectively, where R is
+            the total number of anchors across levels, i.e. sum(Hi x Wi x A)
+            For `pred_class_logits` and `pred_anchor_deltas`, see
+                :meth:`RetinaNetHead.forward`.
+
+        Returns:
+            dict[str: Tensor]:
+                mapping from a named loss to a scalar tensor
+                storing the loss. Used during training only. The dict keys are:
+                "loss_cls" and "loss_box_reg"
+        """
+        pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
+            pred_class_logits, pred_anchor_deltas, self.num_classes
+        )  # Shapes: (N x R, K) and (N x R, 4), respectively.
+
+        gt_classes = gt_classes.flatten()
+        gt_anchors_deltas = gt_anchors_deltas.view(-1, 4)
+
+        valid_idxs = gt_classes >= 0  # excludes anchors labeled -1
+        foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes)
+        num_foreground = foreground_idxs.sum()
+        # One-hot targets: only foreground rows get a 1, every other row stays all-zero.
+        gt_classes_target = torch.zeros_like(pred_class_logits)
+        gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1
+
+        # logits loss
+        loss_cls = sigmoid_focal_loss_jit(
+            pred_class_logits[valid_idxs],
+            gt_classes_target[valid_idxs],
+            alpha=self.focal_loss_alpha,
+            gamma=self.focal_loss_gamma,
+            reduction="sum",
+        ) / max(1, num_foreground)  # max(1, .) avoids dividing by zero with no foreground
+
+        # regression loss
+        loss_box_reg = smooth_l1_loss(
+            pred_anchor_deltas[foreground_idxs],
+            gt_anchors_deltas[foreground_idxs],
+            beta=self.smooth_l1_loss_beta,
+            reduction="sum",
+        ) / max(1, num_foreground)
+
+        return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
+
+    @torch.no_grad()
+    def get_ground_truth(self, anchors, targets):
+        """
+        Args:
+            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
+                list of #feature level Boxes. The Boxes contains anchors of
+                this image on the specific feature level.
+            targets (list[Instances]): a list of N `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+
+        Returns:
+            gt_classes (Tensor):
+                An integer tensor of shape (N, R) storing ground-truth
+                labels for each anchor.
+                R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels.
+                Anchors with an IoU with some target higher than the foreground threshold
+                are assigned their corresponding label in the [0, K-1] range.
+                Anchors whose IoU are below the background threshold are assigned
+                the label "K". Anchors whose IoU are between the foreground and background
+                thresholds are assigned a label "-1", i.e. ignore.
+            gt_anchors_deltas (Tensor):
+                Shape (N, R, 4).
+                The last dimension represents ground-truth box2box transform
+                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
+                The values in the tensor are meaningful only when the corresponding
+                anchor is labeled as foreground.
+        """
+        gt_classes = []
+        gt_anchors_deltas = []
+        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
+        # list[Boxes], one for each image, each with all feature levels concatenated
+
+        for anchors_per_image, targets_per_image in zip(anchors, targets):
+            match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors_per_image)
+            gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix)
+
+            # ground truth box regression
+            matched_gt_boxes = targets_per_image[gt_matched_idxs].gt_boxes
+            gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas(
+                anchors_per_image.tensor, matched_gt_boxes.tensor
+            )
+
+            # ground truth classes
+            has_gt = len(targets_per_image) > 0
+            if has_gt:
+                gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs]
+                # Anchors with label 0 are treated as background.
+                gt_classes_i[anchor_labels == 0] = self.num_classes
+                # Anchors with label -1 are ignored.
+                gt_classes_i[anchor_labels == -1] = -1
+            else:  # no ground truth in this image: every anchor gets the background label
+                gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes
+
+            gt_classes.append(gt_classes_i)
+            gt_anchors_deltas.append(gt_anchors_reg_deltas_i)
+
+        return torch.stack(gt_classes), torch.stack(gt_anchors_deltas)
+
+    def inference(self, box_cls, box_delta, anchors, images):
+        """
+        Arguments:
+            box_cls, box_delta: Same as the output of :meth:`RetinaNetHead.forward`
+            anchors (list[list[Boxes]]): a list of #images elements. Each is a
+                list of #feature level Boxes. The Boxes contain anchors of this
+                image on the specific feature level.
+            images (ImageList): the input images
+
+        Returns:
+            results (List[Instances]): a list of #images elements.
+        """
+        assert len(anchors) == len(images)  # one list of per-level anchors per image
+        results = []
+
+        box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls]
+        box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta]
+        # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4)
+
+        for img_idx, anchors_per_image in enumerate(anchors):
+            image_size = images.image_sizes[img_idx]
+            box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls]
+            box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta]
+            results_per_image = self.inference_single_image(
+                box_cls_per_image, box_reg_per_image, anchors_per_image, tuple(image_size)
+            )
+            results.append(results_per_image)
+        return results
+
+    def inference_single_image(self, box_cls, box_delta, anchors, image_size):
+        """
+        Single-image inference. Return bounding-box detection results by thresholding
+        on scores and applying non-maximum suppression (NMS).
+
+        Arguments:
+            box_cls (list[Tensor]): list of #feature levels. Each entry contains
+                tensor of size (H x W x A, K)
+            box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
+            anchors (list[Boxes]): list of #feature levels. Each entry contains
+                a Boxes object, which contains all the anchors for that
+                image in that feature level.
+            image_size (tuple(H, W)): a tuple of the image height and width.
+
+        Returns:
+            Same as `inference`, but for only one image.
+        """
+        boxes_all = []
+        scores_all = []
+        class_idxs_all = []
+
+        # Iterate over every feature level
+        for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors):
+            # (HxWxAxK,)
+            box_cls_i = box_cls_i.flatten().sigmoid_()
+
+            # Keep top k top scoring indices only.
+            num_topk = min(self.topk_candidates, box_reg_i.size(0))
+            # torch.sort is actually faster than .topk (at least on GPUs)
+            predicted_prob, topk_idxs = box_cls_i.sort(descending=True)
+            predicted_prob = predicted_prob[:num_topk]
+            topk_idxs = topk_idxs[:num_topk]
+
+            # filter out the proposals with low confidence score
+            keep_idxs = predicted_prob > self.score_threshold
+            predicted_prob = predicted_prob[keep_idxs]
+            topk_idxs = topk_idxs[keep_idxs]
+
+            anchor_idxs = topk_idxs // self.num_classes
+            classes_idxs = topk_idxs % self.num_classes
+
+            box_reg_i = box_reg_i[anchor_idxs]
+            anchors_i = anchors_i[anchor_idxs]
+            # predict boxes
+            predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor)
+
+            boxes_all.append(predicted_boxes)
+            scores_all.append(predicted_prob)
+            class_idxs_all.append(classes_idxs)
+
+        boxes_all, scores_all, class_idxs_all = [
+            cat(x) for x in [boxes_all, scores_all, class_idxs_all]
+        ]
+        keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold)
+        keep = keep[: self.max_detections_per_image]
+
+        result = Instances(image_size)
+        result.pred_boxes = Boxes(boxes_all[keep])
+        result.scores = scores_all[keep]
+        result.pred_classes = class_idxs_all[keep]
+        return result
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+
+class RetinaNetHead(nn.Module):
+    """
+    The head used in RetinaNet for object classification and box regression.
+    It has two subnets for the two tasks, with a common structure but separate parameters.
+    """
+
+    def __init__(self, cfg, input_shape: List[ShapeSpec]):
+        super().__init__()
+        # fmt: off
+        in_channels      = input_shape[0].channels
+        num_classes      = cfg.MODEL.RETINANET.NUM_CLASSES
+        num_convs        = cfg.MODEL.RETINANET.NUM_CONVS
+        prior_prob       = cfg.MODEL.RETINANET.PRIOR_PROB
+        num_anchors      = build_anchor_generator(cfg, input_shape).num_cell_anchors
+        # fmt: on
+        assert (
+            len(set(num_anchors)) == 1
+        ), "Using different number of anchors between levels is not currently supported!"
+        num_anchors = num_anchors[0]
+
+        cls_subnet = []
+        bbox_subnet = []
+        for _ in range(num_convs):
+            cls_subnet.append(
+                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cls_subnet.append(nn.ReLU())
+            bbox_subnet.append(
+                nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+            )
+            bbox_subnet.append(nn.ReLU())
+
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.cls_score = nn.Conv2d(
+            in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
+        )
+        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)
+
+        # Initialization
+        for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        # Use prior in model initialization to improve stability
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        torch.nn.init.constant_(self.cls_score.bias, bias_value)
+
+    def forward(self, features):
+        """
+        Arguments:
+            features (list[Tensor]): FPN feature map tensors in high to low resolution.
+                Each tensor in the list corresponds to a different feature level.
+
+        Returns:
+            logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+                The tensor predicts the classification probability
+                at each spatial position for each of the A anchors and K object
+                classes.
+            bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+                The tensor predicts 4-vector (dx,dy,dw,dh) box
+                regression values for every anchor. These values are the
+                relative offset between the anchor and the ground truth box.
+        """
+        logits = []
+        bbox_reg = []
+        for feature in features:
+            logits.append(self.cls_score(self.cls_subnet(feature)))
+            bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
+        return logits, bbox_reg
diff --git a/detectron2/modeling/meta_arch/semantic_seg.py b/detectron2/modeling/meta_arch/semantic_seg.py
new file mode 100644
index 0000000..722a18f
--- /dev/null
+++ b/detectron2/modeling/meta_arch/semantic_seg.py
@@ -0,0 +1,170 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import Dict
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec
+from detectron2.structures import ImageList
+from detectron2.utils.registry import Registry
+
+from ..backbone import build_backbone
+from ..postprocessing import sem_seg_postprocess
+from .build import META_ARCH_REGISTRY
+
+__all__ = ["SemanticSegmentor", "SEM_SEG_HEADS_REGISTRY", "SemSegFPNHead", "build_sem_seg_head"]
+
+
+SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
+"""
+Registry for semantic segmentation heads, which make semantic segmentation predictions
+from feature maps.
+"""
+
+
+@META_ARCH_REGISTRY.register()
+class SemanticSegmentor(nn.Module):
+    """
+    Main class for semantic segmentation architectures.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.backbone = build_backbone(cfg)
+        self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
+
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                Each item in the list contains the inputs for one image.
+
+        For now, each item in the list is a dict that contains:
+            image: Tensor, image in (C, H, W) format.
+            sem_seg: semantic segmentation ground truth
+            Other information that's included in the original dicts, such as:
+                "height", "width" (int): the output resolution of the model, used in inference.
+                    See :func:`sem_seg_postprocess` for details.
+
+        Returns:
+            list[dict]: Each dict is the output for one input image.
+                The dict contains one key "sem_seg" whose value is a
+                Tensor of the output resolution that represents the
+                per-pixel segmentation prediction.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+
+        features = self.backbone(images.tensor)
+
+        if "sem_seg" in batched_inputs[0]:
+            targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
+            targets = ImageList.from_tensors(
+                targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value
+            ).tensor
+        else:
+            targets = None
+        results, losses = self.sem_seg_head(features, targets)
+
+        if self.training:
+            return losses
+
+        processed_results = []
+        for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
+            height = input_per_image.get("height")
+            width = input_per_image.get("width")
+            r = sem_seg_postprocess(result, image_size, height, width)
+            processed_results.append({"sem_seg": r})
+        return processed_results
+
+
+def build_sem_seg_head(cfg, input_shape):
+    """
+    Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
+    """
+    name = cfg.MODEL.SEM_SEG_HEAD.NAME
+    return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+@SEM_SEG_HEADS_REGISTRY.register()
+class SemSegFPNHead(nn.Module):
+    """
+    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
+    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
+    all levels of the FPN into a single output.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__()
+
+        # fmt: off
+        self.in_features      = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
+        feature_strides       = {k: v.stride for k, v in input_shape.items()}
+        feature_channels      = {k: v.channels for k, v in input_shape.items()}
+        self.ignore_value     = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
+        num_classes           = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
+        conv_dims             = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
+        self.common_stride    = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE
+        norm                  = cfg.MODEL.SEM_SEG_HEAD.NORM
+        self.loss_weight      = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT
+        # fmt: on
+
+        self.scale_heads = []
+        for in_feature in self.in_features:
+            head_ops = []
+            head_length = max(
+                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
+            )
+            for k in range(head_length):
+                norm_module = nn.GroupNorm(32, conv_dims) if norm == "GN" else None
+                conv = Conv2d(
+                    feature_channels[in_feature] if k == 0 else conv_dims,
+                    conv_dims,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not norm,
+                    norm=norm_module,
+                    activation=F.relu,
+                )
+                weight_init.c2_msra_fill(conv)
+                head_ops.append(conv)
+                if feature_strides[in_feature] != self.common_stride:
+                    head_ops.append(
+                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+                    )
+            self.scale_heads.append(nn.Sequential(*head_ops))
+            self.add_module(in_feature, self.scale_heads[-1])
+        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+        weight_init.c2_msra_fill(self.predictor)
+
+    def forward(self, features, targets=None):
+        for i, f in enumerate(self.in_features):
+            if i == 0:
+                x = self.scale_heads[i](features[f])
+            else:
+                x = x + self.scale_heads[i](features[f])
+        x = self.predictor(x)
+        x = F.interpolate(x, scale_factor=self.common_stride, mode="bilinear", align_corners=False)
+
+        if self.training:
+            losses = {}
+            losses["loss_sem_seg"] = (
+                F.cross_entropy(x, targets, reduction="mean", ignore_index=self.ignore_value)
+                * self.loss_weight
+            )
+            return [], losses
+        else:
+            return x, {}
diff --git a/detectron2/modeling/poolers.py b/detectron2/modeling/poolers.py
new file mode 100644
index 0000000..9e1c863
--- /dev/null
+++ b/detectron2/modeling/poolers.py
@@ -0,0 +1,235 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import math
+import sys
+import torch
+from torch import nn
+from torchvision.ops import RoIPool
+
+from detectron2.layers import ROIAlign, ROIAlignRotated, cat
+
+__all__ = ["ROIPooler"]
+
+
+def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level):
+    """
+    Map each box in `box_lists` to a feature map level index and return the assignment
+    vector.
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
+            where N is the number of images in the batch.
+        min_level (int): Smallest feature map level index. The input is considered index 0,
+            the output of stage 1 is index 1, and so on.
+        max_level (int): Largest feature map level index.
+        canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
+        canonical_level (int): The feature map level index on which a canonically-sized box
+            should be placed.
+
+    Returns:
+        A tensor of length M, where M is the total number of boxes aggregated over all
+            N batch images. The memory layout corresponds to the concatenation of boxes
+            from all images. Each element is the feature map index, as an offset from
+            `min_level`, for the corresponding box (so value i means the box is at
+            `min_level + i`).
+    """
+    eps = sys.float_info.epsilon
+    box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
+    # Eqn.(1) in FPN paper
+    level_assignments = torch.floor(
+        canonical_level + torch.log2(box_sizes / canonical_box_size + eps)
+    )
+    # clamp level to (min, max), in case the box size is too large or too small
+    # for the available feature maps
+    level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
+    return level_assignments.to(torch.int64) - min_level
+
+
+def convert_boxes_to_pooler_format(box_lists):
+    """
+    Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
+    (see description under Returns).
+
+    Args:
+        box_lists (list[Boxes] | list[RotatedBoxes]):
+            A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+
+    Returns:
+        When input is list[Boxes]:
+            A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 5 columns are (batch index, x0, y0, x1, y1), where batch index
+            is the index in [0, N) identifying which batch image the box with corners at
+            (x0, y0, x1, y1) comes from.
+        When input is list[RotatedBoxes]:
+            A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
+            N batch images.
+            The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
+            where batch index is the index in [0, N) identifying which batch image the
+            rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
+    """
+
+    def fmt_box_list(box_tensor, batch_index):
+        repeated_index = torch.full(
+            (len(box_tensor), 1), batch_index, dtype=box_tensor.dtype, device=box_tensor.device
+        )
+        return cat((repeated_index, box_tensor), dim=1)
+
+    pooler_fmt_boxes = cat(
+        [fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)], dim=0
+    )
+
+    return pooler_fmt_boxes
+
+
+class ROIPooler(nn.Module):
+    """
+    Region of interest feature map pooler that supports pooling from one or more
+    feature maps.
+    """
+
+    def __init__(
+        self,
+        output_size,
+        scales,
+        sampling_ratio,
+        pooler_type,
+        canonical_box_size=224,
+        canonical_level=4,
+    ):
+        """
+        Args:
+            output_size (int, tuple[int] or list[int]): output size of the pooled region,
+                e.g., 14 x 14. If tuple or list is given, the length must be 2.
+            scales (list[float]): The scale for each low-level pooling op relative to
+                the input image. For a feature map with stride s relative to the input
+                image, scale is defined as 1 / s. The stride must be power of 2.
+                When there are multiple scales, they must form a pyramid, i.e. they must be
+                a monotonically decreasing geometric sequence with a factor of 1/2.
+            sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
+            pooler_type (string): Name of the type of pooling operation that should be applied.
+                For instance, "ROIPool" or "ROIAlignV2".
+            canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
+                is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
+                pre-training).
+            canonical_level (int): The feature map level index on which a canonically-sized box
+                should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
+                i.e., a box of size 224x224 will be placed on the feature with stride=16.
+                The box placement for all boxes will be determined from their sizes w.r.t
+                canonical_box_size. For example, a box whose area is 4x that of a canonical box
+                should be used to pool features from feature level ``canonical_level+1``.
+
+                Note that the actual input feature maps given to this module may not have
+                sufficiently many levels for the input boxes. If the boxes are too large or too
+                small for the input feature maps, the closest level will be used.
+        """
+        super().__init__()
+
+        if isinstance(output_size, int):
+            output_size = (output_size, output_size)
+        assert len(output_size) == 2
+        assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
+        self.output_size = output_size
+
+        if pooler_type == "ROIAlign":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIAlignV2":
+            self.level_poolers = nn.ModuleList(
+                ROIAlign(
+                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
+                )
+                for scale in scales
+            )
+        elif pooler_type == "ROIPool":
+            self.level_poolers = nn.ModuleList(
+                RoIPool(output_size, spatial_scale=scale) for scale in scales
+            )
+        elif pooler_type == "ROIAlignRotated":
+            self.level_poolers = nn.ModuleList(
+                ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
+                for scale in scales
+            )
+        else:
+            raise ValueError("Unknown pooler type: {}".format(pooler_type))
+
+        # Map scale (defined as 1 / stride) to its feature map level under the
+        # assumption that stride is a power of 2.
+        min_level = -math.log2(scales[0])
+        max_level = -math.log2(scales[-1])
+        assert math.isclose(min_level, int(min_level)) and math.isclose(
+            max_level, int(max_level)
+        ), "Featuremap stride is not power of 2!"
+        self.min_level = int(min_level)
+        self.max_level = int(max_level)
+        assert (
+            len(scales) == self.max_level - self.min_level + 1
+        ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
+        assert 0 < self.min_level and self.min_level <= self.max_level
+        if len(scales) > 1:
+            # When there is only one feature map, canonical_level is redundant and we should not
+            # require it to be a sensible value. Therefore we skip this assertion
+            assert self.min_level <= canonical_level and canonical_level <= self.max_level
+        self.canonical_level = canonical_level
+        assert canonical_box_size > 0
+        self.canonical_box_size = canonical_box_size
+
+    def forward(self, x, box_lists):
+        """
+        Args:
+            x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
+                used to construct this module.
+            box_lists (list[Boxes] | list[RotatedBoxes]):
+                A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
+                The box coordinates are defined on the original image and
+                will be scaled by the `scales` argument of :class:`ROIPooler`.
+
+        Returns:
+            Tensor:
+                A tensor of shape (M, C, H_out, W_out), with (H_out, W_out) = `output_size`, M the
+                total number of boxes over all N batch images and C the number of channels in `x`.
+        """
+        num_level_assignments = len(self.level_poolers)
+
+        assert isinstance(x, list) and isinstance(
+            box_lists, list
+        ), "Arguments to pooler must be lists"
+        assert (
+            len(x) == num_level_assignments
+        ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
+            num_level_assignments, len(x)
+        )
+
+        assert len(box_lists) == x[0].size(
+            0
+        ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
+            x[0].size(0), len(box_lists)
+        )
+
+        pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
+
+        if num_level_assignments == 1:
+            return self.level_poolers[0](x[0], pooler_fmt_boxes)
+
+        level_assignments = assign_boxes_to_levels(
+            box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
+        )
+
+        num_boxes = len(pooler_fmt_boxes)
+        num_channels = x[0].shape[1]
+        out_h, out_w = self.output_size
+        # Allocate with the full (h, w) so rectangular output sizes work across levels.
+        dtype, device = x[0].dtype, x[0].device
+        output = torch.zeros(
+            (num_boxes, num_channels, out_h, out_w), dtype=dtype, device=device
+        )
+
+        for level, (x_level, pooler) in enumerate(zip(x, self.level_poolers)):
+            inds = torch.nonzero(level_assignments == level).squeeze(1)
+            pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
+            output[inds] = pooler(x_level, pooler_fmt_boxes_level)
+
+        return output
diff --git a/detectron2/modeling/postprocessing.py b/detectron2/modeling/postprocessing.py
new file mode 100644
index 0000000..f65b73a
--- /dev/null
+++ b/detectron2/modeling/postprocessing.py
@@ -0,0 +1,78 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch.nn import functional as F
+
+from detectron2.layers import paste_masks_in_image
+from detectron2.structures import Instances
+
+
+def detector_postprocess(results, output_height, output_width, mask_threshold=0.5):
+    """
+    Resize the output instances.
+    The input images are often resized when entering an object detector.
+    As a result, we often need the outputs of the detector in a different
+    resolution from its inputs.
+
+    This function will resize the raw outputs of an R-CNN detector
+    to produce outputs according to the desired output resolution.
+
+    Args:
+        results (Instances): the raw outputs from the detector.
+            `results.image_size` contains the input image resolution the detector sees.
+            This object might be modified in-place.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        Instances: the resized output from the model, based on the output resolution
+    """
+    scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+    results = Instances((output_height, output_width), **results.get_fields())
+
+    if results.has("pred_boxes"):
+        output_boxes = results.pred_boxes
+    elif results.has("proposal_boxes"):
+        output_boxes = results.proposal_boxes
+
+    output_boxes.scale(scale_x, scale_y)
+    output_boxes.clip(results.image_size)
+
+    results = results[output_boxes.nonempty()]
+
+    if results.has("pred_masks"):
+        results.pred_masks = paste_masks_in_image(
+            results.pred_masks[:, 0, :, :],  # N, 1, M, M
+            results.pred_boxes,
+            results.image_size,
+            threshold=mask_threshold,
+        )
+
+    if results.has("pred_keypoints"):
+        results.pred_keypoints[:, :, 0] *= scale_x
+        results.pred_keypoints[:, :, 1] *= scale_y
+
+    return results
+
+
+def sem_seg_postprocess(result, img_size, output_height, output_width):
+    """
+    Return semantic segmentation predictions in the original resolution.
+
+    The input images are often resized when entering semantic segmentor. Moreover, in some
+    cases, they are also padded inside segmentor to be divisible by maximum network stride.
+    As a result, we often need the predictions of the segmentor in a different
+    resolution from its inputs.
+
+    Args:
+        result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
+            where C is the number of classes, and H, W are the height and width of the prediction.
+        img_size (tuple): image size that segmentor is taking as input.
+        output_height, output_width: the desired output resolution.
+
+    Returns:
+        semantic segmentation prediction (Tensor): A tensor of the shape
+            (C, output_height, output_width) that contains per-pixel soft predictions.
+    """
+    result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
+    result = F.interpolate(
+        result, size=(output_height, output_width), mode="bilinear", align_corners=False
+    )[0]
+    return result
diff --git a/detectron2/modeling/proposal_generator/__init__.py b/detectron2/modeling/proposal_generator/__init__.py
new file mode 100644
index 0000000..9eeeb3c
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
+from .rpn import RPN_HEAD_REGISTRY, build_rpn_head
diff --git a/detectron2/modeling/proposal_generator/build.py b/detectron2/modeling/proposal_generator/build.py
new file mode 100644
index 0000000..7f252bc
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/build.py
@@ -0,0 +1,24 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.utils.registry import Registry
+
+PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
+PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
+Registry for proposal generator, which produces object proposals from feature maps.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call should return a `nn.Module` object.
+"""
+
+from . import rpn, rrpn  # noqa F401 isort:skip
+
+
+def build_proposal_generator(cfg, input_shape):
+    """
+    Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
+    The name can be "PrecomputedProposals" to use no proposal generator.
+    """
+    name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
+    if name == "PrecomputedProposals":
+        return None
+
+    return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
diff --git a/detectron2/modeling/proposal_generator/proposal_utils.py b/detectron2/modeling/proposal_generator/proposal_utils.py
new file mode 100644
index 0000000..d4af905
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/proposal_utils.py
@@ -0,0 +1,57 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+import torch
+
+from detectron2.structures import Instances
+
+
+def add_ground_truth_to_proposals(gt_boxes, proposals):
+    """
+    Call `add_ground_truth_to_proposals_single_image` for all images.
+
+    Args:
+        gt_boxes(list[Boxes]): list of N elements. Element i is a Boxes
+            representing the ground-truth for image i.
+        proposals (list[Instances]): list of N elements. Element i is an Instances
+            representing the proposals for image i.
+
+    Returns:
+        list[Instances]: list of N Instances. Each is the proposals for the image,
+            with field "proposal_boxes" and "objectness_logits".
+    """
+    assert gt_boxes is not None
+
+    assert len(proposals) == len(gt_boxes)
+    if len(proposals) == 0:
+        return proposals
+
+    return [
+        add_ground_truth_to_proposals_single_image(gt_boxes_i, proposals_i)
+        for gt_boxes_i, proposals_i in zip(gt_boxes, proposals)
+    ]
+
+
+def add_ground_truth_to_proposals_single_image(gt_boxes, proposals):
+    """
+    Augment `proposals` with ground-truth boxes from `gt_boxes`.
+
+    Args:
+        Same as `add_ground_truth_to_proposals`, but with gt_boxes and proposals
+        per image.
+
+    Returns:
+        Same as `add_ground_truth_to_proposals`, but for only one image.
+    """
+    device = proposals.objectness_logits.device
+    # Concatenating gt_boxes with proposals requires them to have the same fields
+    # Assign all ground-truth boxes an objectness logit corresponding to P(object) \approx 1.
+    gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
+
+    gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)
+    gt_proposal = Instances(proposals.image_size)
+
+    gt_proposal.proposal_boxes = gt_boxes
+    gt_proposal.objectness_logits = gt_logits
+    new_proposals = Instances.cat([proposals, gt_proposal])
+
+    return new_proposals
diff --git a/detectron2/modeling/proposal_generator/rpn.py b/detectron2/modeling/proposal_generator/rpn.py
new file mode 100644
index 0000000..bd8df0f
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rpn.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import Dict, List
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.utils.registry import Registry
+
+from ..anchor_generator import build_anchor_generator
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .rpn_outputs import RPNOutputs, find_top_rpn_proposals
+
+RPN_HEAD_REGISTRY = Registry("RPN_HEAD")  # queried by name in `build_rpn_head`
+"""
+Registry for RPN heads, which take feature maps and perform
+objectness classification and bounding box regression for anchors.
+"""
+
+
+def build_rpn_head(cfg, input_shape):
+    """
+    Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
+    """
+    name = cfg.MODEL.RPN.HEAD_NAME
+    return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+@RPN_HEAD_REGISTRY.register()
+class StandardRPNHead(nn.Module):
+    """
+    RPN classification and regression heads. Uses a 3x3 conv to produce a shared
+    hidden state from which one 1x1 conv predicts objectness logits for each anchor
+    and a second 1x1 conv predicts bounding-box deltas specifying how to deform
+    each anchor into an object proposal.
+    """
+
+    def __init__(self, cfg, input_shape: List[ShapeSpec]):
+        super().__init__()
+
+        # Standard RPN is shared across levels:
+        in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
+        in_channels = in_channels[0]
+
+        # RPNHead should take the same input as anchor generator
+        # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
+        anchor_generator = build_anchor_generator(cfg, input_shape)
+        num_cell_anchors = anchor_generator.num_cell_anchors
+        box_dim = anchor_generator.box_dim
+        assert (
+            len(set(num_cell_anchors)) == 1
+        ), "Each level must have the same number of cell anchors"
+        num_cell_anchors = num_cell_anchors[0]
+
+        # 3x3 conv for the hidden representation
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+        # 1x1 conv for predicting objectness logits
+        self.objectness_logits = nn.Conv2d(in_channels, num_cell_anchors, kernel_size=1, stride=1)
+        # 1x1 conv for predicting box2box transform deltas
+        self.anchor_deltas = nn.Conv2d(
+            in_channels, num_cell_anchors * box_dim, kernel_size=1, stride=1
+        )
+
+        for l in [self.conv, self.objectness_logits, self.anchor_deltas]:
+            nn.init.normal_(l.weight, std=0.01)
+            nn.init.constant_(l.bias, 0)
+
+    def forward(self, features):
+        """
+        Args:
+            features (list[Tensor]): list of feature maps
+        """
+        pred_objectness_logits = []
+        pred_anchor_deltas = []
+        for x in features:
+            t = F.relu(self.conv(x))
+            pred_objectness_logits.append(self.objectness_logits(t))
+            pred_anchor_deltas.append(self.anchor_deltas(t))
+        return pred_objectness_logits, pred_anchor_deltas
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RPN(nn.Module):
+    """
+    Region Proposal Network, introduced by the Faster R-CNN paper.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__()
+
+        # fmt: off
+        self.min_box_side_len        = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE
+        self.in_features             = cfg.MODEL.RPN.IN_FEATURES
+        self.nms_thresh              = cfg.MODEL.RPN.NMS_THRESH
+        self.batch_size_per_image    = cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE
+        self.positive_fraction       = cfg.MODEL.RPN.POSITIVE_FRACTION
+        self.smooth_l1_beta          = cfg.MODEL.RPN.SMOOTH_L1_BETA
+        self.loss_weight             = cfg.MODEL.RPN.LOSS_WEIGHT
+        # fmt: on
+
+        # Map from self.training state to train/test settings
+        self.pre_nms_topk = {
+            True: cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
+            False: cfg.MODEL.RPN.PRE_NMS_TOPK_TEST,
+        }
+        self.post_nms_topk = {
+            True: cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
+            False: cfg.MODEL.RPN.POST_NMS_TOPK_TEST,
+        }
+        self.boundary_threshold = cfg.MODEL.RPN.BOUNDARY_THRESH
+
+        self.anchor_generator = build_anchor_generator(
+            cfg, [input_shape[f] for f in self.in_features]
+        )
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+        self.anchor_matcher = Matcher(
+            cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
+        )
+        self.rpn_head = build_rpn_head(cfg, [input_shape[f] for f in self.in_features])
+
+    def forward(self, images, features, gt_instances=None):
+        """
+        Args:
+            images (ImageList): input images of length `N`
+            features (dict[str: Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
+                Each `Instances` stores ground-truth instances for the corresponding image.
+
+        Returns:
+            proposals: list[Instances] or None
+            loss: dict[Tensor]
+        """
+        gt_boxes = [x.gt_boxes for x in gt_instances] if gt_instances is not None else None
+        del gt_instances
+        features = [features[f] for f in self.in_features]
+        pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
+        anchors = self.anchor_generator(features)
+        # TODO: The anchors only depend on the feature map shape; there's probably
+        # an opportunity for some optimizations (e.g., caching anchors).
+        outputs = RPNOutputs(
+            self.box2box_transform,
+            self.anchor_matcher,
+            self.batch_size_per_image,
+            self.positive_fraction,
+            images,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            anchors,
+            self.boundary_threshold,
+            gt_boxes,
+            self.smooth_l1_beta,
+        )
+
+        if self.training:
+            losses = {k: v * self.loss_weight for k, v in outputs.losses().items()}
+        else:
+            losses = {}
+
+        with torch.no_grad():
+            # Find the top proposals by applying NMS and removing boxes that
+            # are too small. The proposals are treated as fixed for approximate
+            # joint training with roi heads. This approach ignores the derivative
+            # w.r.t. the proposal boxes’ coordinates that are also network
+            # responses, so is approximate.
+            proposals = find_top_rpn_proposals(
+                outputs.predict_proposals(),
+                outputs.predict_objectness_logits(),
+                images,
+                self.nms_thresh,
+                self.pre_nms_topk[self.training],
+                self.post_nms_topk[self.training],
+                self.min_box_side_len,
+                self.training,
+            )
+            # For RPN-only models, the proposals are the final output and we return them in
+            # high-to-low confidence order.
+            # For end-to-end models, the RPN proposals are an intermediate state
+            # and this sorting is actually not needed. But the cost is negligible.
+            inds = [p.objectness_logits.sort(descending=True)[1] for p in proposals]
+            proposals = [p[ind] for p, ind in zip(proposals, inds)]
+
+        return proposals, losses
diff --git a/detectron2/modeling/proposal_generator/rpn_outputs.py b/detectron2/modeling/proposal_generator/rpn_outputs.py
new file mode 100644
index 0000000..7820f42
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rpn_outputs.py
@@ -0,0 +1,436 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import logging
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fvcore.nn import smooth_l1_loss
+
+from detectron2.layers import batched_nms, cat
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+
+from ..sampling import subsample_labels
+
+logger = logging.getLogger(__name__)
+
+# TODO: comments for future refactoring of this module
+#
+# From @rbg:
+# This code involves a significant amount of tensor reshaping and permuting. Look for
+# ways to simplify this.
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    L: number of feature maps per image on which RPN is run
+    A: number of cell anchors (must be the same for all feature maps)
+    Hi, Wi: height and width of the i-th feature map
+    4: size of the box parameterization
+
+Naming convention:
+
+    objectness: refers to the binary classification of an anchor as object vs. not
+    object.
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`).
+
+    pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
+        sigmoid(pred_objectness_logits) to estimate P(object).
+
+    gt_objectness_logits: ground-truth binary classification labels for objectness
+
+    pred_anchor_deltas: predicted box2box transform deltas
+
+    gt_anchor_deltas: ground-truth box2box transform deltas
+"""
+
+
+def find_top_rpn_proposals(
+    proposals,
+    pred_objectness_logits,
+    images,
+    nms_thresh,
+    pre_nms_topk,
+    post_nms_topk,
+    min_box_side_len,
+    training,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals of
+    every image. Then, per image: clip the selected boxes to the image, drop the
+    boxes rejected by the `min_box_side_len` check, run NMS with the feature-map
+    index as the batch id, and keep the first `post_nms_topk` indices that NMS
+    returns. The `training` flag and the batch size do not affect the result.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        images (ImageList): Input images as an :class:`ImageList`.
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
+            wrt input images).
+        training (bool): not used by this function. It is kept in the signature because
+            callers pass it; Detectron1 chose the top-k differently in training (see the
+            comment next to the final top-k selection below).
+
+    Returns:
+        proposals (list[Instances]): list of N Instances. The i-th Instances
+            stores at most post_nms_topk object proposals for image i.
+    """
+    image_sizes = images.image_sizes  # in (h, w) order
+    num_images = len(image_sizes)
+    device = proposals[0].device
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = torch.arange(num_images, device=device)
+    for level_id, proposals_i, logits_i in zip(
+        itertools.count(), proposals, pred_objectness_logits
+    ):
+        Hi_Wi_A = logits_i.shape[1]
+        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)
+
+        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
+        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+        logits_i, idx = logits_i.sort(descending=True, dim=1)
+        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
+        topk_idx = idx[batch_idx, :num_proposals_i]
+
+        # topk_scores_i and topk_idx are each N x topk; gather the matching boxes per image
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 4
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = Boxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        boxes.clip(image_size)
+
+        # filter empty boxes
+        keep = boxes.nonempty(threshold=min_box_side_len)
+        lvl = level_ids
+        if keep.sum().item() != len(boxes):  # index only when something was filtered out
+            boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], level_ids[keep]
+
+        keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+def rpn_losses(
+    gt_objectness_logits,
+    gt_anchor_deltas,
+    pred_objectness_logits,
+    pred_anchor_deltas,
+    smooth_l1_beta,
+):
+    """
+    Args:
+        gt_objectness_logits (Tensor): shape (N,), each element in {-1, 0, 1} representing
+            ground-truth objectness labels with: -1 = ignore; 0 = not object; 1 = object.
+        gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents ground-truth
+            box2box transform targets (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map anchor i to
+            its matched ground-truth box.
+        pred_objectness_logits (Tensor): shape (N,), each element is a predicted objectness
+            logit.
+        pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted box2box
+            transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da)
+        smooth_l1_beta (float): The transition point between L1 and L2 loss in
+            the smooth L1 loss function. When set to 0, the loss becomes L1. When
+            set to +inf, the loss becomes constant 0.
+
+    Returns:
+        objectness_loss, localization_loss, both unnormalized (summed over samples).
+    """
+    pos_masks = gt_objectness_logits == 1
+    localization_loss = smooth_l1_loss(
+        pred_anchor_deltas[pos_masks], gt_anchor_deltas[pos_masks], smooth_l1_beta, reduction="sum"
+    )
+
+    valid_masks = gt_objectness_logits >= 0
+    objectness_loss = F.binary_cross_entropy_with_logits(
+        pred_objectness_logits[valid_masks],
+        gt_objectness_logits[valid_masks].to(torch.float32),
+        reduction="sum",
+    )
+    return objectness_loss, localization_loss
+
+
+class RPNOutputs(object):  # RPN predictions for one batch -> losses and proposals
+    def __init__(
+        self,
+        box2box_transform,
+        anchor_matcher,
+        batch_size_per_image,
+        positive_fraction,
+        images,
+        pred_objectness_logits,
+        pred_anchor_deltas,
+        anchors,
+        boundary_threshold=0,
+        gt_boxes=None,
+        smooth_l1_beta=0.0,
+    ):
+        """
+        Args:
+            box2box_transform (Box2BoxTransform): :class:`Box2BoxTransform` instance for
+                anchor-proposal transformations.
+            anchor_matcher (Matcher): :class:`Matcher` instance for matching anchors to
+                ground-truth boxes; used to determine training labels.
+            batch_size_per_image (int): number of proposals to sample when training
+            positive_fraction (float): target fraction of sampled proposals that should be positive
+            images (ImageList): :class:`ImageList` instance representing N input images
+            pred_objectness_logits (list[Tensor]): A list of L elements.
+                Element i is a tensor of shape (N, A, Hi, Wi) representing
+                the predicted objectness logits for anchors.
+            pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
+                (N, A*B, Hi, Wi), B being the box dimension, representing the predicted
+                "deltas" used to transform anchors to proposals.
+            anchors (list[list[Boxes]]): A list of N elements. Each element is a list of L
+                Boxes. The Boxes at (n, l) stores the entire anchor array for feature map l in image
+                n (i.e. the cell anchors repeated over all locations in feature map (n, l)).
+            boundary_threshold (int): if >= 0, then anchors that extend beyond the image
+                boundary by more than boundary_threshold are not used in training. Set to a very
+                large number or < 0 to disable this behavior. Only needed in training.
+            gt_boxes (list[Boxes], optional): A list of N elements. Element i is a Boxes storing
+                the ground-truth ("gt") boxes for image i.
+            smooth_l1_beta (float): The transition point between L1 and L2 loss in
+                the smooth L1 loss function. When set to 0, the loss becomes L1. When
+                set to +inf, the loss becomes constant 0.
+        """
+        self.box2box_transform = box2box_transform
+        self.anchor_matcher = anchor_matcher
+        self.batch_size_per_image = batch_size_per_image
+        self.positive_fraction = positive_fraction
+        self.pred_objectness_logits = pred_objectness_logits
+        self.pred_anchor_deltas = pred_anchor_deltas
+
+        self.anchors = anchors
+        self.gt_boxes = gt_boxes
+        self.num_feature_maps = len(pred_objectness_logits)
+        self.num_images = len(images)
+        self.image_sizes = images.image_sizes
+        self.boundary_threshold = boundary_threshold
+        self.smooth_l1_beta = smooth_l1_beta
+
+    def _get_ground_truth(self):
+        """
+        Returns:
+            gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
+                total number of anchors in image i (i.e., len(anchors[i])). Label values are
+                in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
+            gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
+        """
+        gt_objectness_logits = []
+        gt_anchor_deltas = []
+        # Concatenate anchors from all feature maps into a single Boxes per image
+        anchors = [Boxes.cat(anchors_i) for anchors_i in self.anchors]
+        for image_size_i, anchors_i, gt_boxes_i in zip(self.image_sizes, anchors, self.gt_boxes):
+            """
+            image_size_i: (h, w) for the i-th image
+            anchors_i: anchors for i-th image
+            gt_boxes_i: ground-truth boxes for i-th image
+            """
+            match_quality_matrix = pairwise_iou(gt_boxes_i, anchors_i)
+            matched_idxs, gt_objectness_logits_i = self.anchor_matcher(match_quality_matrix)
+
+            if self.boundary_threshold >= 0:
+                # Label anchors that go out of the boundaries of the image as ignore (-1)
+                # NOTE: This is legacy functionality that is turned off by default in Detectron2
+                anchors_inside_image = anchors_i.inside_box(image_size_i, self.boundary_threshold)
+                gt_objectness_logits_i[~anchors_inside_image] = -1
+
+            if len(gt_boxes_i) == 0:
+                # These values won't be used anyway since the anchor is labeled as background
+                gt_anchor_deltas_i = torch.zeros_like(anchors_i.tensor)
+            else:
+                # TODO wasted computation for ignored boxes
+                matched_gt_boxes = gt_boxes_i[matched_idxs]
+                gt_anchor_deltas_i = self.box2box_transform.get_deltas(
+                    anchors_i.tensor, matched_gt_boxes.tensor
+                )
+
+            gt_objectness_logits.append(gt_objectness_logits_i)
+            gt_anchor_deltas.append(gt_anchor_deltas_i)
+
+        return gt_objectness_logits, gt_anchor_deltas
+
+    def losses(self):
+        """
+        Return the losses from a set of RPN predictions and their associated ground-truth.
+
+        Returns:
+            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
+                Loss names are: `loss_rpn_cls` for objectness classification and
+                `loss_rpn_loc` for proposal localization.
+        """
+
+        def resample(label):
+            """
+            Randomly sample a subset of positive and negative examples by overwriting
+            the label vector to the ignore value (-1) for all elements that are not
+            included in the sample. `label` is modified in place and also returned.
+            """
+            pos_idx, neg_idx = subsample_labels(
+                label, self.batch_size_per_image, self.positive_fraction, 0
+            )
+            # Fill with the ignore label (-1), then set positive and negative labels
+            label.fill_(-1)
+            label.scatter_(0, pos_idx, 1)
+            label.scatter_(0, neg_idx, 0)
+            return label
+
+        gt_objectness_logits, gt_anchor_deltas = self._get_ground_truth()
+        """
+        gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
+            total number of anchors in image i (i.e., len(anchors[i]))
+        gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), B),
+            where B is the box dimension
+        """
+        # Collect all objectness labels and delta targets over feature maps and images
+        # The final ordering is L, N, H, W, A from slowest to fastest axis.
+        num_anchors_per_map = [np.prod(x.shape[1:]) for x in self.pred_objectness_logits]
+        num_anchors_per_image = sum(num_anchors_per_map)
+
+        # Stack to: (N, num_anchors_per_image)
+        gt_objectness_logits = torch.stack(
+            [resample(label) for label in gt_objectness_logits], dim=0
+        )
+
+        # Log the number of positive/negative anchors per-image that's used in training
+        num_pos_anchors = (gt_objectness_logits == 1).sum().item()
+        num_neg_anchors = (gt_objectness_logits == 0).sum().item()
+        storage = get_event_storage()
+        storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images)
+        storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images)
+
+        assert gt_objectness_logits.shape[1] == num_anchors_per_image
+        # Split to tuple of L tensors, each with shape (N, num_anchors_per_map)
+        gt_objectness_logits = torch.split(gt_objectness_logits, num_anchors_per_map, dim=1)
+        # Concat from all feature maps
+        gt_objectness_logits = cat([x.flatten() for x in gt_objectness_logits], dim=0)
+
+        # Stack to: (N, num_anchors_per_image, B)
+        gt_anchor_deltas = torch.stack(gt_anchor_deltas, dim=0)
+        assert gt_anchor_deltas.shape[1] == num_anchors_per_image
+        B = gt_anchor_deltas.shape[2]  # box dimension (4 or 5)
+
+        # Split to tuple of L tensors, each with shape (N, num_anchors_per_map, B)
+        gt_anchor_deltas = torch.split(gt_anchor_deltas, num_anchors_per_map, dim=1)
+        # Concat from all feature maps
+        gt_anchor_deltas = cat([x.reshape(-1, B) for x in gt_anchor_deltas], dim=0)
+
+        # Collect all objectness logits and delta predictions over feature maps
+        # and images to arrive at the same shape as the labels and targets
+        # The final ordering is L, N, H, W, A from slowest to fastest axis.
+        pred_objectness_logits = cat(
+            [
+                # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N*Hi*Wi*A, )
+                x.permute(0, 2, 3, 1).flatten()
+                for x in self.pred_objectness_logits
+            ],
+            dim=0,
+        )
+        pred_anchor_deltas = cat(
+            [
+                # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B)
+                #          -> (N*Hi*Wi*A, B)
+                x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1])
+                .permute(0, 3, 4, 1, 2)
+                .reshape(-1, B)
+                for x in self.pred_anchor_deltas
+            ],
+            dim=0,
+        )
+
+        objectness_loss, localization_loss = rpn_losses(
+            gt_objectness_logits,
+            gt_anchor_deltas,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            self.smooth_l1_beta,
+        )
+        normalizer = 1.0 / (self.batch_size_per_image * self.num_images)  # target sample count
+        loss_cls = objectness_loss * normalizer  # cls: classification loss
+        loss_loc = localization_loss * normalizer  # loc: localization loss
+        losses = {"loss_rpn_cls": loss_cls, "loss_rpn_loc": loss_loc}
+
+        return losses
+
+    def predict_proposals(self):
+        """
+        Transform anchors into proposals by applying the predicted anchor deltas.
+
+        Returns:
+            proposals (list[Tensor]): A list of L tensors. Tensor i has shape
+                (N, Hi*Wi*A, B), where B is box dimension (4 or 5).
+        """
+        proposals = []
+        # Transpose anchors from images-by-feature-maps (N, L) to feature-maps-by-images (L, N)
+        anchors = list(zip(*self.anchors))
+        # For each feature map
+        for anchors_i, pred_anchor_deltas_i in zip(anchors, self.pred_anchor_deltas):
+            B = anchors_i[0].tensor.size(1)
+            N, _, Hi, Wi = pred_anchor_deltas_i.shape
+            # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N*Hi*Wi*A, B)
+            pred_anchor_deltas_i = (
+                pred_anchor_deltas_i.view(N, -1, B, Hi, Wi).permute(0, 3, 4, 1, 2).reshape(-1, B)
+            )
+            # Concatenate all anchors to shape (N*Hi*Wi*A, B)
+            # type(anchors_i[0]) is Boxes (B = 4) or RotatedBoxes (B = 5)
+            anchors_i = type(anchors_i[0]).cat(anchors_i)
+            proposals_i = self.box2box_transform.apply_deltas(
+                pred_anchor_deltas_i, anchors_i.tensor
+            )
+            # Append feature map proposals with shape (N, Hi*Wi*A, B)
+            proposals.append(proposals_i.view(N, -1, B))
+        return proposals
+
+    def predict_objectness_logits(self):
+        """
+        Return objectness logits in the same format as the proposals returned by
+        :meth:`predict_proposals`.
+
+        Returns:
+            pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape
+                (N, Hi*Wi*A).
+        """
+        pred_objectness_logits = [
+            # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
+            score.permute(0, 2, 3, 1).reshape(self.num_images, -1)
+            for score in self.pred_objectness_logits
+        ]
+        return pred_objectness_logits
diff --git a/detectron2/modeling/proposal_generator/rrpn.py b/detectron2/modeling/proposal_generator/rrpn.py
new file mode 100644
index 0000000..effac74
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rrpn.py
@@ -0,0 +1,87 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+from typing import Dict
+import torch
+
+from detectron2.layers import ShapeSpec
+
+from ..box_regression import Box2BoxTransformRotated
+from .build import PROPOSAL_GENERATOR_REGISTRY
+from .rpn import RPN
+from .rrpn_outputs import RRPNOutputs, find_top_rrpn_proposals
+
+logger = logging.getLogger(__name__)
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class RRPN(RPN):
+    """
+    Rotated RPN subnetwork.
+    Please refer to https://arxiv.org/pdf/1703.01086.pdf for the original RRPN paper:
+    Ma, J., Shao, W., Ye, H., Wang, L., Wang, H., Zheng, Y., & Xue, X. (2018).
+    Arbitrary-oriented scene text detection via rotation proposals.
+    IEEE Transactions on Multimedia, 20(11), 3111-3122.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__(cfg, input_shape)
+        self.box2box_transform = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
+
+    def forward(self, images, features, gt_instances=None):
+        """
+        Args:
+            images (ImageList): input images of length `N`
+            features (dict[str: Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
+                Each `Instances` stores ground-truth instances for the corresponding image.
+
+        Returns:
+            proposals: list[Instances] or None
+            loss: dict[Tensor]
+        """
+        gt_boxes = [x.gt_boxes for x in gt_instances] if gt_instances is not None else None
+        del gt_instances
+        features = [features[f] for f in self.in_features]
+        pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
+        anchors = self.anchor_generator(features)
+
+        outputs = RRPNOutputs(
+            self.box2box_transform,
+            self.anchor_matcher,
+            self.batch_size_per_image,
+            self.positive_fraction,
+            images,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            anchors,
+            self.boundary_threshold,
+            gt_boxes,
+            self.smooth_l1_beta,
+        )
+
+        if self.training:
+            losses = outputs.losses()
+        else:
+            losses = {}
+
+        with torch.no_grad():
+            # Find the top proposals by applying NMS and removing boxes that
+            # are too small. The proposals are treated as fixed for approximate
+            # joint training with roi heads. This approach ignores the derivative
+            # w.r.t. the proposal boxes’ coordinates that are also network
+            # responses, so is approximate.
+            proposals = find_top_rrpn_proposals(
+                outputs.predict_proposals(),
+                outputs.predict_objectness_logits(),
+                images,
+                self.nms_thresh,
+                self.pre_nms_topk[self.training],
+                self.post_nms_topk[self.training],
+                self.min_box_side_len,
+                self.training,
+            )
+
+        return proposals, losses
diff --git a/detectron2/modeling/proposal_generator/rrpn_outputs.py b/detectron2/modeling/proposal_generator/rrpn_outputs.py
new file mode 100644
index 0000000..f39fc07
--- /dev/null
+++ b/detectron2/modeling/proposal_generator/rrpn_outputs.py
@@ -0,0 +1,240 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+import logging
+import torch
+
+from detectron2.layers import batched_nms_rotated, cat
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+
+from .rpn_outputs import RPNOutputs
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    L: number of feature maps per image on which RRPN is run
+    A: number of cell anchors (must be the same for all feature maps)
+    Hi, Wi: height and width of the i-th feature map
+    5: size of the box parameterization
+
+Naming convention:
+
+    objectness: refers to the binary classification of an anchor as object vs. not
+    object.
+
+    deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the rotated box2box
+    transform (see :class:`box_regression.Box2BoxTransformRotated`).
+
+    pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
+        sigmoid(pred_objectness_logits) to estimate P(object).
+
+    gt_objectness_logits: ground-truth binary classification labels for objectness
+
+    pred_anchor_deltas: predicted rotated box2box transform deltas
+
+    gt_anchor_deltas: ground-truth rotated box2box transform deltas
+"""
+
+
+def find_top_rrpn_proposals(
+    proposals,
+    pred_objectness_logits,
+    images,
+    nms_thresh,
+    pre_nms_topk,
+    post_nms_topk,
+    min_box_side_len,
+    training,
+):
+    """
+    For each feature map, select the `pre_nms_topk` highest scoring proposals,
+    then, per image: clip the boxes, drop boxes smaller than `min_box_side_len`,
+    run NMS batched by feature level, and keep the first `post_nms_topk`
+    indices returned by NMS over all feature maps combined. The same selection
+    is applied whether or not `training` is True.
+
+    Args:
+        proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
+            All proposal predictions on the feature maps.
+        pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
+        images (ImageList): Input images as an :class:`ImageList`.
+        nms_thresh (float): IoU threshold to use for NMS
+        pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is per
+            feature map.
+        post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
+            When RRPN is run on multiple feature maps (as in FPN) this number is total,
+            over all feature maps.
+        min_box_side_len (float): minimum proposal box side length in pixels (absolute units
+            wrt input images).
+        training (bool): True if proposals are to be used in training, otherwise False.
+            NOTE(review): this argument is not read anywhere in the function body; the
+            Detectron1 legacy behavior it refers to is described in the comment in step 3.
+
+    Returns:
+        proposals (list[Instances]): list of N Instances. The i-th Instances
+            stores at most post_nms_topk object proposals for image i.
+    """
+    image_sizes = images.image_sizes  # in (h, w) order
+    num_images = len(image_sizes)
+    device = proposals[0].device
+
+    # 1. Select top-k anchor for every level and every image
+    topk_scores = []  # #lvl Tensor, each of shape N x topk
+    topk_proposals = []  # #lvl Tensor, each of shape N x topk x 5
+    level_ids = []  # #lvl Tensor, each of shape (topk,)
+    batch_idx = torch.arange(num_images, device=device)
+    for level_id, proposals_i, logits_i in zip(
+        itertools.count(), proposals, pred_objectness_logits
+    ):
+        Hi_Wi_A = logits_i.shape[1]
+        num_proposals_i = min(pre_nms_topk, Hi_Wi_A)
+
+        # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812)
+        # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
+        logits_i, idx = logits_i.sort(descending=True, dim=1)
+        topk_scores_i = logits_i[batch_idx, :num_proposals_i]
+        topk_idx = idx[batch_idx, :num_proposals_i]
+
+        # gather the selected proposals of every image
+        topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx]  # N x topk x 5
+
+        topk_proposals.append(topk_proposals_i)
+        topk_scores.append(topk_scores_i)
+        level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
+
+    # 2. Concat all levels together
+    topk_scores = cat(topk_scores, dim=1)
+    topk_proposals = cat(topk_proposals, dim=1)
+    level_ids = cat(level_ids, dim=0)
+
+    # 3. For each image, run a per-level NMS, and choose topk results.
+    results = []
+    for n, image_size in enumerate(image_sizes):
+        boxes = RotatedBoxes(topk_proposals[n])
+        scores_per_img = topk_scores[n]
+        boxes.clip(image_size)
+
+        # filter empty boxes, using min_box_side_len as the threshold
+        keep = boxes.nonempty(threshold=min_box_side_len)
+        lvl = level_ids  # level id of each remaining box; re-indexed below if any box is dropped
+        if keep.sum().item() != len(boxes):
+            boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], level_ids[keep])
+
+        keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
+        # In Detectron1, there was different behavior during training vs. testing.
+        # (https://github.com/facebookresearch/Detectron/issues/459)
+        # During training, topk is over the proposals from *all* images in the training batch.
+        # During testing, it is over the proposals for each image separately.
+        # As a result, the training behavior becomes batch-dependent,
+        # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
+        # This bug is addressed in Detectron2 to make the behavior independent of batch size.
+        keep = keep[:post_nms_topk]  # at most post_nms_topk proposals per image
+
+        res = Instances(image_size)
+        res.proposal_boxes = boxes[keep]
+        res.objectness_logits = scores_per_img[keep]
+        results.append(res)
+    return results
+
+
+class RRPNOutputs(RPNOutputs):
+    def __init__(
+        self,
+        box2box_transform,
+        anchor_matcher,
+        batch_size_per_image,
+        positive_fraction,
+        images,
+        pred_objectness_logits,
+        pred_anchor_deltas,
+        anchors,
+        boundary_threshold=0,
+        gt_boxes=None,
+        smooth_l1_beta=0.0,
+    ):
+        """
+        Set up the RRPN outputs; every argument is forwarded unchanged to
+        :class:`RPNOutputs`.
+
+        Args:
+            box2box_transform (Box2BoxTransformRotated): transform that maps between
+                anchors and proposals.
+            anchor_matcher (Matcher): assigns anchors to ground-truth boxes, which
+                determines the training labels.
+            batch_size_per_image (int): how many proposals are sampled per image in training.
+            positive_fraction (float): desired share of positives among the sampled proposals.
+            images (ImageList): the N input images.
+            pred_objectness_logits (list[Tensor]): L tensors; the i-th has shape
+                (N, A, Hi, Wi) and holds the predicted objectness logits of the anchors.
+            pred_anchor_deltas (list[Tensor]): L tensors; the i-th has shape
+                (N, A*5, Hi, Wi) and holds the predicted deltas that turn anchors
+                into proposals.
+            anchors (list[list[RotatedBoxes]]): N lists of L RotatedBoxes each. Entry (n, l)
+                is the full anchor array of feature map l in image n, i.e. the cell
+                anchors repeated over every location of that feature map.
+            boundary_threshold (int): when >= 0, anchors extending past the image boundary
+                by more than this amount are ignored in training; a negative or very
+                large value turns the check off. Only relevant in training.
+            gt_boxes (list[RotatedBoxes], optional): N elements; the i-th holds the
+                ground-truth rotated boxes of image i.
+            smooth_l1_beta (float): transition point between the L1 and L2 regimes of the
+                smooth L1 loss; 0 gives pure L1, +inf gives a constant 0 loss.
+        """
+        super().__init__(
+            box2box_transform,
+            anchor_matcher,
+            batch_size_per_image,
+            positive_fraction,
+            images,
+            pred_objectness_logits,
+            pred_anchor_deltas,
+            anchors,
+            boundary_threshold,
+            gt_boxes,
+            smooth_l1_beta,
+        )
+
+    def _get_ground_truth(self):
+        """
+        Build the objectness labels and regression targets of every image.
+
+        Returns:
+            gt_objectness_logits: list of N tensors, one label per anchor of image i;
+                -1 = ignore, 0 = negative class, 1 = positive class.
+            gt_anchor_deltas: list of N tensors of shape (len(anchors[i]), 5).
+        """
+        labels_per_image = []
+        deltas_per_image = []
+        # Merge the per-level anchors of each image into one RotatedBoxes
+        merged_anchors = [RotatedBoxes.cat(per_level) for per_level in self.anchors]
+        for img_size, img_anchors, img_gt in zip(self.image_sizes, merged_anchors, self.gt_boxes):
+            # Per-iteration values:
+            #   img_size: (h, w) of this image
+            #   img_anchors: all anchors of this image
+            #   img_gt: ground-truth boxes of this image
+            iou_matrix = pairwise_iou_rotated(img_gt, img_anchors)
+            matched_idxs, labels = self.anchor_matcher(iou_matrix)
+
+            if self.boundary_threshold >= 0:
+                # Anchors that leave the image by more than the threshold are ignored.
+                # NOTE: legacy functionality, turned off by default in Detectron2
+                inside = img_anchors.inside_box(img_size, self.boundary_threshold)
+                labels[~inside] = -1
+
+            if len(img_gt) > 0:
+                # TODO wasted computation for ignored boxes
+                matched_gt = img_gt[matched_idxs]
+                deltas = self.box2box_transform.get_deltas(
+                    img_anchors.tensor, matched_gt.tensor
+                )
+            else:
+                # No ground truth: every anchor is background, so these zeros go unused
+                deltas = torch.zeros_like(img_anchors.tensor)
+
+            labels_per_image.append(labels)
+            deltas_per_image.append(deltas)
+
+        return labels_per_image, deltas_per_image
diff --git a/detectron2/modeling/roi_heads/__init__.py b/detectron2/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000..42ff4e7
--- /dev/null
+++ b/detectron2/modeling/roi_heads/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head
+from .keypoint_head import ROI_KEYPOINT_HEAD_REGISTRY, build_keypoint_head
+from .mask_head import ROI_MASK_HEAD_REGISTRY, build_mask_head
+from .roi_heads import (
+    ROI_HEADS_REGISTRY,
+    ROIHeads,
+    StandardROIHeads,
+    build_roi_heads,
+    select_foreground_proposals,
+)
+from .rotated_fast_rcnn import RROIHeads
+
+from . import cascade_rcnn  # isort:skip
diff --git a/detectron2/modeling/roi_heads/box_head.py b/detectron2/modeling/roi_heads/box_head.py
new file mode 100644
index 0000000..e73a6a2
--- /dev/null
+++ b/detectron2/modeling/roi_heads/box_head.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.utils.registry import Registry
+
+ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")  # name -> box head class
+ROI_BOX_HEAD_REGISTRY.__doc__ = """
+Registry for box heads, which make box predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+@ROI_BOX_HEAD_REGISTRY.register()
+class FastRCNNConvFCHead(nn.Module):
+    """
+    Box head made of a stack of 3x3 convs (norm + relu after each)
+    followed by a stack of fc layers (relu after each).
+    """
+
+    def __init__(self, cfg, input_shape: ShapeSpec):
+        """
+        Read from ``cfg.MODEL.ROI_BOX_HEAD``:
+            NUM_CONV, NUM_FC: how many conv / fc layers to build
+            CONV_DIM, FC_DIM: output width of each conv / fc layer
+            NORM: normalization used by the conv layers
+        """
+        super().__init__()
+
+        head_cfg = cfg.MODEL.ROI_BOX_HEAD
+        num_conv, conv_dim = head_cfg.NUM_CONV, head_cfg.CONV_DIM
+        num_fc, fc_dim, norm = head_cfg.NUM_FC, head_cfg.FC_DIM, head_cfg.NORM
+        assert num_conv + num_fc > 0
+
+        out_size = (input_shape.channels, input_shape.height, input_shape.width)
+
+        self.conv_norm_relus = []
+        for idx in range(1, num_conv + 1):
+            conv = Conv2d(
+                out_size[0],
+                conv_dim,
+                kernel_size=3,
+                padding=1,
+                bias=not norm,
+                norm=get_norm(norm, conv_dim),
+                activation=F.relu,
+            )
+            self.add_module("conv{}".format(idx), conv)
+            self.conv_norm_relus.append(conv)
+            out_size = (conv_dim,) + out_size[1:]
+
+        self.fcs = []
+        for idx in range(1, num_fc + 1):
+            fc = nn.Linear(np.prod(out_size), fc_dim)
+            self.add_module("fc{}".format(idx), fc)
+            self.fcs.append(fc)
+            out_size = fc_dim
+        self._output_size = out_size
+
+        # Initialize only after every layer has been built: convs first, then fcs.
+        for conv in self.conv_norm_relus:
+            weight_init.c2_msra_fill(conv)
+        for fc in self.fcs:
+            weight_init.c2_xavier_fill(fc)
+
+    def forward(self, x):
+        """Apply the convs, then (if there are fcs) flatten and apply each fc + relu."""
+        for conv in self.conv_norm_relus:
+            x = conv(x)
+        if not self.fcs:
+            return x
+        if x.dim() > 2:
+            x = torch.flatten(x, start_dim=1)
+        for fc in self.fcs:
+            x = F.relu(fc(x))
+        return x
+
+    @property
+    def output_size(self):
+        return self._output_size
+
+
+def build_box_head(cfg, input_shape):
+    """
+    Instantiate the box head registered under `cfg.MODEL.ROI_BOX_HEAD.NAME`.
+    """
+    head_cls = ROI_BOX_HEAD_REGISTRY.get(cfg.MODEL.ROI_BOX_HEAD.NAME)
+    return head_cls(cfg, input_shape)
diff --git a/detectron2/modeling/roi_heads/cascade_rcnn.py b/detectron2/modeling/roi_heads/cascade_rcnn.py
new file mode 100644
index 0000000..6ca6082
--- /dev/null
+++ b/detectron2/modeling/roi_heads/cascade_rcnn.py
@@ -0,0 +1,243 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.autograd.function import Function
+
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs, fast_rcnn_inference
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+
+class _ScaleGradient(Function):  # identity in forward, rescales the gradient in backward
+    @staticmethod
+    def forward(ctx, input, scale):
+        ctx.scale = scale  # remembered for use in backward
+        return input
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return ctx.scale * grad_output, None  # `scale` itself receives no gradient
+
+
+@ROI_HEADS_REGISTRY.register()
+class CascadeROIHeads(StandardROIHeads):  # box head run as a cascade of refinement stages
+    def _init_box_head(self, cfg):
+        # fmt: off
+        pooler_resolution        = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales            = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
+        sampling_ratio           = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type              = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
+        cascade_ious             = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
+        self.num_cascade_stages  = len(cascade_ious)
+        assert len(cascade_bbox_reg_weights) == self.num_cascade_stages
+        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,  \
+            "CascadeROIHeads only support class-agnostic regression now!"
+        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
+        # fmt: on
+
+        in_channels = [self.feature_channels[f] for f in self.in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        self.box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        pooled_shape = ShapeSpec(
+            channels=in_channels, width=pooler_resolution, height=pooler_resolution
+        )
+
+        self.box_head = nn.ModuleList()
+        self.box_predictor = nn.ModuleList()
+        self.box2box_transform = []
+        self.proposal_matchers = []
+        for k in range(self.num_cascade_stages):
+            box_head = build_box_head(cfg, pooled_shape)
+            self.box_head.append(box_head)
+            self.box_predictor.append(
+                FastRCNNOutputLayers(
+                    box_head.output_size, self.num_classes, cls_agnostic_bbox_reg=True
+                )
+            )
+            self.box2box_transform.append(Box2BoxTransform(weights=cascade_bbox_reg_weights[k]))
+
+            if k == 0:
+                # The first matching is done by the matcher of ROIHeads (self.proposal_matcher).
+                self.proposal_matchers.append(None)
+            else:
+                self.proposal_matchers.append(
+                    Matcher([cascade_ious[k]], [0, 1], allow_low_quality_matches=False)
+                )
+
+    def forward(self, images, features, proposals, targets=None):
+        del images  # not used by this head
+        if self.training:
+            proposals = self.label_and_sample_proposals(proposals, targets)
+
+        features_list = [features[f] for f in self.in_features]
+
+        if self.training:
+            # Stages after the first re-match their proposals, so the box head needs targets
+            losses = self._forward_box(features_list, proposals, targets)
+            losses.update(self._forward_mask(features_list, proposals))
+            losses.update(self._forward_keypoint(features_list, proposals))
+            return proposals, losses
+        else:
+            pred_instances = self._forward_box(features_list, proposals)
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def _forward_box(self, features, proposals, targets=None):
+        head_outputs = []
+        image_sizes = [x.image_size for x in proposals]
+        for k in range(self.num_cascade_stages):
+            if k > 0:
+                # The output boxes of the previous stage are the input proposals of the next stage
+                proposals = self._create_proposals_from_boxes(
+                    head_outputs[-1].predict_boxes(), image_sizes
+                )
+                if self.training:
+                    proposals = self._match_and_label_boxes(proposals, k, targets)
+            head_outputs.append(self._run_stage(features, proposals, k))
+
+        if self.training:
+            losses = {}
+            storage = get_event_storage()
+            for stage, output in enumerate(head_outputs):
+                with storage.name_scope("stage{}".format(stage)):
+                    stage_losses = output.losses()
+                losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
+            return losses
+        else:
+            # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
+            scores_per_stage = [h.predict_probs() for h in head_outputs]
+
+            # Average the scores across heads
+            scores = [
+                sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
+                for scores_per_image in zip(*scores_per_stage)
+            ]
+            # Use the boxes of the last head
+            boxes = head_outputs[-1].predict_boxes()
+            pred_instances, _ = fast_rcnn_inference(
+                boxes,
+                scores,
+                image_sizes,
+                self.test_score_thresh,
+                self.test_nms_thresh,
+                self.test_detections_per_img,
+            )
+            return pred_instances
+
+    @torch.no_grad()
+    def _match_and_label_boxes(self, proposals, stage, targets):
+        """
+        Match proposals with groundtruth using the matcher at the given stage, label them as
+        foreground or background, and log the mean per-image fg/bg counts to the event storage.
+
+        Args:
+            proposals (list[Instances]): One Instances for each image, with
+                the field "proposal_boxes".
+            stage (int): the current stage
+            targets (list[Instances]): the ground truth instances
+
+        Returns:
+            list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
+        """
+        num_fg_samples, num_bg_samples = [], []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            # proposal_labels are 0 or 1
+            matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
+            if len(targets_per_image) > 0:
+                gt_classes = targets_per_image.gt_classes[matched_idxs]
+                # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+                gt_classes[proposal_labels == 0] = self.num_classes
+                gt_boxes = targets_per_image.gt_boxes[matched_idxs]
+            else:
+                gt_classes = torch.zeros_like(matched_idxs) + self.num_classes  # all background
+                gt_boxes = Boxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
+                )
+            proposals_per_image.gt_classes = gt_classes
+            proposals_per_image.gt_boxes = gt_boxes
+
+            num_fg_samples.append((proposal_labels == 1).sum().item())
+            num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
+
+        # Log the number of fg/bg samples in each stage
+        storage = get_event_storage()
+        storage.put_scalar(
+            "stage{}/roi_head/num_fg_samples".format(stage),
+            sum(num_fg_samples) / len(num_fg_samples),
+        )
+        storage.put_scalar(
+            "stage{}/roi_head/num_bg_samples".format(stage),
+            sum(num_bg_samples) / len(num_bg_samples),
+        )
+        return proposals
+
+    def _run_stage(self, features, proposals, stage):
+        """
+        Args:
+            features (list[Tensor]): #lvl input features to ROIHeads
+            proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
+            stage (int): the current stage
+
+        Returns:
+            FastRCNNOutputs: the output of this stage
+        """
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        # The original implementation averages the losses among heads,
+        # but scale up the parameter gradients of the heads.
+        # This is equivalent to adding the losses among heads,
+        # but scale down the gradients on features.
+        box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
+        box_features = self.box_head[stage](box_features)
+        pred_class_logits, pred_proposal_deltas = self.box_predictor[stage](box_features)
+        del box_features  # drop the local reference once the predictions exist
+
+        outputs = FastRCNNOutputs(
+            self.box2box_transform[stage],
+            pred_class_logits,
+            pred_proposal_deltas,
+            proposals,
+            self.smooth_l1_beta,
+        )
+        return outputs
+
+    def _create_proposals_from_boxes(self, boxes, image_sizes):
+        """
+        Args:
+            boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
+            image_sizes (list[tuple]): list of image shapes in (h, w)
+
+        Returns:
+            list[Instances]: per-image proposals with the given boxes.
+        """
+        # Just like RPN, the proposals should not have gradients
+        boxes = [Boxes(b.detach()) for b in boxes]
+        proposals = []
+        for boxes_per_image, image_size in zip(boxes, image_sizes):
+            boxes_per_image.clip(image_size)
+            if self.training:
+                # do not filter empty boxes at inference time,
+                # because the scores from each stage need to be aligned and added later
+                boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
+            prop = Instances(image_size)
+            prop.proposal_boxes = boxes_per_image
+            proposals.append(prop)
+        return proposals
diff --git a/detectron2/modeling/roi_heads/fast_rcnn.py b/detectron2/modeling/roi_heads/fast_rcnn.py
new file mode 100644
index 0000000..1342edf
--- /dev/null
+++ b/detectron2/modeling/roi_heads/fast_rcnn.py
@@ -0,0 +1,352 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import torch
+from fvcore.nn import smooth_l1_loss
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import batched_nms, cat
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.events import get_event_storage
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g., there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransform`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth box2box transform deltas
+"""
+
+
+def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image):
+    """
+    Call `fast_rcnn_inference_single_image` for all images.
+
+    Args:
+        boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
+            boxes for each image. Element i has shape (Ri, K * 4) if doing
+            class-specific regression, or (Ri, 4) if doing class-agnostic
+            regression, where Ri is the number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputs.predict_boxes`.
+        scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
+            Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+            for image i. Compatible with the output of :meth:`FastRCNNOutputs.predict_probs`.
+        image_shapes (list[tuple]): A list of (height, width) tuples for each image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confident detections.
+        kept_indices: (list[Tensor]): A list of N 1D tensors; element i holds the indices
+            in [0, Ri) of the input boxes/scores that were kept for image i.
+    """
+    result_per_image = [
+        fast_rcnn_inference_single_image(
+            boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
+    ]
+    return tuple(list(x) for x in zip(*result_per_image))
+
+
+def fast_rcnn_inference_single_image(
+    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Single-image inference. Return bounding-box detection results by thresholding
+    on scores and applying non-maximum suppression (NMS).
+
+    Args:
+        Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
+        per image.
+
+    Returns:
+        Same as `fast_rcnn_inference`, but for only one image.
+    """
+    scores = scores[:, :-1]
+    num_bbox_reg_classes = boxes.shape[1] // 4
+    # Convert to Boxes to use the `clip` function ...
+    boxes = Boxes(boxes.reshape(-1, 4))
+    boxes.clip(image_shape)
+    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4
+
+    # Filter results based on detection scores
+    filter_mask = scores > score_thresh  # R x K
+    # R' x 2. First column contains indices of the R predictions;
+    # Second column contains indices of classes.
+    filter_inds = filter_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        boxes = boxes[filter_inds[:, 0], 0]
+    else:
+        boxes = boxes[filter_mask]
+    scores = scores[filter_mask]
+
+    # Apply per-class NMS
+    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        keep = keep[:topk_per_image]
+    boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = Boxes(boxes)
+    result.scores = scores
+    result.pred_classes = filter_inds[:, 1]
+    return result, filter_inds[:, 0]
+
+
+class FastRCNNOutputs(object):
+    """
+    A class that stores information about outputs of a Fast R-CNN head.
+    """
+
+    def __init__(
+        self, box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta
+    ):
+        """
+        Args:
+            box2box_transform (Box2BoxTransform/Box2BoxTransformRotated):
+                box2box transform instance for proposal-to-detection transformations.
+            pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class
+                logits for all R predicted object instances.
+                Each row corresponds to a predicted object instance.
+            pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for
+                class-specific or class-agnostic regression. It stores the predicted deltas that
+                transform proposals into final box detections.
+                B is the box dimension (4 or 5).
+                When B is 4, each row is [dx, dy, dw, dh (, ....)].
+                When B is 5, each row is [dx, dy, dw, dh, da (, ....)].
+            proposals (list[Instances]): A list of N Instances, where Instances i stores the
+                proposals for image i, in the field "proposal_boxes".
+                When training, each Instances must have ground-truth labels
+                stored in the field "gt_classes" and "gt_boxes".
+            smooth_l1_beta (float): The transition point between L1 and L2 loss in
+                the smooth L1 loss function. When set to 0, the loss becomes L1. When
+                set to +inf, the loss becomes constant 0.
+        """
+        self.box2box_transform = box2box_transform
+        self.num_preds_per_image = [len(p) for p in proposals]
+        self.pred_class_logits = pred_class_logits
+        self.pred_proposal_deltas = pred_proposal_deltas
+        self.smooth_l1_beta = smooth_l1_beta
+
+        box_type = type(proposals[0].proposal_boxes)
+        # cat(..., dim=0) concatenates over all images in the batch
+        self.proposals = box_type.cat([p.proposal_boxes for p in proposals])
+        assert not self.proposals.tensor.requires_grad, "Proposals should not require gradients!"
+        self.image_shapes = [x.image_size for x in proposals]
+
+        # The following fields should exist only when training.
+        if proposals[0].has("gt_boxes"):
+            self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals])
+            assert proposals[0].has("gt_classes")
+            self.gt_classes = cat([p.gt_classes for p in proposals], dim=0)
+
+    def _log_accuracy(self):
+        """
+        Log the accuracy metrics to EventStorage.
+        """
+        num_instances = self.gt_classes.numel()
+        pred_classes = self.pred_class_logits.argmax(dim=1)
+        bg_class_ind = self.pred_class_logits.shape[1] - 1
+
+        fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)
+        num_fg = fg_inds.nonzero().numel()
+        fg_gt_classes = self.gt_classes[fg_inds]
+        fg_pred_classes = pred_classes[fg_inds]
+
+        num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
+        num_accurate = (pred_classes == self.gt_classes).nonzero().numel()
+        fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()
+
+        storage = get_event_storage()
+        storage.put_scalar("fast_rcnn/cls_accuracy", num_accurate / num_instances)
+        if num_fg > 0:
+            storage.put_scalar("fast_rcnn/fg_cls_accuracy", fg_num_accurate / num_fg)
+            storage.put_scalar("fast_rcnn/false_negative", num_false_negative / num_fg)
+
+    def softmax_cross_entropy_loss(self):
+        """
+        Compute the softmax cross entropy loss for box classification.
+
+        Returns:
+            scalar Tensor
+        """
+        self._log_accuracy()
+        return F.cross_entropy(self.pred_class_logits, self.gt_classes, reduction="mean")
+
+    def smooth_l1_loss(self):
+        """
+        Compute the smooth L1 loss for box regression.
+
+        Returns:
+            scalar Tensor
+        """
+        gt_proposal_deltas = self.box2box_transform.get_deltas(
+            self.proposals.tensor, self.gt_boxes.tensor
+        )
+        box_dim = gt_proposal_deltas.size(1)  # 4 or 5
+        cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim
+        device = self.pred_proposal_deltas.device
+
+        bg_class_ind = self.pred_class_logits.shape[1] - 1
+
+        # Box delta loss is only computed between the prediction for the gt class k
+        # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions
+        # for non-gt classes and background.
+        # Empty fg_inds produces a valid loss of zero as long as smooth_l1_loss is
+        # called with reduction="sum", as it is below (a "mean" reduction would use
+        # torch.mean internally and produce a nan loss).
+        fg_inds = torch.nonzero((self.gt_classes >= 0) & (self.gt_classes < bg_class_ind)).squeeze(
+            1
+        )
+        if cls_agnostic_bbox_reg:
+            # pred_proposal_deltas only corresponds to foreground class for agnostic
+            gt_class_cols = torch.arange(box_dim, device=device)
+        else:
+            fg_gt_classes = self.gt_classes[fg_inds]
+            # pred_proposal_deltas for class k are located in columns [b * k : b * k + b],
+            # where b is the dimension of box representation (4 or 5)
+            # Note that compared to Detectron1,
+            # we do not perform bounding box regression for background classes.
+            gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim, device=device)
+
+        loss_box_reg = smooth_l1_loss(
+            self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols],
+            gt_proposal_deltas[fg_inds],
+            self.smooth_l1_beta,
+            reduction="sum",
+        )
+        # The loss is normalized using the total number of regions (R), not the number
+        # of foreground regions even though the box regression loss is only defined on
+        # foreground regions. Why? Because doing so gives equal training influence to
+        # each foreground example. To see how, consider two different minibatches:
+        #  (1) Contains a single foreground region
+        #  (2) Contains 100 foreground regions
+        # If we normalize by the number of foreground regions, the single example in
+        # minibatch (1) will be given 100 times as much influence as each foreground
+        # example in minibatch (2). Normalizing by the total number of regions, R,
+        # means that the single example in minibatch (1) and each of the 100 examples
+        # in minibatch (2) are given equal influence.
+        loss_box_reg = loss_box_reg / self.gt_classes.numel()
+        return loss_box_reg
+
+    def losses(self):
+        """
+        Compute the default losses for box head in Fast(er) R-CNN,
+        with softmax cross entropy loss and smooth L1 loss.
+
+        Returns:
+            A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg".
+        """
+        return {
+            "loss_cls": self.softmax_cross_entropy_loss(),
+            "loss_box_reg": self.smooth_l1_loss(),
+        }
+
+    def predict_boxes(self):
+        """
+        Returns:
+            list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes
+                for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
+                the number of predicted objects for image i and B is the box dimension (4 or 5)
+        """
+        num_pred = len(self.proposals)
+        B = self.proposals.tensor.shape[1]
+        K = self.pred_proposal_deltas.shape[1] // B
+        boxes = self.box2box_transform.apply_deltas(
+            self.pred_proposal_deltas.view(num_pred * K, B),
+            self.proposals.tensor.unsqueeze(1).expand(num_pred, K, B).reshape(-1, B),
+        )
+        return boxes.view(num_pred, K * B).split(self.num_preds_per_image, dim=0)
+
+    def predict_probs(self):
+        """
+        Returns:
+            list[Tensor]: A list of Tensors of predicted class probabilities for each image.
+                Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
+                for image i.
+        """
+        probs = F.softmax(self.pred_class_logits, dim=-1)
+        return probs.split(self.num_preds_per_image, dim=0)
+
+    def inference(self, score_thresh, nms_thresh, topk_per_image):
+        """
+        Args:
+            score_thresh (float): same as fast_rcnn_inference.
+            nms_thresh (float): same as fast_rcnn_inference.
+            topk_per_image (int): same as fast_rcnn_inference.
+        Returns:
+            list[Instances]: same as fast_rcnn_inference.
+            list[Tensor]: same as fast_rcnn_inference.
+        """
+        boxes = self.predict_boxes()
+        scores = self.predict_probs()
+        image_shapes = self.image_shapes
+
+        return fast_rcnn_inference(
+            boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image
+        )
+
+
+class FastRCNNOutputLayers(nn.Module):
+    """
+    Two linear layers for predicting Fast R-CNN outputs:
+      (1) proposal-to-detection box regression deltas
+      (2) classification scores
+    """
+
+    def __init__(self, input_size, num_classes, cls_agnostic_bbox_reg, box_dim=4):
+        """
+        Args:
+            input_size (int): channels, or (channels, height, width)
+            num_classes (int): number of foreground classes
+            cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
+            box_dim (int): the dimension of bounding boxes.
+                Example box dimensions: 4 for regular XYXY boxes and 5 for rotated XYWHA boxes
+        """
+        super(FastRCNNOutputLayers, self).__init__()
+
+        if not isinstance(input_size, int):
+            input_size = np.prod(input_size)
+
+        # The prediction layer for num_classes foreground classes and one background class
+        # (hence + 1)
+        self.cls_score = nn.Linear(input_size, num_classes + 1)
+        num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
+        self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)
+
+        nn.init.normal_(self.cls_score.weight, std=0.01)
+        nn.init.normal_(self.bbox_pred.weight, std=0.001)
+        for l in [self.cls_score, self.bbox_pred]:
+            nn.init.constant_(l.bias, 0)
+
+    def forward(self, x):
+        if x.dim() > 2:
+            x = torch.flatten(x, start_dim=1)
+        scores = self.cls_score(x)
+        proposal_deltas = self.bbox_pred(x)
+        return scores, proposal_deltas
diff --git a/detectron2/modeling/roi_heads/keypoint_head.py b/detectron2/modeling/roi_heads/keypoint_head.py
new file mode 100644
index 0000000..0e78efe
--- /dev/null
+++ b/detectron2/modeling/roi_heads/keypoint_head.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, interpolate
+from detectron2.structures import heatmaps_to_keypoints
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+_TOTAL_SKIPPED = 0
+
+ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
+ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
+Registry for keypoint heads, which make keypoint predictions from per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def build_keypoint_head(cfg, input_shape):
+    """
+    Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
+    return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)
+
+
+def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
+    """
+    Arguments:
+        pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
+            of instances in the batch, K is the number of keypoints, and S is the side length
+            of the keypoint heatmap. The values are spatial logits.
+        instances (list[Instances]): A list of M Instances, where M is the batch size.
+            These instances are predictions from the model
+            that are in 1:1 correspondence with pred_keypoint_logits.
+            Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
+            instance.
+        normalizer (float or None): Normalize the loss by this amount.
+            If None, we normalize by the number of visible keypoints in the minibatch.
+
+    Returns a scalar tensor containing the loss.
+    """
+    heatmaps = []
+    valid = []
+
+    keypoint_side_len = pred_keypoint_logits.shape[2]
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        keypoints = instances_per_image.gt_keypoints
+        heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
+            instances_per_image.proposal_boxes.tensor, keypoint_side_len
+        )
+        heatmaps.append(heatmaps_per_image.view(-1))
+        valid.append(valid_per_image.view(-1))
+
+    if len(heatmaps):
+        keypoint_targets = cat(heatmaps, dim=0)
+        valid = cat(valid, dim=0).to(dtype=torch.uint8)
+        valid = torch.nonzero(valid).squeeze(1)
+
+    # torch.mean (in binary_cross_entropy_with_logits) doesn't
+    # accept empty tensors, so handle it separately
+    if len(heatmaps) == 0 or valid.numel() == 0:
+        global _TOTAL_SKIPPED
+        _TOTAL_SKIPPED += 1
+        storage = get_event_storage()
+        storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
+        return pred_keypoint_logits.sum() * 0
+
+    N, K, H, W = pred_keypoint_logits.shape
+    pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)
+
+    keypoint_loss = F.cross_entropy(
+        pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
+    )
+
+    # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
+    if normalizer is None:
+        normalizer = valid.numel()
+    keypoint_loss /= normalizer
+
+    return keypoint_loss
+
+
+def keypoint_rcnn_inference(pred_keypoint_logits, pred_instances):
+    """
+    Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score, prob)
+        and add its (x, y, prob) columns to the `pred_instances` as a `pred_keypoints` field.
+
+    Args:
+        pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
+           of instances in the batch, K is the number of keypoints, and S is the side length of
+           the keypoint heatmap. The values are spatial logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images.
+
+    Returns:
+        None. Each Instances in pred_instances will contain an extra "pred_keypoints" field.
+            The field is a tensor of shape (#instance, K, 3) where the last
+            dimension corresponds to (x, y, probability).
+    """
+    # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
+    bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)
+
+    keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits.detach(), bboxes_flat.detach())
+    num_instances_per_image = [len(i) for i in pred_instances]
+    keypoint_results = keypoint_results.split(num_instances_per_image, dim=0)
+
+    for keypoint_results_per_image, instances_per_image in zip(keypoint_results, pred_instances):
+        # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score, prob)
+        keypoint_xyp = keypoint_results_per_image[:, :, [0, 1, 3]]
+        instances_per_image.pred_keypoints = keypoint_xyp
+
+
+@ROI_KEYPOINT_HEAD_REGISTRY.register()
+class KRCNNConvDeconvUpsampleHead(nn.Module):
+    """
+    A standard keypoint head containing a series of 3x3 convs, followed by
+    a transpose convolution and bilinear interpolation for upsampling.
+    """
+
+    def __init__(self, cfg, input_shape: ShapeSpec):
+        """
+        The following attributes are parsed from config:
+            conv_dims: an iterable of output channel counts for each conv in the head
+                         e.g. (512, 512, 512) for three convs outputting 512 channels.
+            num_keypoints: number of keypoint heatmaps to predict, determines the number of
+                           channels in the final output.
+        """
+        super(KRCNNConvDeconvUpsampleHead, self).__init__()
+
+        # fmt: off
+        # default up_scale to 2 (this can eventually be moved to config)
+        up_scale      = 2
+        conv_dims     = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
+        num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS
+        in_channels   = input_shape.channels
+        # fmt: on
+
+        self.blocks = []
+        for idx, layer_channels in enumerate(conv_dims, 1):
+            module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
+            self.add_module("conv_fcn{}".format(idx), module)
+            self.blocks.append(module)
+            in_channels = layer_channels
+
+        deconv_kernel = 4
+        self.score_lowres = ConvTranspose2d(
+            in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
+        )
+        self.up_scale = up_scale
+
+        for name, param in self.named_parameters():
+            if "bias" in name:
+                nn.init.constant_(param, 0)
+            elif "weight" in name:
+                # Caffe2 implementation uses MSRAFill, which in fact
+                # corresponds to kaiming_normal_ in PyTorch
+                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+    def forward(self, x):
+        for layer in self.blocks:
+            x = F.relu(layer(x))
+        x = self.score_lowres(x)
+        x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
+        return x
diff --git a/detectron2/modeling/roi_heads/mask_head.py b/detectron2/modeling/roi_heads/mask_head.py
new file mode 100644
index 0000000..1702973
--- /dev/null
+++ b/detectron2/modeling/roi_heads/mask_head.py
@@ -0,0 +1,206 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
+ROI_MASK_HEAD_REGISTRY.__doc__ = """
+Registry for mask heads, which predicts instance masks given
+per-region features.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+"""
+
+
+def mask_rcnn_loss(pred_mask_logits, instances):
+    """
+    Compute the mask prediction loss defined in the Mask R-CNN paper.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. These instances are in 1:1
+            correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
+            ...) associated with each instance are stored in fields.
+
+    Returns:
+        mask_loss (Tensor): A scalar tensor containing the loss.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+    total_num_masks = pred_mask_logits.size(0)
+    mask_side_len = pred_mask_logits.size(2)
+    assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"
+
+    gt_classes = []
+    gt_masks = []
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        if not cls_agnostic_mask:
+            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
+            gt_classes.append(gt_classes_per_image)
+
+        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
+            instances_per_image.proposal_boxes.tensor, mask_side_len
+        ).to(device=pred_mask_logits.device)
+        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
+        gt_masks.append(gt_masks_per_image)
+
+    if len(gt_masks) == 0:
+        return pred_mask_logits.sum() * 0
+
+    gt_masks = cat(gt_masks, dim=0)
+
+    if cls_agnostic_mask:
+        pred_mask_logits = pred_mask_logits[:, 0]
+    else:
+        indices = torch.arange(total_num_masks)
+        gt_classes = cat(gt_classes, dim=0)
+        pred_mask_logits = pred_mask_logits[indices, gt_classes]
+
+    if gt_masks.dtype == torch.bool:
+        gt_masks_bool = gt_masks
+    else:
+        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
+        gt_masks_bool = gt_masks > 0.5
+
+    # Log the training accuracy (using gt classes and 0.5 threshold)
+    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
+    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
+    num_positive = gt_masks_bool.sum().item()
+    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
+        gt_masks_bool.numel() - num_positive, 1.0
+    )
+    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
+
+    storage = get_event_storage()
+    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
+    storage.put_scalar("mask_rcnn/false_positive", false_positive)
+    storage.put_scalar("mask_rcnn/false_negative", false_negative)
+
+    mask_loss = F.binary_cross_entropy_with_logits(
+        pred_mask_logits, gt_masks.to(dtype=torch.float32), reduction="mean"
+    )
+    return mask_loss
+
+
+def mask_rcnn_inference(pred_mask_logits, pred_instances):
+    """
+    Convert pred_mask_logits to estimated foreground probability masks while also
+    extracting only the masks for the predicted classes in pred_instances. For each
+    predicted box, the mask of the same class is attached to the instance by adding a
+    new "pred_masks" field to pred_instances.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Each Instances must have field "pred_classes".
+
+    Returns:
+        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
+            Wmask) for the predicted class. Note that the masks are returned as soft (non-quantized)
+            masks at the resolution predicted by the network; post-processing steps, such as
+            resizing the predicted masks to the original image resolution and/or binarizing them,
+            are left to the caller.
+    """
+    cls_agnostic_mask = pred_mask_logits.size(1) == 1
+
+    if cls_agnostic_mask:
+        mask_probs_pred = pred_mask_logits.sigmoid()
+    else:
+        # Select masks corresponding to the predicted classes
+        num_masks = pred_mask_logits.shape[0]
+        class_pred = cat([i.pred_classes for i in pred_instances])
+        indices = torch.arange(num_masks, device=class_pred.device)
+        mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
+    # mask_probs_pred.shape: (B, 1, Hmask, Wmask)
+
+    num_boxes_per_image = [len(i) for i in pred_instances]
+    mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)
+
+    for prob, instances in zip(mask_probs_pred, pred_instances):
+        instances.pred_masks = prob  # (1, Hmask, Wmask)
+
+
+@ROI_MASK_HEAD_REGISTRY.register()
+class MaskRCNNConvUpsampleHead(nn.Module):
+    """
+    A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
+    """
+
+    def __init__(self, cfg, input_shape: ShapeSpec):
+        """
+        The following attributes are parsed from config:
+            num_conv: the number of conv layers
+            conv_dim: the dimension of the conv layers
+            norm: normalization for the conv layers
+        """
+        super(MaskRCNNConvUpsampleHead, self).__init__()
+
+        # fmt: off
+        num_classes       = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        conv_dims         = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+        self.norm         = cfg.MODEL.ROI_MASK_HEAD.NORM
+        num_conv          = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
+        input_channels    = input_shape.channels
+        cls_agnostic_mask = cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK
+        # fmt: on
+
+        self.conv_norm_relus = []
+
+        for k in range(num_conv):
+            conv = Conv2d(
+                input_channels if k == 0 else conv_dims,
+                conv_dims,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=not self.norm,
+                norm=get_norm(self.norm, conv_dims),
+                activation=F.relu,
+            )
+            self.add_module("mask_fcn{}".format(k + 1), conv)
+            self.conv_norm_relus.append(conv)
+
+        self.deconv = ConvTranspose2d(
+            conv_dims if num_conv > 0 else input_channels,
+            conv_dims,
+            kernel_size=2,
+            stride=2,
+            padding=0,
+        )
+
+        num_mask_classes = 1 if cls_agnostic_mask else num_classes
+        self.predictor = Conv2d(conv_dims, num_mask_classes, kernel_size=1, stride=1, padding=0)
+
+        for layer in self.conv_norm_relus + [self.deconv]:
+            weight_init.c2_msra_fill(layer)
+        # use normal distribution initialization for mask prediction layer
+        nn.init.normal_(self.predictor.weight, std=0.001)
+        if self.predictor.bias is not None:
+            nn.init.constant_(self.predictor.bias, 0)
+
+    def forward(self, x):
+        for layer in self.conv_norm_relus:
+            x = layer(x)
+        x = F.relu(self.deconv(x))
+        return self.predictor(x)
+
+
+def build_mask_head(cfg, input_shape):
+    """
+    Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
+    """
+    name = cfg.MODEL.ROI_MASK_HEAD.NAME
+    return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
diff --git a/detectron2/modeling/roi_heads/roi_heads.py b/detectron2/modeling/roi_heads/roi_heads.py
new file mode 100644
index 0000000..9058745
--- /dev/null
+++ b/detectron2/modeling/roi_heads/roi_heads.py
@@ -0,0 +1,711 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+from typing import Dict
+import torch
+from torch import nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.structures import Boxes, Instances, pairwise_iou
+from detectron2.utils.events import get_event_storage
+from detectron2.utils.registry import Registry
+
+from ..backbone.resnet import BottleneckBlock, make_stage
+from ..box_regression import Box2BoxTransform
+from ..matcher import Matcher
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from ..sampling import subsample_labels
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
+from .keypoint_head import build_keypoint_head, keypoint_rcnn_inference, keypoint_rcnn_loss
+from .mask_head import build_mask_head, mask_rcnn_inference, mask_rcnn_loss
+
+ROI_HEADS_REGISTRY = Registry("ROI_HEADS")  # filled by @ROI_HEADS_REGISTRY.register()
+ROI_HEADS_REGISTRY.__doc__ = """
+Registry for ROI heads in a generalized R-CNN model.
+ROIHeads take feature maps and region proposals, and
+perform per-region computation.
+
+The registered object will be called with `obj(cfg, input_shape)`.
+The call is expected to return an :class:`ROIHeads`.
+"""
+
+logger = logging.getLogger(__name__)
+
+
+def build_roi_heads(cfg, input_shape):
+    """
+    Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
+    """
+    name = cfg.MODEL.ROI_HEADS.NAME
+    return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape)
+
+
+def select_foreground_proposals(proposals, bg_label):
+    """
+    Given a list of N Instances (for N images), each containing a `gt_classes` field,
+    return a list of Instances that contain only instances with `gt_classes != -1 &&
+    gt_classes != bg_label`.
+
+    Args:
+        proposals (list[Instances]): A list of N Instances, where N is the number of
+            images in the batch.
+        bg_label: label index of background class.
+
+    Returns:
+        list[Instances]: N Instances, each contains only the selected foreground instances.
+        list[Tensor]: N boolean vector, correspond to the selection mask of
+            each Instances object. True for selected instances.
+    """
+    assert isinstance(proposals, (list, tuple))
+    assert isinstance(proposals[0], Instances)
+    assert proposals[0].has("gt_classes")
+    fg_proposals = []
+    fg_selection_masks = []
+    for proposals_per_image in proposals:
+        gt_classes = proposals_per_image.gt_classes
+        fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label)
+        fg_idxs = fg_selection_mask.nonzero().squeeze(1)
+        fg_proposals.append(proposals_per_image[fg_idxs])
+        fg_selection_masks.append(fg_selection_mask)
+    return fg_proposals, fg_selection_masks
+
+
+def select_proposals_with_visible_keypoints(proposals):
+    """
+    Args:
+        proposals (list[Instances]): a list of N Instances, where N is the
+            number of images.
+
+    Returns:
+        proposals: only contains proposals with at least one visible keypoint.
+
+    Note that this is still slightly different from Detectron.
+    In Detectron, proposals for training keypoint head are re-sampled from
+    all the proposals with IOU>threshold & >=1 visible keypoint.
+
+    Here, the proposals are first sampled from all proposals with
+    IOU>threshold, then proposals with no visible keypoint are filtered out.
+    This strategy seems to make no difference on Detectron and is easier to implement.
+    """
+    ret = []
+    all_num_fg = []
+    for proposals_per_image in proposals:
+        # If empty/unannotated image (hard negatives), skip filtering for train
+        if len(proposals_per_image) == 0:
+            ret.append(proposals_per_image)
+            continue
+        gt_keypoints = proposals_per_image.gt_keypoints.tensor
+        # #fg x K x 3
+        vis_mask = gt_keypoints[:, :, 2] >= 1
+        xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1]
+        proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1)  # #fg x 1 x 4
+        kp_in_box = (
+            (xs >= proposal_boxes[:, :, 0])
+            & (xs <= proposal_boxes[:, :, 2])
+            & (ys >= proposal_boxes[:, :, 1])
+            & (ys <= proposal_boxes[:, :, 3])
+        )
+        selection = (kp_in_box & vis_mask).any(dim=1)
+        selection_idxs = torch.nonzero(selection).squeeze(1)
+        all_num_fg.append(selection_idxs.numel())
+        ret.append(proposals_per_image[selection_idxs])
+
+    storage = get_event_storage()
+    storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg))
+    return ret
+
+
+class ROIHeads(torch.nn.Module):
+    """
+    ROIHeads perform all per-region computation in an R-CNN.
+
+    It contains the logic for cropping the regions, extracting per-region features,
+    and making per-region predictions.
+
+    It can have many variants, implemented as subclasses of this class.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super(ROIHeads, self).__init__()
+        # Cache config values and per-feature-map stride / channel counts for the subclasses.
+        # fmt: off
+        self.batch_size_per_image     = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
+        self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
+        self.test_score_thresh        = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
+        self.test_nms_thresh          = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
+        self.test_detections_per_img  = cfg.TEST.DETECTIONS_PER_IMAGE
+        self.in_features              = cfg.MODEL.ROI_HEADS.IN_FEATURES
+        self.num_classes              = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        self.proposal_append_gt       = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT
+        self.feature_strides          = {k: v.stride for k, v in input_shape.items()}
+        self.feature_channels         = {k: v.channels for k, v in input_shape.items()}
+        self.cls_agnostic_bbox_reg    = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG
+        self.smooth_l1_beta           = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
+        # fmt: on
+
+        # Matcher to assign box proposals to gt boxes
+        self.proposal_matcher = Matcher(
+            cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS,
+            cfg.MODEL.ROI_HEADS.IOU_LABELS,
+            allow_low_quality_matches=False,
+        )
+
+        # Box2BoxTransform for bounding box regression
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
+
+    def _sample_proposals(self, matched_idxs, matched_labels, gt_classes):
+        """
+        Based on the matching between N proposals and M groundtruth,
+        sample the proposals and set their classification labels.
+
+        Args:
+            matched_idxs (Tensor): a vector of length N, each is the best-matched
+                gt index in [0, M) for each proposal.
+            matched_labels (Tensor): a vector of length N, the matcher's label
+                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+            gt_classes (Tensor): a vector of length M.
+
+        Returns:
+            Tensor: a vector of indices of sampled proposals. Each is in [0, N).
+            Tensor: a vector of the same length, the classification label for
+                each sampled proposal. Each sample is labeled as either a category in
+                [0, num_classes) or the background (num_classes).
+        """
+        has_gt = gt_classes.numel() > 0
+        # Get the corresponding GT for each proposal
+        if has_gt:
+            gt_classes = gt_classes[matched_idxs]
+            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+            gt_classes[matched_labels == 0] = self.num_classes
+            # Label ignore proposals (-1 label)
+            gt_classes[matched_labels == -1] = -1
+        else:
+            gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
+
+        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+            gt_classes, self.batch_size_per_image, self.positive_sample_fraction, self.num_classes
+        )
+
+        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+        return sampled_idxs, gt_classes[sampled_idxs]
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals, targets):
+        """
+        Prepare some proposals to be used to train the ROI heads.
+        It performs box matching between `proposals` and `targets`, and assigns
+        training labels to the proposals.
+        It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes,
+        with a fraction of positives that is no larger than `self.positive_sample_fraction`.
+
+        Args:
+            See :meth:`ROIHeads.forward`
+
+        Returns:
+            list[Instances]:
+                length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+                - proposal_boxes: the proposal boxes
+                - gt_boxes: the ground-truth box that the proposal is assigned to
+                  (only meaningful for foreground proposals; a background proposal gets the
+                   box of whichever gt it was matched to, or zeros if the image has no gt)
+                Other fields such as "gt_classes", "gt_masks", that are included in `targets`.
+        """
+        gt_boxes = [x.gt_boxes for x in targets]
+        # Augment proposals with ground-truth boxes.
+        # In the case of learned proposals (e.g., RPN), when training starts
+        # the proposals will be low quality due to random initialization.
+        # It's possible that none of these initial
+        # proposals have high enough overlap with the gt objects to be used
+        # as positive examples for the second stage components (box head,
+        # cls head, mask head). Adding the gt boxes to the set of proposals
+        # ensures that the second stage components will have some positive
+        # examples from the start of training. For RPN, this augmentation improves
+        # convergence and empirically improves box AP on COCO by about 0.5
+        # points (under one tested configuration).
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            # We index all the attributes of targets that start with "gt_"
+            # and have not been added to proposals yet (="gt_classes").
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                # NOTE: here the indexing wastes some compute, because heads
+                # like masks, keypoints, etc, will filter the proposals again,
+                # (by foreground/background, or number of keypoints in the image, etc)
+                # so we essentially index the data twice.
+                for (trg_name, trg_value) in targets_per_image.get_fields().items():
+                    if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
+                        proposals_per_image.set(trg_name, trg_value[sampled_targets])
+            else:
+                gt_boxes = Boxes(
+                    targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4))
+                )
+                proposals_per_image.gt_boxes = gt_boxes
+
+            num_bg_samples.append((gt_classes == self.num_classes).sum().item())
+            num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
+            proposals_with_gt.append(proposals_per_image)
+
+        # Log the number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
+
+        return proposals_with_gt
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        Args:
+            images (ImageList):
+            features (dict[str: Tensor]): input data as a mapping from feature
+                map name to tensor. Axis 0 represents the number of images `N` in
+                the input data; axes 1-3 are channels, height, and width, which may
+                vary between feature maps (e.g., if a feature pyramid is used).
+            proposals (list[Instances]): length `N` list of `Instances`s. The i-th
+                `Instances` contains object proposals for the i-th input image,
+                with fields "proposal_boxes" and "objectness_logits".
+            targets (list[Instances], optional): length `N` list of `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+                It may have the following fields:
+                - gt_boxes: the bounding box of each instance.
+                - gt_classes: the label for each instance with a category ranging in [0, #class].
+                - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance.
+                - gt_keypoints: NxKx3, the ground-truth keypoints for each instance.
+
+        Returns:
+            results (list[Instances]): length `N` list of `Instances`s containing the
+                detected instances. Returned during inference only; may be []
+                during training.
+            losses (dict[str: Tensor]): mapping from a named loss to a tensor
+                storing the loss. Used during training only.
+        """
+        raise NotImplementedError()
+
+
+@ROI_HEADS_REGISTRY.register()
+class Res5ROIHeads(ROIHeads):
+    """
+    The ROIHeads in a typical "C4" R-CNN model, where
+    the box and mask head share the cropping and
+    the per-region feature computation by a Res5 block.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+
+        assert len(self.in_features) == 1
+
+        # fmt: off
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        pooler_scales     = (1.0 / self.feature_strides[self.in_features[0]], )
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        self.mask_on      = cfg.MODEL.MASK_ON
+        # fmt: on
+        assert not cfg.MODEL.KEYPOINT_ON
+
+        self.pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+
+        self.res5, out_channels = self._build_res5_block(cfg)
+        self.box_predictor = FastRCNNOutputLayers(
+            out_channels, self.num_classes, self.cls_agnostic_bbox_reg
+        )
+
+        if self.mask_on:
+            self.mask_head = build_mask_head(
+                cfg,
+                ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution),
+            )
+
+    def _build_res5_block(self, cfg):  # returns (nn.Sequential of the blocks, out_channels)
+        # fmt: off
+        stage_channel_factor = 2 ** 3  # res5 is 8x res2
+        num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
+        width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+        bottleneck_channels  = num_groups * width_per_group * stage_channel_factor
+        out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor
+        stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+        norm                 = cfg.MODEL.RESNETS.NORM
+        assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \
+            "Deformable conv is not yet supported in res5 head."
+        # fmt: on
+
+        blocks = make_stage(
+            BottleneckBlock,
+            3,
+            first_stride=2,
+            in_channels=out_channels // 2,
+            bottleneck_channels=bottleneck_channels,
+            out_channels=out_channels,
+            num_groups=num_groups,
+            norm=norm,
+            stride_in_1x1=stride_in_1x1,
+        )
+        return nn.Sequential(*blocks), out_channels
+
+    def _shared_roi_transform(self, features, boxes):  # pool the boxes, then apply res5
+        x = self.pooler(features, boxes)
+        return self.res5(x)
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`ROIHeads.forward`.
+        """
+        del images
+
+        if self.training:
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        proposal_boxes = [x.proposal_boxes for x in proposals]
+        box_features = self._shared_roi_transform(
+            [features[f] for f in self.in_features], proposal_boxes
+        )
+        feature_pooled = box_features.mean(dim=[2, 3])  # pooled to 1x1
+        pred_class_logits, pred_proposal_deltas = self.box_predictor(feature_pooled)
+        del feature_pooled
+
+        outputs = FastRCNNOutputs(
+            self.box2box_transform,
+            pred_class_logits,
+            pred_proposal_deltas,
+            proposals,
+            self.smooth_l1_beta,
+        )
+
+        if self.training:
+            del features
+            losses = outputs.losses()
+            if self.mask_on:
+                proposals, fg_selection_masks = select_foreground_proposals(
+                    proposals, self.num_classes
+                )
+                # Since the ROI feature transform is shared between boxes and masks,
+                # we don't need to recompute features. The mask loss is only defined
+                # on foreground proposals, so we need to select out the foreground
+                # features.
+                mask_features = box_features[torch.cat(fg_selection_masks, dim=0)]
+                del box_features
+                mask_logits = self.mask_head(mask_features)
+                losses["loss_mask"] = mask_rcnn_loss(mask_logits, proposals)
+            return [], losses
+        else:
+            pred_instances, _ = outputs.inference(
+                self.test_score_thresh, self.test_nms_thresh, self.test_detections_per_img
+            )
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(self, features, instances):
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (list[Instances]):
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        if self.mask_on:
+            features = [features[f] for f in self.in_features]
+            x = self._shared_roi_transform(features, [x.pred_boxes for x in instances])
+            mask_logits = self.mask_head(x)
+            mask_rcnn_inference(mask_logits, instances)
+        return instances
+
+
+@ROI_HEADS_REGISTRY.register()
+class StandardROIHeads(ROIHeads):
+    """
+    It's "standard" in a sense that there is no ROI transform sharing
+    or feature sharing between tasks.
+    The cropped rois go to separate branches (boxes and masks) directly.
+    This way, it is easier to make separate abstractions for different branches.
+
+    This class is used by most models, such as FPN and C5.
+    To implement more models, you can subclass it and implement a different
+    :meth:`forward()` or a head.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super(StandardROIHeads, self).__init__(cfg, input_shape)
+        self._init_box_head(cfg)
+        self._init_mask_head(cfg)
+        self._init_keypoint_head(cfg)
+
+    def _init_box_head(self, cfg):
+        # fmt: off
+        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
+        sampling_ratio    = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
+        # fmt: on
+
+        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
+        # then we share the same predictors and therefore the channel counts must be the same
+        in_channels = [self.feature_channels[f] for f in self.in_features]
+        # Check all channel counts are equal
+        assert len(set(in_channels)) == 1, in_channels
+        in_channels = in_channels[0]
+
+        self.box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
+        # They are used together so the "box predictor" layers should be part of the "box head".
+        # New subclasses of ROIHeads do not need "box predictor"s.
+        self.box_head = build_box_head(
+            cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution)
+        )
+        self.box_predictor = FastRCNNOutputLayers(
+            self.box_head.output_size, self.num_classes, self.cls_agnostic_bbox_reg
+        )
+
+    def _init_mask_head(self, cfg):
+        # fmt: off
+        self.mask_on           = cfg.MODEL.MASK_ON
+        if not self.mask_on:
+            return
+        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
+        pooler_scales     = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
+        sampling_ratio    = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type       = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
+        # fmt: on
+        # Channel counts were asserted equal across in_features in _init_box_head; take the first.
+        in_channels = [self.feature_channels[f] for f in self.in_features][0]
+
+        self.mask_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        self.mask_head = build_mask_head(
+            cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution)
+        )
+
+    def _init_keypoint_head(self, cfg):
+        # fmt: off
+        self.keypoint_on                         = cfg.MODEL.KEYPOINT_ON
+        if not self.keypoint_on:
+            return
+        pooler_resolution                        = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
+        pooler_scales                            = tuple(1.0 / self.feature_strides[k] for k in self.in_features)  # noqa
+        sampling_ratio                           = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
+        pooler_type                              = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
+        self.normalize_loss_by_visible_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS  # noqa
+        self.keypoint_loss_weight                = cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT
+        # fmt: on
+        # Channel counts were asserted equal across in_features in _init_box_head; take the first.
+        in_channels = [self.feature_channels[f] for f in self.in_features][0]
+
+        self.keypoint_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        self.keypoint_head = build_keypoint_head(
+            cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution)
+        )
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`ROIHeads.forward`.
+        """
+        del images
+        if self.training:
+            proposals = self.label_and_sample_proposals(proposals, targets)
+        del targets
+
+        features_list = [features[f] for f in self.in_features]
+
+        if self.training:
+            losses = self._forward_box(features_list, proposals)
+            # During training the proposals used by the box head are
+            # used by the mask, keypoint (and densepose) heads.
+            losses.update(self._forward_mask(features_list, proposals))
+            losses.update(self._forward_keypoint(features_list, proposals))
+            return proposals, losses  # the sampled, labeled proposals are returned too
+        else:
+            pred_instances = self._forward_box(features_list, proposals)
+            # During inference cascaded prediction is used: the mask and keypoints heads are only
+            # applied to the top scoring box detections.
+            pred_instances = self.forward_with_given_boxes(features, pred_instances)
+            return pred_instances, {}
+
+    def forward_with_given_boxes(self, features, instances):
+        """
+        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but need to obtain
+        other attributes (outputs of other heads).
+        Test-time augmentation also uses this.
+
+        Args:
+            features: same as in `forward()`
+            instances (list[Instances]): instances to predict other outputs. Expect the keys
+                "pred_boxes" and "pred_classes" to exist.
+
+        Returns:
+            instances (list[Instances]):
+                the same `Instances` objects, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+        features = [features[f] for f in self.in_features]
+
+        instances = self._forward_mask(features, instances)
+        instances = self._forward_keypoint(features, instances)
+        return instances
+
+    def _forward_box(self, features, proposals):
+        """
+        Forward logic of the box prediction branch.
+
+        Args:
+            features (list[Tensor]): #level input features for box prediction
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+
+        Returns:
+            In training, a dict of losses.
+            In inference, a list of `Instances`, the predicted instances.
+        """
+        box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
+        box_features = self.box_head(box_features)
+        pred_class_logits, pred_proposal_deltas = self.box_predictor(box_features)
+        del box_features
+
+        outputs = FastRCNNOutputs(
+            self.box2box_transform,
+            pred_class_logits,
+            pred_proposal_deltas,
+            proposals,
+            self.smooth_l1_beta,
+        )
+        if self.training:
+            return outputs.losses()
+        else:
+            pred_instances, _ = outputs.inference(
+                self.test_score_thresh, self.test_nms_thresh, self.test_detections_per_img
+            )
+            return pred_instances
+
+    def _forward_mask(self, features, instances):
+        """
+        Forward logic of the mask prediction branch.
+
+        Args:
+            features (list[Tensor]): #level input features for mask prediction
+            instances (list[Instances]): the per-image instances to train/predict masks.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_masks" and return it.
+        """
+        if not self.mask_on:
+            return {} if self.training else instances
+
+        if self.training:
+            # The loss is only defined on positive proposals.
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposal_boxes = [x.proposal_boxes for x in proposals]
+            mask_features = self.mask_pooler(features, proposal_boxes)
+            mask_logits = self.mask_head(mask_features)
+            return {"loss_mask": mask_rcnn_loss(mask_logits, proposals)}
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            mask_features = self.mask_pooler(features, pred_boxes)
+            mask_logits = self.mask_head(mask_features)
+            mask_rcnn_inference(mask_logits, instances)
+            return instances
+
+    def _forward_keypoint(self, features, instances):
+        """
+        Forward logic of the keypoint prediction branch.
+
+        Args:
+            features (list[Tensor]): #level input features for keypoint prediction
+            instances (list[Instances]): the per-image instances to train/predict keypoints.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses.
+            In inference, update `instances` with new fields "pred_keypoints" and return it.
+        """
+        if not self.keypoint_on:
+            return {} if self.training else instances
+
+        num_images = len(instances)
+
+        if self.training:
+            # The loss is defined on positive proposals with >=1 visible keypoints.
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposals = select_proposals_with_visible_keypoints(proposals)
+            proposal_boxes = [x.proposal_boxes for x in proposals]
+
+            keypoint_features = self.keypoint_pooler(features, proposal_boxes)
+            keypoint_logits = self.keypoint_head(keypoint_features)
+
+            normalizer = (
+                num_images
+                * self.batch_size_per_image
+                * self.positive_sample_fraction
+                * keypoint_logits.shape[1]
+            )
+            loss = keypoint_rcnn_loss(
+                keypoint_logits,
+                proposals,
+                normalizer=None if self.normalize_loss_by_visible_keypoints else normalizer,
+            )
+            return {"loss_keypoint": loss * self.keypoint_loss_weight}
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            keypoint_features = self.keypoint_pooler(features, pred_boxes)
+            keypoint_logits = self.keypoint_head(keypoint_features)
+            keypoint_rcnn_inference(keypoint_logits, instances)
+            return instances
diff --git a/detectron2/modeling/roi_heads/rotated_fast_rcnn.py b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
new file mode 100644
index 0000000..aebc61c
--- /dev/null
+++ b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py
@@ -0,0 +1,298 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+from typing import Dict
+import torch
+
+from detectron2.layers import ShapeSpec, batched_nms_rotated
+from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
+from detectron2.utils.events import get_event_storage
+
+from ..box_regression import Box2BoxTransformRotated
+from ..poolers import ROIPooler
+from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals
+from .box_head import build_box_head
+from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
+from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
+
+logger = logging.getLogger(__name__)
+
+"""
+Shape shorthand in this module:
+
+    N: number of images in the minibatch
+    R: number of ROIs, combined over all images, in the minibatch
+    Ri: number of ROIs in image i
+    K: number of foreground classes. E.g., there are 80 foreground classes in COCO.
+
+Naming convention:
+
+    deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the box2box
+    transform (see :class:`box_regression.Box2BoxTransformRotated`).
+
+    pred_class_logits: predicted class scores in [-inf, +inf]; use
+        softmax(pred_class_logits) to estimate P(class).
+
+    gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
+        foreground object classes and K represents the background class.
+
+    pred_proposal_deltas: predicted rotated box2box transform deltas for transforming proposals
+        to detection box predictions.
+
+    gt_proposal_deltas: ground-truth rotated box2box transform deltas
+"""
+
+
+def fast_rcnn_inference_rotated(
+    boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Run `fast_rcnn_inference_single_image_rotated` on every image of the batch.
+
+    Args:
+        boxes (list[Tensor]): per-image predicted boxes, class-specific or
+            class-agnostic. Element i has shape (Ri, K * 5) for class-specific
+            regression, or (Ri, 5) for class-agnostic regression, where Ri is the
+            number of predicted objects for image i.
+            This is compatible with the output of :meth:`FastRCNNOutputs.predict_boxes`.
+        scores (list[Tensor]): per-image predicted class scores. Element i has
+            shape (Ri, K + 1), where Ri is the number of predicted objects for
+            image i. Compatible with the output of :meth:`FastRCNNOutputs.predict_probs`.
+        image_shapes (list[tuple]): one image-shape tuple per image in the batch.
+        score_thresh (float): Only return detections with a confidence score exceeding this
+            threshold.
+        nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        instances: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confident detections.
+        kept_indices: (list[Tensor]): A list of N 1D tensors; element i holds, for each kept
+            detection of image i, its row index in [0, Ri) into the input boxes/scores.
+    """
+    per_image_results = [
+        fast_rcnn_inference_single_image_rotated(
+            img_boxes, img_scores, img_shape, score_thresh, nms_thresh, topk_per_image
+        )
+        for img_scores, img_boxes, img_shape in zip(scores, boxes, image_shapes)
+    ]
+    return tuple(map(list, zip(*per_image_results)))
+
+
+def fast_rcnn_inference_single_image_rotated(
+    boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image
+):
+    """
+    Produce the rotated-box detections of one image: drop low-scoring (box, class) pairs,
+    run per-class rotated NMS, and keep at most `topk_per_image` of the survivors.
+
+    Args:
+        Same as `fast_rcnn_inference_rotated`, except that `boxes`, `scores` and
+        `image_shape` describe a single image instead of a batch.
+
+    Returns:
+        Same as `fast_rcnn_inference_rotated`, but for only one image.
+    """
+    box_dim = 5  # number of values describing one rotated box
+    fg_scores = scores[:, :-1]  # drop the last (background) column
+    num_bbox_reg_classes = boxes.shape[1] // box_dim
+    # Wrap in RotatedBoxes so that its `clip` method can be used.
+    clipped = RotatedBoxes(boxes.reshape(-1, box_dim))
+    clipped.clip(image_shape)
+    per_class_boxes = clipped.tensor.view(-1, num_bbox_reg_classes, box_dim)  # R x C x B
+    # Keep only the (prediction, class) pairs scoring above the threshold.
+    keep_mask = fg_scores > score_thresh  # R x K
+    # R' x 2: column 0 indexes the R predictions, column 1 indexes the class.
+    kept_inds = keep_mask.nonzero()
+    if num_bbox_reg_classes == 1:
+        # class-agnostic regression: a single box per prediction
+        kept_boxes = per_class_boxes[kept_inds[:, 0], 0]
+    else:
+        kept_boxes = per_class_boxes[keep_mask]
+    kept_scores = fg_scores[keep_mask]
+
+    # Apply per-class Rotated NMS
+    nms_keep = batched_nms_rotated(kept_boxes, kept_scores, kept_inds[:, 1], nms_thresh)
+    if topk_per_image >= 0:
+        nms_keep = nms_keep[:topk_per_image]
+    kept_inds = kept_inds[nms_keep]
+
+    result = Instances(image_shape)
+    result.pred_boxes = RotatedBoxes(kept_boxes[nms_keep])
+    result.scores = kept_scores[nms_keep]
+    result.pred_classes = kept_inds[:, 1]
+
+    return result, kept_inds[:, 0]
+
+
+class RotatedFastRCNNOutputs(FastRCNNOutputs):
+    """
+    Outputs of a Fast R-CNN head whose boxes are RotatedBoxes.
+    """
+
+    def inference(self, score_thresh, nms_thresh, topk_per_image):
+        """
+        Run `fast_rcnn_inference_rotated` on the boxes and class probabilities
+        predicted by this object, using its stored image shapes.
+
+        Args:
+            score_thresh (float): same as `fast_rcnn_inference_rotated`.
+            nms_thresh (float): same as `fast_rcnn_inference_rotated`.
+            topk_per_image (int): same as `fast_rcnn_inference_rotated`.
+        Returns:
+            list[Instances]: same as `fast_rcnn_inference_rotated`.
+            list[Tensor]: same as `fast_rcnn_inference_rotated`.
+        """
+        pred_boxes, pred_probs = self.predict_boxes(), self.predict_probs()
+        return fast_rcnn_inference_rotated(
+            pred_boxes, pred_probs, self.image_shapes, score_thresh, nms_thresh, topk_per_image
+        )
+
+
+@ROI_HEADS_REGISTRY.register()
+class RROIHeads(StandardROIHeads):
+    """
+    ROI heads for rotated boxes, used together with the Rotated RPN (RRPN).
+    Only the box head is supported for now; mask and keypoint heads are not.
+    """
+
+    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
+        super().__init__(cfg, input_shape)
+        # Use the rotated transform, which works on 5-d (dx, dy, dw, dh, da) deltas.
+        reg_weights = cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS
+        self.box2box_transform = Box2BoxTransformRotated(weights=reg_weights)
+        assert not (
+            self.mask_on or self.keypoint_on
+        ), "Mask/Keypoints not supported in Rotated ROIHeads."
+
+    def _init_box_head(self, cfg):
+        """Build the box pooler, the box head and the 5-d box predictor from config."""
+        box_cfg = cfg.MODEL.ROI_BOX_HEAD
+        pooler_resolution = box_cfg.POOLER_RESOLUTION
+        pooler_scales = tuple(1.0 / self.feature_strides[name] for name in self.in_features)
+        sampling_ratio = box_cfg.POOLER_SAMPLING_RATIO
+        pooler_type = box_cfg.POOLER_TYPE
+
+        # When several feature maps are used (as in FPN) they share the same
+        # predictors, so every input feature must have the same channel count.
+        channel_counts = [self.feature_channels[name] for name in self.in_features]
+        assert len(set(channel_counts)) == 1, channel_counts
+        in_channels = channel_counts[0]
+
+        # Only the rotated ROIAlign pooler is accepted here.
+        assert pooler_type in ["ROIAlignRotated"]
+
+        self.box_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type=pooler_type,
+        )
+        box_head_input = ShapeSpec(
+            channels=in_channels, height=pooler_resolution, width=pooler_resolution
+        )
+        self.box_head = build_box_head(cfg, box_head_input)
+        self.box_predictor = FastRCNNOutputLayers(
+            input_size=self.box_head.output_size,
+            num_classes=self.num_classes,
+            cls_agnostic_bbox_reg=self.cls_agnostic_bbox_reg,
+            box_dim=5,
+        )
+
+    @torch.no_grad()
+    def label_and_sample_proposals(self, proposals, targets):
+        """
+        Select the proposals used to train the RROI heads.
+        Every proposal is matched against the rotated ground-truth boxes of its image
+        (by rotated IoU) and given a training label; a subset is then sampled
+        with `self._sample_proposals`. Ground-truth boxes are first added to the
+        proposals when `self.proposal_append_gt` is set.
+
+        Args:
+            See :meth:`StandardROIHeads.forward`
+
+        Returns:
+            list[Instances]: length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+                - proposal_boxes: the rotated proposal boxes
+                - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to
+                  (only meaningful for foreground proposals; all zeros when the
+                   image has no ground truth)
+                - gt_classes: the ground-truth classification label for each proposal
+        """
+        all_gt_boxes = [t.gt_boxes for t in targets]
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(all_gt_boxes, proposals)
+
+        sampled_proposals = []
+        fg_counts, bg_counts = [], []
+        for image_proposals, image_targets in zip(proposals, targets):
+            has_gt = len(image_targets) > 0
+            iou_matrix = pairwise_iou_rotated(
+                image_targets.gt_boxes, image_proposals.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(iou_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, image_targets.gt_classes
+            )
+
+            image_proposals = image_proposals[sampled_idxs]
+            image_proposals.gt_classes = gt_classes
+
+            if not has_gt:
+                # No ground truth in this image: attach all-zero placeholder
+                # boxes, one per sampled proposal.
+                zeros = image_targets.gt_boxes.tensor.new_zeros((len(sampled_idxs), 5))
+                image_proposals.gt_boxes = RotatedBoxes(zeros)
+            else:
+                matched_gt = matched_idxs[sampled_idxs]
+                image_proposals.gt_boxes = image_targets.gt_boxes[matched_gt]
+
+            # A class label equal to `self.num_classes` marks a background sample.
+            num_bg = (gt_classes == self.num_classes).sum().item()
+            bg_counts.append(num_bg)
+            fg_counts.append(gt_classes.numel() - num_bg)
+            sampled_proposals.append(image_proposals)
+
+        # Log the mean number of fg/bg samples that are selected for training ROI heads
+        storage = get_event_storage()
+        storage.put_scalar("roi_head/num_fg_samples", np.mean(fg_counts))
+        storage.put_scalar("roi_head/num_bg_samples", np.mean(bg_counts))
+
+        return sampled_proposals
+
+    def _forward_box(self, features, proposals):
+        """
+        Run the box branch: pool features, apply the box head and predict boxes.
+
+        Args:
+            features (list[Tensor]): #level input features for box prediction
+            proposals (list[Instances]): the per-image object proposals with
+                their matching ground truth.
+                Each has fields "proposal_boxes", and "objectness_logits",
+                "gt_classes", "gt_boxes".
+
+        Returns:
+            In training, a dict of losses.
+            In inference, a list of `Instances`, the predicted instances.
+        """
+        proposal_boxes = [p.proposal_boxes for p in proposals]
+        box_features = self.box_head(self.box_pooler(features, proposal_boxes))
+        pred_class_logits, pred_proposal_deltas = self.box_predictor(box_features)
+        del box_features
+
+        outputs = RotatedFastRCNNOutputs(
+            self.box2box_transform,
+            pred_class_logits,
+            pred_proposal_deltas,
+            proposals,
+            self.smooth_l1_beta,
+        )
+        if self.training:
+            return outputs.losses()
+
+        pred_instances, _ = outputs.inference(
+            self.test_score_thresh, self.test_nms_thresh, self.test_detections_per_img
+        )
+        return pred_instances
diff --git a/detectron2/modeling/sampling.py b/detectron2/modeling/sampling.py
new file mode 100644
index 0000000..92efa91
--- /dev/null
+++ b/detectron2/modeling/sampling.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+__all__ = ["subsample_labels"]
+
+
+def subsample_labels(labels, num_samples, positive_fraction, bg_label):
+    """
+    Randomly pick up to `num_samples` entries of `labels`, with a fraction of
+    positives no larger than `positive_fraction`.
+
+    Args:
+        labels (Tensor): (N, ) label vector with values:
+            * -1: ignore
+            * bg_label: background ("negative") class
+            * otherwise: one or more foreground ("positive") classes
+        num_samples (int): The total number of positive plus negative indices
+            to return, when enough candidates exist.
+        positive_fraction (float): The number of positives sampled is
+            `min(num_positives, int(positive_fraction * num_samples))`. The number
+            of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`.
+            In other words, if there are not enough positives, the sample is filled with
+            negatives. If there are also not enough negatives, then as many elements are
+            sampled as is possible.
+        bg_label (int): label index of background ("negative") class.
+
+    Returns:
+        pos_idx, neg_idx (Tensor):
+            1D indices into `labels`. The total number of indices is `num_samples` if
+            possible. The fraction of positive indices is `positive_fraction` if possible.
+    """
+    positive_mask = (labels != -1) & (labels != bg_label)
+    negative_mask = labels == bg_label
+    pos_candidates = torch.nonzero(positive_mask).squeeze(1)
+    neg_candidates = torch.nonzero(negative_mask).squeeze(1)
+
+    # Positives get at most their quota, limited by how many exist.
+    num_pos = min(pos_candidates.numel(), int(num_samples * positive_fraction))
+    # Negatives fill the rest of the budget, limited by how many exist.
+    num_neg = min(neg_candidates.numel(), num_samples - num_pos)
+
+    # Shuffle each group and keep the leading entries as the random subset.
+    pos_order = torch.randperm(pos_candidates.numel(), device=pos_candidates.device)
+    neg_order = torch.randperm(neg_candidates.numel(), device=neg_candidates.device)
+
+    pos_idx = pos_candidates[pos_order[:num_pos]]
+    neg_idx = neg_candidates[neg_order[:num_neg]]
+    return pos_idx, neg_idx
diff --git a/detectron2/modeling/test_time_augmentation.py b/detectron2/modeling/test_time_augmentation.py
new file mode 100644
index 0000000..9535c9b
--- /dev/null
+++ b/detectron2/modeling/test_time_augmentation.py
@@ -0,0 +1,248 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import numpy as np
+from contextlib import contextmanager
+from itertools import count
+import torch
+from torch import nn
+from torch.nn.parallel import DistributedDataParallel
+
+from detectron2.data.detection_utils import read_image
+from detectron2.data.transforms import ResizeShortestEdge
+from detectron2.structures import Instances
+
+from .meta_arch import GeneralizedRCNN
+from .postprocessing import detector_postprocess
+from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image
+
+__all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"]
+
+
+class DatasetMapperTTA:
+    """
+    Test-time augmentation mapper for detection data.
+    Calling it with one dataset dict of a detection dataset returns a list of
+    dataset dicts whose images are augmented versions of the input image:
+    one resize per configured min size, each optionally followed by a
+    horizontally flipped copy.
+    """
+
+    def __init__(self, cfg):
+        aug_cfg = cfg.TEST.AUG
+        self.min_sizes, self.max_size = aug_cfg.MIN_SIZES, aug_cfg.MAX_SIZE
+        self.flip = aug_cfg.FLIP
+        self.image_format = cfg.INPUT.FORMAT
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): a detection dataset dict
+
+        Returns:
+            list[dict]:
+                deep copies of the input dict, each holding one augmented image.
+                The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``.
+        """
+        if "image" in dataset_dict:
+            numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy().astype("uint8")
+        else:
+            numpy_image = read_image(dataset_dict["file_name"], self.image_format)
+        augmented = []
+        for min_size in self.min_sizes:
+            image = np.copy(numpy_image)
+            transform = ResizeShortestEdge(min_size, self.max_size).get_transform(image)
+            resized_hwc = transform.apply_image(image)
+            resized = torch.as_tensor(resized_hwc.transpose(2, 0, 1).astype("float32"))
+
+            plain = copy.deepcopy(dataset_dict)
+            plain["horiz_flip"] = False
+            plain["image"] = resized
+            augmented.append(plain)
+
+            if self.flip:
+                flipped = copy.deepcopy(dataset_dict)
+                flipped["horiz_flip"] = True
+                flipped["image"] = torch.flip(resized, dims=[2])
+                augmented.append(flipped)
+        return augmented
+
+
+class GeneralizedRCNNWithTTA(nn.Module):
+    """
+    A GeneralizedRCNN with test-time augmentation enabled.
+    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
+    """
+
+    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
+        """
+        Args:
+            cfg (CfgNode): the config; a clone of it is kept as `self.cfg`.
+            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+            tta_mapper (callable): takes a dataset dict and returns a list of
+                augmented versions of the dataset dict. Defaults to
+                `DatasetMapperTTA(cfg)`.
+            batch_size (int): batch the augmented images into this batch size for inference.
+        """
+        super().__init__()
+        if isinstance(model, DistributedDataParallel):
+            model = model.module
+        assert isinstance(
+            model, GeneralizedRCNN
+        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
+        self.cfg = cfg.clone()
+        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
+        assert (
+            not self.cfg.MODEL.LOAD_PROPOSALS
+        ), "TTA for pre-computed proposals is not supported yet"
+
+        self.model = model
+
+        if tta_mapper is None:
+            tta_mapper = DatasetMapperTTA(cfg)
+        self.tta_mapper = tta_mapper
+        self.batch_size = batch_size
+
+    @contextmanager
+    def _turn_off_roi_head(self, attr):
+        """
+        Open a context where one head in `model.roi_heads` is temporarily turned off.
+        The previous value is restored on exit, even if the body of the context raises.
+        Args:
+            attr (str): the attribute in `model.roi_heads` which can be used
+                to turn off a specific head, e.g., "mask_on", "keypoint_on".
+        """
+        roi_heads = self.model.roi_heads
+        # The head may not be implemented in certain ROIHeads; then there is nothing to do.
+        old = getattr(roi_heads, attr, None)
+        if old is None:
+            yield
+            return
+
+        setattr(roi_heads, attr, False)
+        try:
+            yield
+        finally:
+            setattr(roi_heads, attr, old)
+
+    def _batch_inference(self, batched_inputs, detected_instances=None, do_postprocess=True):
+        """
+        Execute inference on a list of inputs,
+        using batch size = self.batch_size, instead of the length of the list.
+
+        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
+        """
+        if detected_instances is None:
+            detected_instances = [None] * len(batched_inputs)
+
+        outputs = []
+        inputs, instances = [], []
+        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
+            inputs.append(input)
+            instances.append(instance)
+            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
+                outputs.extend(
+                    self.model.inference(
+                        inputs,
+                        instances if instances[0] is not None else None,
+                        do_postprocess=do_postprocess,
+                    )
+                )
+                inputs, instances = [], []
+        return outputs
+
+    def __call__(self, batched_inputs):
+        """
+        Same input/output format as :meth:`GeneralizedRCNN.forward`
+        """
+        return [self._inference_one_image(x) for x in batched_inputs]
+
+    def _inference_one_image(self, input):
+        """
+        Args:
+            input (dict): one dataset dict
+
+        Returns:
+            dict: one output dict
+        """
+        augmented_inputs = self.tta_mapper(input)
+
+        do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs]
+        heights = [k["height"] for k in augmented_inputs]
+        widths = [k["width"] for k in augmented_inputs]
+        assert (
+            len(set(heights)) == 1 and len(set(widths)) == 1
+        ), "Augmented version of the inputs should have the same original resolution!"
+        height = heights[0]
+        width = widths[0]
+
+        # 1. Detect boxes from all augmented versions
+        # 1.1: forward with all augmented images
+        with self._turn_off_roi_head("mask_on"), self._turn_off_roi_head("keypoint_on"):
+            # temporarily disable mask/keypoint head
+            outputs = self._batch_inference(augmented_inputs, do_postprocess=False)
+        # 1.2: union the results
+        all_boxes = []
+        all_scores = []
+        all_classes = []
+        for idx, output in enumerate(outputs):
+            rescaled_output = detector_postprocess(output, height, width)
+            pred_boxes = rescaled_output.pred_boxes.tensor
+            if do_hflip[idx]:
+                pred_boxes[:, [0, 2]] = width - pred_boxes[:, [2, 0]]
+            all_boxes.append(pred_boxes)
+            all_scores.extend(rescaled_output.scores)
+            all_classes.extend(rescaled_output.pred_classes)
+        all_boxes = torch.cat(all_boxes, dim=0).cpu()
+        num_boxes = len(all_boxes)
+
+        # 1.3: select from the union of all results
+        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
+        # +1 because fast_rcnn_inference expects background scores as well
+        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
+        for idx, cls, score in zip(count(), all_classes, all_scores):
+            all_scores_2d[idx, cls] = score
+
+        merged_instances, _ = fast_rcnn_inference_single_image(
+            all_boxes,
+            all_scores_2d,
+            (height, width),
+            1e-8,
+            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
+            self.cfg.TEST.DETECTIONS_PER_IMAGE,
+        )
+
+        if not self.cfg.MODEL.MASK_ON:
+            return {"instances": merged_instances}
+
+        # 2. Use the detected boxes to obtain masks
+        # 2.1: rescale the detected boxes
+        augmented_instances = []
+        for idx, input in enumerate(augmented_inputs):
+            actual_height, actual_width = input["image"].shape[1:3]
+            scale_x = actual_width * 1.0 / width
+            scale_y = actual_height * 1.0 / height
+            pred_boxes = merged_instances.pred_boxes.clone()
+            pred_boxes.tensor[:, 0::2] *= scale_x
+            pred_boxes.tensor[:, 1::2] *= scale_y
+            if do_hflip[idx]:
+                pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]]
+
+            aug_instances = Instances(
+                image_size=(actual_height, actual_width),
+                pred_boxes=pred_boxes,
+                pred_classes=merged_instances.pred_classes,
+                scores=merged_instances.scores,
+            )
+            augmented_instances.append(aug_instances)
+        # 2.2: run forward on the detected boxes
+        outputs = self._batch_inference(augmented_inputs, augmented_instances, do_postprocess=False)
+        for idx, output in enumerate(outputs):
+            if do_hflip[idx]:
+                output.pred_masks = output.pred_masks.flip(dims=[3])
+        # 2.3: average the predictions
+        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
+        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
+        output = outputs[0]
+        output.pred_masks = avg_pred_masks
+        output = detector_postprocess(output, height, width)
+        return {"instances": output}
diff --git a/detectron2/solver/__init__.py b/detectron2/solver/__init__.py
new file mode 100644
index 0000000..10f84e1
--- /dev/null
+++ b/detectron2/solver/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .build import build_lr_scheduler, build_optimizer
+from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/detectron2/solver/build.py b/detectron2/solver/build.py
new file mode 100644
index 0000000..af685c9
--- /dev/null
+++ b/detectron2/solver/build.py
@@ -0,0 +1,60 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import Any, Dict, List
+import torch
+
+from detectron2.config import CfgNode
+
+from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR
+
+
+def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
+    """
+    Build an SGD optimizer from config, with one param group per trainable parameter.
+    Names ending in "norm.weight"/"norm.bias" or ".bias" get their own LR / weight decay.
+    """
+    params: List[Dict[str, Any]] = []
+    for key, value in model.named_parameters():
+        if not value.requires_grad:
+            continue
+        lr = cfg.SOLVER.BASE_LR
+        weight_decay = cfg.SOLVER.WEIGHT_DECAY
+        if key.endswith(("norm.weight", "norm.bias")):
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
+        elif key.endswith(".bias"):
+            # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0 and
+            # WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer hyperparameters
+            # are by default exactly the same as for regular weights.
+            lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
+        params.append({"params": [value], "lr": lr, "weight_decay": weight_decay})
+
+    # The default LR is BASE_LR, not whatever `lr` the last loop iteration left behind.
+    return torch.optim.SGD(params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM)
+
+
+def build_lr_scheduler(
+    cfg: CfgNode, optimizer: torch.optim.Optimizer
+) -> torch.optim.lr_scheduler._LRScheduler:
+    """
+    Build a LR scheduler from config.
+
+    The scheduler class is selected by `cfg.SOLVER.LR_SCHEDULER_NAME`; both
+    supported schedulers take the same warmup settings from `cfg.SOLVER`.
+    Raises ValueError for an unrecognized scheduler name.
+    """
+    solver = cfg.SOLVER
+    name = solver.LR_SCHEDULER_NAME
+    if name == "WarmupMultiStepLR":
+        scheduler_cls, positional = WarmupMultiStepLR, (solver.STEPS, solver.GAMMA)
+    elif name == "WarmupCosineLR":
+        scheduler_cls, positional = WarmupCosineLR, (solver.MAX_ITER,)
+    else:
+        raise ValueError("Unknown LR scheduler: {}".format(name))
+
+    return scheduler_cls(
+        optimizer,
+        *positional,
+        warmup_factor=solver.WARMUP_FACTOR,
+        warmup_iters=solver.WARMUP_ITERS,
+        warmup_method=solver.WARMUP_METHOD,
+    )
diff --git a/detectron2/solver/lr_scheduler.py b/detectron2/solver/lr_scheduler.py
new file mode 100644
index 0000000..7c88689
--- /dev/null
+++ b/detectron2/solver/lr_scheduler.py
@@ -0,0 +1,116 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from bisect import bisect_right
+from typing import List
+import torch
+
+# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes
+# only on epoch boundaries. We typically use iteration based schedules instead.
+# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean
+# "iteration" instead.
+
+# FIXME: ideally this would be achieved with a CombinedLRScheduler, separating
+# MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it.
+
+
+class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
+    """Multiply the LR by `gamma` at each passed milestone, combined with LR warmup."""
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        milestones: List[int],
+        gamma: float = 0.1,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        if list(milestones) != sorted(milestones):
+            msg = "Milestones should be a list of increasing integers. Got {}"
+            raise ValueError(msg.format(milestones))
+        self.milestones = milestones
+        self.gamma = gamma
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        warmup_factor = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        return [
+            base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch)
+            for base_lr in self.base_lrs
+        ]
+
+    def _compute_values(self) -> List[float]:
+        # The new interface
+        return self.get_lr()
+
+
+class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler):
+    """Half-cosine LR decay over `max_iters` iterations, scaled by a warmup factor."""
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        max_iters: int,
+        warmup_factor: float = 0.001,
+        warmup_iters: int = 1000,
+        warmup_method: str = "linear",
+        last_epoch: int = -1,
+    ):
+        self.max_iters = max_iters
+        self.warmup_factor = warmup_factor
+        self.warmup_iters = warmup_iters
+        self.warmup_method = warmup_method
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self) -> List[float]:
+        """Return the current LR of every param group ("epoch" here means iteration)."""
+        # Different definitions of half-cosine with warmup are possible. Here
+        # the standard half-cosine schedule is simply multiplied by the warmup
+        # factor; an alternative would start the cosine period at warmup_iters
+        # instead of at 0. When warmup_iters << max_iters the two are very close.
+        warmup = _get_warmup_factor_at_iter(
+            self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor
+        )
+        lrs = []
+        for base_lr in self.base_lrs:
+            cosine_term = 1.0 + math.cos(math.pi * self.last_epoch / self.max_iters)
+            lrs.append(base_lr * warmup * 0.5 * cosine_term)
+        return lrs
+
+    def _compute_values(self) -> List[float]:
+        # Entry point of the newer scheduler interface; same values as get_lr().
+        return self.get_lr()
+
+
+def _get_warmup_factor_at_iter(
+    method: str, iter: int, warmup_iters: int, warmup_factor: float
+) -> float:
+    """
+    Compute the factor by which the learning rate is scaled during warmup.
+    See https://arxiv.org/abs/1706.02677 for more details.
+
+    Args:
+        method (str): warmup method; either "constant" or "linear".
+        iter (int): iteration at which to calculate the warmup factor.
+        warmup_iters (int): the number of warmup iterations.
+        warmup_factor (float): the base warmup factor (the meaning changes according
+            to the method used).
+
+    Returns:
+        float: the effective warmup factor at the given iteration.
+    """
+    if iter >= warmup_iters:
+        return 1.0  # warmup is over
+    if method == "constant":
+        return warmup_factor
+    if method != "linear":
+        raise ValueError("Unknown warmup method: {}".format(method))
+
+    # Linear ramp from `warmup_factor` (at iter 0) towards 1.0 (at warmup_iters).
+    progress = iter / warmup_iters
+    return warmup_factor * (1 - progress) + progress
diff --git a/detectron2/structures/__init__.py b/detectron2/structures/__init__.py
new file mode 100644
index 0000000..9237e14
--- /dev/null
+++ b/detectron2/structures/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .boxes import Boxes, BoxMode, pairwise_iou
+from .image_list import ImageList
+from .instances import Instances
+from .keypoints import Keypoints, heatmaps_to_keypoints
+from .masks import BitMasks, PolygonMasks, rasterize_polygons_within_box
+from .rotated_boxes import RotatedBoxes
+from .rotated_boxes import pairwise_iou as pairwise_iou_rotated
+
+__all__ = [name for name in globals() if not name.startswith("_")]
diff --git a/detectron2/structures/boxes.py b/detectron2/structures/boxes.py
new file mode 100644
index 0000000..68f125e
--- /dev/null
+++ b/detectron2/structures/boxes.py
@@ -0,0 +1,299 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import numpy as np
+from enum import Enum, unique
+from typing import Iterator, List, Tuple, Union
+import torch
+
+from detectron2.layers import cat
+
+_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray]  # raw box inputs
+
+
+@unique
+class BoxMode(Enum):
+    """
+    Enum of different ways to represent a box.
+
+    Attributes:
+
+        XYXY_ABS: (x0, y0, x1, y1) in absolute floating points coordinates.
+            The coordinates in range [0, width or height].
+        XYWH_ABS: (x0, y0, w, h) in absolute floating points coordinates.
+        XYXY_REL: (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image.
+        XYWH_REL: (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image.
+    """
+
+    XYXY_ABS = 0
+    XYWH_ABS = 1
+    XYXY_REL = 2
+    XYWH_REL = 3
+
+    @staticmethod
+    def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType:
+        """
+        Convert `box` between the absolute XYXY and XYWH representations.
+
+        Args:
+            box: can be a 4-tuple, 4-list or a Nx4 array/tensor. It is never modified.
+            from_mode, to_mode (BoxMode): only the two `*_ABS` modes are supported so far.
+
+        Returns:
+            The converted box of the same type. A list/tuple holds builtin Python numbers.
+        """
+        if from_mode == to_mode:
+            return box
+
+        original_type = type(box)
+        single_box = isinstance(box, (list, tuple))
+        if single_box:
+            arr = np.array(box)
+            msg = "BoxMode.convert takes either a 4-tuple/list or a Nx4 array/tensor"
+            assert arr.shape == (4,), msg
+        else:
+            arr = copy.deepcopy(box)  # avoid modifying the input box
+        assert to_mode.value < 2 and from_mode.value < 2, "Relative mode not yet supported!"
+
+        original_shape = arr.shape
+        arr = arr.reshape(-1, 4)  # rows of 4, so single boxes and Nx4 inputs share one code path
+        if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS:
+            arr[:, 2] += arr[:, 0]
+            arr[:, 3] += arr[:, 1]
+        elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS:
+            arr[:, 2] -= arr[:, 0]
+            arr[:, 3] -= arr[:, 1]
+        else:
+            raise RuntimeError("Cannot be here!")
+        if single_box:
+            return original_type(arr.flatten().tolist())  # builtin numbers, not numpy scalars
+        return arr.reshape(*original_shape)
+
+
+class Boxes:
+    """
+    A list of boxes stored as a single Nx4 torch.Tensor.
+    Besides box-specific helpers
+    (`area`, `clip`, `nonempty`, etc),
+    it offers a Tensor-like interface
+    (indexing, `to(device)`, `.device`, and iteration over all boxes).
+
+    Attributes:
+        tensor: float matrix of Nx4.
+    """
+
+    BoxSizeType = Union[List[int], Tuple[int, int]]
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx4 matrix.  Each row is (x1, y1, x2, y2).
+                Input that is not a Tensor is converted to one on the CPU.
+        """
+        target = torch.device("cpu") if not isinstance(tensor, torch.Tensor) else tensor.device
+        data = torch.as_tensor(tensor, dtype=torch.float32, device=target)
+        if data.numel() == 0:  # any empty input is normalized to shape (0, 4)
+            data = torch.zeros(0, 4, dtype=torch.float32, device=target)
+        assert data.dim() == 2 and data.size(-1) == 4, data.size()
+        self.tensor = data
+
+    def clone(self) -> "Boxes":
+        """
+        Copy the Boxes; the result is backed by a clone of the underlying tensor.
+
+        Returns:
+            Boxes
+        """
+        return Boxes(self.tensor.clone())
+
+    def to(self, device: str) -> "Boxes":
+        return Boxes(self.tensor.to(device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area (width * height) of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        coords = self.tensor
+        widths, heights = coords[:, 2] - coords[:, 0], coords[:, 3] - coords[:, 1]
+        return widths * heights
+
+    def clip(self, box_size: BoxSizeType) -> None:
+        """
+        Clamp the boxes in place so that x coordinates lie in [0, width] and
+        y coordinates lie in [0, height].
+        The coordinates must all be finite (checked with an assertion).
+
+        Args:
+            box_size (height, width): The clipping box's size.
+        """
+        assert torch.isfinite(self.tensor).all()
+        height, width = box_size
+        # columns alternate x, y, x, y, so pair each column with its upper bound
+        for column, upper in enumerate((width, height, width, height)):
+            self.tensor[:, column].clamp_(min=0, max=upper)
+
+    def nonempty(self, threshold: int = 0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box counts as empty when its width or its height is no larger than `threshold`.
+
+        Returns:
+            Tensor:
+                a binary vector with one entry per box: True for non-empty boxes,
+                False for empty ones.
+        """
+        coords = self.tensor
+        # both sides have to exceed the threshold for a box to be kept
+        wide_enough = (coords[:, 2] - coords[:, 0]) > threshold
+        tall_enough = (coords[:, 3] - coords[:, 1]) > threshold
+        return wide_enough & tall_enough
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Boxes":
+        """
+        Create a new :class:`Boxes` by indexing this one.
+
+        Supported forms of `item`:
+        1. `boxes[3]` (int): a `Boxes` holding just that one box.
+        2. `boxes[2:10]` (slice): a `Boxes` holding the selected range.
+        3. `boxes[vector]`, where vector is a torch.BoolTensor with
+           `length = len(boxes)`: a `Boxes` holding the rows where vector is nonzero.
+
+        The result might share storage with this Boxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            single_row = self.tensor[item]
+            return Boxes(single_row.view(1, -1))
+        rows = self.tensor[item]
+        assert rows.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item)
+        return Boxes(rows)
+
+    def __len__(self) -> int:
+        return self.tensor.size(0)
+
+    def __repr__(self) -> str:
+        return "".join(("Boxes(", str(self.tensor), ")"))
+
+    def inside_box(self, box_size: BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Test which boxes lie within a reference box whose top-left corner is the origin.
+
+        Args:
+            box_size (height, width): Size of the reference box.
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+        coords = self.tensor
+        min_ok = (coords[..., 0] >= -boundary_threshold) & (coords[..., 1] >= -boundary_threshold)
+        x_max_ok = coords[..., 2] < width + boundary_threshold
+        y_max_ok = coords[..., 3] < height + boundary_threshold
+        return min_ok & x_max_ok & y_max_ok
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns: the box centers, i.e. corner midpoints, in a Nx2 array of (x, y).
+        """
+        corner_min, corner_max = self.tensor[:, :2], self.tensor[:, 2:]
+        return (corner_min + corner_max) / 2
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Multiply, in place, x coordinates by `scale_x` and y coordinates by `scale_y`.
+        """
+        for first_column, factor in ((0, scale_x), (1, scale_y)):
+            self.tensor[:, first_column::2] *= factor
+
+    @staticmethod
+    def cat(boxes_list: List["Boxes"]) -> "Boxes":
+        """
+        Concatenates a list of Boxes into a single Boxes
+
+        Arguments:
+            boxes_list (list[Boxes]): a non-empty list or tuple of Boxes.
+
+        Returns:
+            Boxes: the concatenated Boxes, of the same class as the first element.
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        assert len(boxes_list) > 0
+        assert all(isinstance(box, Boxes) for box in boxes_list)
+        result_cls = type(boxes_list[0])
+        stacked = cat([each.tensor for each in boxes_list], dim=0)
+        return result_cls(stacked)
+
+    @property
+    def device(self) -> str:
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        """
+        Iterate over the boxes; each item is a Tensor of shape (4,).
+        """
+        yield from self.tensor
+
+
+# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
+# with slight modifications
+def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Given two lists of boxes of size N and M,
+    compute the IoU (intersection over union)
+    between all N x M pairs of boxes.
+    The box order must be (xmin, ymin, xmax, ymax).
+
+    Args:
+        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+            Pairs that do not overlap get an IoU of exactly 0.
+    """
+    areas_n = boxes1.area()  # [N]
+    areas_m = boxes2.area()  # [M]
+    coords_n, coords_m = boxes1.tensor, boxes2.tensor
+
+    # corners of the intersection rectangle of every pair (N broadcast against M)
+    top_left = torch.max(coords_n[:, None, :2], coords_m[:, :2])  # [N,M,2]
+    bottom_right = torch.min(coords_n[:, None, 2:], coords_m[:, 2:])  # [N,M,2]
+
+    # a negative extent means the pair does not overlap, so clamp it to zero
+    extent = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    overlap = extent[:, :, 0] * extent[:, :, 1]  # [N,M]
+
+    # handle empty boxes: the quotient is only used where there is an intersection
+    union = areas_n[:, None] + areas_m - overlap
+    fallback = torch.zeros(1, dtype=overlap.dtype, device=overlap.device)
+    iou = torch.where(overlap > 0, overlap / union, fallback)
+    return iou
+
+
+def matched_boxlist_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
+    """
+    Compute the intersection over union (IoU) of two sets of matched boxes:
+    the i-th box of `boxes1` against the i-th box of `boxes2`, i.e. only the
+    diagonal of the `pairwise_iou` matrix. The box order must be
+    (xmin, ymin, xmax, ymax). A pair of zero-area boxes yields NaN (0 / 0 is not guarded).
+    Arguments:
+        boxes1: (Boxes) bounding boxes, sized [N,4].
+        boxes2: (Boxes) bounding boxes, sized [N,4].
+    Returns:
+        (tensor) iou, sized [N].
+    """
+    assert len(boxes1) == len(boxes2), (
+        "boxlists should have the same "
+        "number of entries, got {}, {}".format(len(boxes1), len(boxes2))
+    )
+    area1 = boxes1.area()  # [N]
+    area2 = boxes2.area()  # [N]
+    box1, box2 = boxes1.tensor, boxes2.tensor
+    lt = torch.max(box1[:, :2], box2[:, :2])  # [N,2] top-left corner of each intersection
+    rb = torch.min(box1[:, 2:], box2[:, 2:])  # [N,2] bottom-right corner of each intersection
+    wh = (rb - lt).clamp(min=0)  # [N,2] zero extent where the two boxes do not overlap
+    inter = wh[:, 0] * wh[:, 1]  # [N]
+    return inter / (area1 + area2 - inter)  # [N]
diff --git a/detectron2/structures/image_list.py b/detectron2/structures/image_list.py
new file mode 100644
index 0000000..de2603b
--- /dev/null
+++ b/detectron2/structures/image_list.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from __future__ import division
+from typing import Any, List, Sequence, Tuple, Union
+import torch
+from torch.nn import functional as F
+
+
+class ImageList(object):
+    """
+    Structure that holds a list of images (of possibly varying sizes)
+    as a single tensor. This works by padding the images to the same size,
+    and storing in a field the original sizes of each image.
+
+    Attributes:
+        tensor (Tensor): the padded batch, of shape (N, H, W) or (N, C_1, ..., C_K, H, W)
+        image_sizes (list[tuple[int, int]]): each tuple is the unpadded (h, w) of an image
+    """
+
+    def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]):
+        """
+        Arguments:
+            tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1
+            image_sizes (list[tuple[int, int]]): Each tuple is (h, w).
+        """
+        self.tensor = tensor
+        self.image_sizes = image_sizes
+
+    def __len__(self) -> int:
+        return len(self.image_sizes)
+
+    def __getitem__(self, idx: Union[int, slice]) -> torch.Tensor:
+        """
+        Access the individual image in its original size, i.e. with the padding cropped away.
+
+        Returns:
+            Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1
+        """
+        size = self.image_sizes[idx]
+        return self.tensor[idx, ..., : size[0], : size[1]]  # type: ignore
+
+    def to(self, *args: Any, **kwargs: Any) -> "ImageList":
+        cast_tensor = self.tensor.to(*args, **kwargs)
+        return ImageList(cast_tensor, self.image_sizes)
+
+    @staticmethod
+    def from_tensors(
+        tensors: Sequence[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0
+    ) -> "ImageList":
+        """
+        Args:
+            tensors: a tuple or list of `torch.Tensors`, each of shape (Hi, Wi) or
+                (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded with `pad_value`
+                so that they will have the same shape. All leading dims must already agree.
+            size_divisibility (int): If `size_divisibility > 0`, also adds padding to ensure
+                the common height and width is divisible by `size_divisibility`
+            pad_value (float): value to pad
+
+        Returns:
+            an `ImageList`.
+        """
+        assert len(tensors) > 0
+        assert isinstance(tensors, (tuple, list))
+        for t in tensors:
+            assert isinstance(t, torch.Tensor), type(t)
+            assert t.shape[:-2] == tensors[0].shape[:-2], t.shape  # only H and W may differ
+        # per dimension maximum (H, W) or (C_1, ..., C_K, H, W) where K >= 1 among all tensors
+        max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors]))
+
+        if size_divisibility > 0:
+            import math
+
+            stride = size_divisibility
+            max_size = list(max_size)  # type: ignore
+            max_size[-2] = int(math.ceil(max_size[-2] / stride) * stride)  # type: ignore
+            max_size[-1] = int(math.ceil(max_size[-1] / stride) * stride)  # type: ignore
+            max_size = tuple(max_size)
+
+        image_sizes = [im.shape[-2:] for im in tensors]
+
+        if len(tensors) == 1:
+            # This seems slightly (2%) faster.
+            # TODO: check whether it's faster for multiple images as well
+            image_size = image_sizes[0]
+            padded = F.pad(
+                tensors[0],
+                [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]],
+                value=pad_value,
+            )
+            batched_imgs = padded.unsqueeze_(0)
+        else:
+            batch_shape = (len(tensors),) + max_size
+            batched_imgs = tensors[0].new_full(batch_shape, pad_value)
+            for img, pad_img in zip(tensors, batched_imgs):
+                pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
+
+        return ImageList(batched_imgs.contiguous(), image_sizes)
diff --git a/detectron2/structures/instances.py b/detectron2/structures/instances.py
new file mode 100644
index 0000000..3027180
--- /dev/null
+++ b/detectron2/structures/instances.py
@@ -0,0 +1,186 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import itertools
+from typing import Any, Dict, List, Tuple, Union
+import torch
+
+from detectron2.layers import cat
+
+
+class Instances:
+    """
+    This class represents a list of instances in an image.
+    It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields".
+    All fields must have the same `__len__` which is the number of instances.
+
+    All other (non-field) attributes of this class are considered private:
+    they must start with '_' and are not modifiable by a user.
+
+    Some basic usage:
+
+    1. Set/Get/Check a field:
+       instances.gt_boxes = Boxes(...)
+       print(instances.pred_masks)
+       print('gt_masks' in instances)
+    2. `len(instances)` returns the number of instances
+    3. Indexing: `instances[indices]` will apply the indexing on all the fields
+       and returns a new `Instances`.
+       Typically, `indices` is a binary vector of length num_instances,
+       or a vector of integer indices.
+    """
+
+    def __init__(self, image_size: Tuple[int, int], **kwargs: Any):
+        """
+        Args:
+            image_size (height, width): the spatial size of the image.
+            kwargs: fields to add to this `Instances`.
+        """
+        self._image_size = image_size
+        self._fields: Dict[str, Any] = {}
+        for k, v in kwargs.items():
+            self.set(k, v)
+
+    @property
+    def image_size(self) -> Tuple[int, int]:
+        """
+        Returns:
+            tuple: height, width
+        """
+        return self._image_size
+
+    def __setattr__(self, name: str, val: Any) -> None:
+        if name.startswith("_"):
+            super().__setattr__(name, val)
+        else:
+            self.set(name, val)
+
+    def __getattr__(self, name: str) -> Any:
+        # "_fields" is excluded: looking it up below while it is missing would recurse forever
+        if name == "_fields" or name not in self._fields:
+            raise AttributeError("Cannot find field '{}' in the given Instances!".format(name))
+        return self._fields[name]
+
+    def set(self, name: str, value: Any) -> None:
+        """
+        Set the field named `name` to `value`.
+        The length of `value` must be the number of instances,
+        and must agree with other existing fields in this object.
+        """
+        data_len = len(value)
+        if len(self._fields):
+            assert (
+                len(self) == data_len
+            ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self))
+        self._fields[name] = value
+
+    def has(self, name: str) -> bool:
+        """
+        Returns:
+            bool: whether the field called `name` exists.
+        """
+        return name in self._fields
+
+    def __contains__(self, name: str) -> bool:
+        """Support `name in instances`; same answer as `has(name)`."""
+        return name in self._fields
+
+    def remove(self, name: str) -> None:
+        """
+        Remove the field called `name`.
+        """
+        del self._fields[name]
+
+    def get(self, name: str) -> Any:
+        """
+        Returns the field called `name`.
+        """
+        return self._fields[name]
+
+    def get_fields(self) -> Dict[str, Any]:
+        """
+        Returns:
+            dict: a dict which maps names (str) to data of the fields
+
+        Modifying the returned dict will modify this instance.
+        """
+        return self._fields
+
+    # Tensor-like methods
+    def to(self, device: str) -> "Instances":
+        """
+        Returns:
+            Instances: all fields are called with a `to(device)`, if the field has this method.
+        """
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            if hasattr(v, "to"):
+                v = v.to(device)
+            ret.set(k, v)
+        return ret
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances":
+        """
+        Args:
+            item: an index-like object and will be used to index all the fields.
+
+        Returns:
+            Instances: a new `Instances` with the same image size,
+            where all fields are indexed by `item`.
+        """
+        ret = Instances(self._image_size)
+        for k, v in self._fields.items():
+            ret.set(k, v[item])
+        return ret
+
+    def __len__(self) -> int:
+        for v in self._fields.values():
+            return len(v)  # all fields share one length, so the first one suffices
+        raise NotImplementedError("Empty Instances does not support __len__!")
+
+    def __iter__(self):
+        raise NotImplementedError("`Instances` object is not iterable!")
+
+    @staticmethod
+    def cat(instance_lists: List["Instances"]) -> "Instances":
+        """
+        Args:
+            instance_lists (list[Instances]): non-empty; all must share one image size.
+
+        Returns:
+            Instances: the concatenation. A one-element list returns that element itself.
+        """
+        assert all(isinstance(i, Instances) for i in instance_lists)
+        assert len(instance_lists) > 0
+        if len(instance_lists) == 1:
+            return instance_lists[0]
+
+        image_size = instance_lists[0].image_size
+        assert all(i.image_size == image_size for i in instance_lists[1:])
+        ret = Instances(image_size)
+        for k in instance_lists[0]._fields.keys():
+            values = [i.get(k) for i in instance_lists]
+            v0 = values[0]
+            if isinstance(v0, torch.Tensor):
+                values = cat(values, dim=0)
+            elif isinstance(v0, list):
+                values = list(itertools.chain(*values))
+            elif hasattr(type(v0), "cat"):
+                values = type(v0).cat(values)
+            else:
+                raise ValueError("Unsupported type {} for concatenation".format(type(v0)))
+            ret.set(k, values)
+        return ret
+
+    def __str__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self))
+        s += "image_height={}, ".format(self._image_size[0])
+        s += "image_width={}, ".format(self._image_size[1])
+        return s + "fields=[{}])".format(", ".join(self._fields.keys()))
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={}, ".format(len(self))
+        s += "image_height={}, ".format(self._image_size[0])
+        s += "image_width={}, ".format(self._image_size[1])
+        s += "fields=[" + "".join("{} = {}, ".format(k, v) for k, v in self._fields.items())
+        return s + "])"
diff --git a/detectron2/structures/keypoints.py b/detectron2/structures/keypoints.py
new file mode 100644
index 0000000..8cc23bd
--- /dev/null
+++ b/detectron2/structures/keypoints.py
@@ -0,0 +1,202 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+from typing import Any, List, Tuple, Union
+import torch
+
+from detectron2.layers import interpolate
+
+
+class Keypoints:
+    """
+    Stores keypoint annotation data. GT Instances have a `gt_keypoints` property
+    containing the x,y location and visibility flag of each keypoint. This tensor has shape
+    (N, K, 3) where N is the number of instances and K is the number of keypoints per instance.
+
+    The visibility flag follows the COCO format and must be one of three integers:
+    * v=0: not labeled (in which case x=y=0)
+    * v=1: labeled but not visible
+    * v=2: labeled and visible
+    """
+
+    def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]):
+        """
+        Arguments:
+            keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint.
+                The shape should be (N, K, 3) where N is the number of
+                instances, and K is the number of keypoints per instance.
+        """
+        device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu")
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device)
+        assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape
+        self.tensor = keypoints
+
+    def __len__(self) -> int:
+        return self.tensor.size(0)
+
+    def to(self, *args: Any, **kwargs: Any) -> "Keypoints":
+        return type(self)(self.tensor.to(*args, **kwargs))
+
+    def to_heatmap(
+        self, boxes: torch.Tensor, heatmap_size: int
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Arguments:
+            boxes: Nx4 tensor, the boxes to draw the keypoints to
+            heatmap_size: integer side length of the square heatmap
+        Returns:
+            heatmaps: A tensor of shape (N, K) containing an integer spatial label
+                in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+            valid: A tensor of shape (N, K) containing whether each keypoint is in the roi or not.
+        """
+        return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size)
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints":
+        """
+        Create a new `Keypoints` by indexing on this `Keypoints`.
+
+        The following usage are allowed:
+        1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance.
+        2. `new_kpts = kpts[2:10]`: return a slice of key points.
+        3. `new_kpts = kpts[vector]`, where vector is a torch.BoolTensor
+           with `length = len(kpts)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned Keypoints might share storage with this Keypoints,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            # re-add the instance dim on the tensor itself: keeps the device and avoids a copy
+            return Keypoints(self.tensor[item].unsqueeze(0))
+        return Keypoints(self.tensor[item])
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.tensor))
+        return s
+
+
+# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop)
+def _keypoints_to_heatmap(
+    keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space.
+
+    Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the
+    closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the
+    continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"):
+    d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+
+    Arguments:
+        keypoints: tensor of keypoint locations in of shape (N, K, 3).
+        rois: Nx4 tensor of rois in xyxy format
+        heatmap_size: integer side length of square heatmap.
+
+    Returns:
+        heatmaps: A tensor of shape (N, K) containing an integer spatial label
+            in the range [0, heatmap_size**2 - 1] for each keypoint in the input.
+        valid: A tensor of shape (N, K) that is 1 where the keypoint is labeled (v > 0)
+            and falls inside the roi, and 0 otherwise.
+
+    When `rois` is empty, two empty long tensors are returned.
+    """
+
+    if rois.numel() == 0:
+        return rois.new().long(), rois.new().long()
+
+    # roi corners as (N, 1) columns so that they broadcast over the K keypoints
+    left = rois[:, 0][:, None]
+    top = rois[:, 1][:, None]
+    right = rois[:, 2][:, None]
+    bottom = rois[:, 3][:, None]
+    x_scale = heatmap_size / (right - left)
+    y_scale = heatmap_size / (bottom - top)
+
+    kp_x = keypoints[..., 0]
+    kp_y = keypoints[..., 1]
+
+    # a keypoint lying exactly on the far edge would floor to `heatmap_size`;
+    # remember those so that they can be put into the last bin instead
+    on_right_edge = kp_x == right
+    on_bottom_edge = kp_y == bottom
+
+    col = ((kp_x - left) * x_scale).floor().long()
+    row = ((kp_y - top) * y_scale).floor().long()
+
+    col[on_right_edge] = heatmap_size - 1
+    row[on_bottom_edge] = heatmap_size - 1
+
+    inside = (col >= 0) & (row >= 0) & (col < heatmap_size) & (row < heatmap_size)
+    labeled = keypoints[..., 2] > 0
+    valid = (inside & labeled).long()
+
+    # flatten (row, col) into a single index; invalid keypoints get label 0
+    heatmaps = (row * heatmap_size + col) * valid
+    return heatmaps, valid
+
+
+@torch.no_grad()
+def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
+    """
+    Extract predicted keypoint locations from heatmaps.
+
+    Args:
+        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W)
+        rois (Tensor): (#ROIs, 4)
+
+    Returns:
+        Tensor of shape (#rois, #keypoints, 4); the last dimension holds
+        (x, y, logit, prob) for each keypoint.
+
+    Converts a discrete image coordinate in an NxN image to a continuous keypoint coordinate. We
+    maintain consistency with keypoints_to_heatmap by using the conversion from Heckbert 1990:
+    c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
+    """
+    roi_x0 = rois[:, 0]
+    roi_y0 = rois[:, 1]
+
+    roi_w = (rois[:, 2] - rois[:, 0]).clamp(min=1)
+    roi_h = (rois[:, 3] - rois[:, 1]).clamp(min=1)
+    map_w = roi_w.ceil()
+    map_h = roi_h.ceil()
+
+    # the resized map has an integer size; these ratios map its pixels back onto the roi
+    x_ratio = roi_w / map_w
+    y_ratio = roi_h / map_h
+
+    num_rois, num_keypoints = maps.shape[:2]
+    result = maps.new_zeros(rois.shape[0], num_keypoints, 4)
+    kp_range = torch.arange(num_keypoints, device=maps.device)
+
+    for i in range(num_rois):
+        target_size = (int(map_h[i]), int(map_w[i]))
+        logits = interpolate(
+            maps[[i]], size=target_size, mode="bicubic", align_corners=False
+        ).squeeze(0)  # #keypoints x H x W
+
+        # softmax over the spatial region
+        peak, _ = logits.view(num_keypoints, -1).max(1)
+        peak = peak.view(num_keypoints, 1, 1)
+        exp_full = (logits - peak).exp_()
+        exp_pool = (maps[i] - peak).exp_()
+        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W
+        # So that the scores of objects of different absolute sizes will be more comparable
+        probs = exp_full / exp_pool.sum((1, 2), keepdim=True)
+
+        # locate the highest logit of each keypoint and unravel the flat index
+        map_width = logits.shape[2]
+        flat_pos = logits.view(num_keypoints, -1).argmax(1)
+        x_int = flat_pos % map_width
+        y_int = (flat_pos - x_int) // map_width
+
+        assert (
+            probs[kp_range, y_int, x_int]
+            == probs.view(num_keypoints, -1).max(1)[0]
+        ).all()
+
+        # pixel centre (d + 0.5) rescaled to roi units, then shifted to image coordinates
+        result[i, :, 0] = (x_int.float() + 0.5) * x_ratio[i] + roi_x0[i]
+        result[i, :, 1] = (y_int.float() + 0.5) * y_ratio[i] + roi_y0[i]
+        result[i, :, 2] = logits[kp_range, y_int, x_int]
+        result[i, :, 3] = probs[kp_range, y_int, x_int]
+
+    return result
diff --git a/detectron2/structures/masks.py b/detectron2/structures/masks.py
new file mode 100644
index 0000000..2cb0fd9
--- /dev/null
+++ b/detectron2/structures/masks.py
@@ -0,0 +1,370 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import numpy as np
+from typing import Any, Iterator, List, Union
+import pycocotools.mask as mask_utils
+import torch
+
+from detectron2.layers.roi_align import ROIAlign
+
+from .boxes import Boxes
+
+
+def polygon_area(x, y):
+    """Area of the polygon whose vertices are (x[i], y[i]), using the shoelace formula."""
+    # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
+
+
+def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray:
+    """
+    Rasterize the polygons into a single (merged) binary mask using the COCO API.
+
+    Args:
+        polygons (list[ndarray]): each array has shape (Nx2,)
+        height, width (int)
+    Returns:
+        ndarray: a bool mask of shape (height, width)
+    """
+    assert len(polygons) > 0, "COCOAPI does not support empty polygons"
+    rle = mask_utils.merge(mask_utils.frPyObjects(polygons, height, width))
+    return mask_utils.decode(rle).astype(bool)  # `np.bool` was removed in NumPy 1.24
+
+
+def rasterize_polygons_within_box(
+    polygons: List[np.ndarray], box: np.ndarray, mask_size: int
+) -> torch.Tensor:
+    """
+    Rasterize the polygons into a mask image and
+    crop the mask content in the given box.
+    The cropped mask is resized to (mask_size, mask_size).
+
+    This function is used when generating training targets for mask head in Mask R-CNN.
+    Given original ground-truth masks for an image, new ground-truth mask
+    training targets in the size of `mask_size x mask_size`
+    must be provided for each predicted box. This function will be called to
+    produce such targets.
+
+    Args:
+        polygons (list[ndarray[float]]): a list of polygons, which represents an instance.
+        box: 4-element numpy array, read as (x0, y0, x1, y1)
+        mask_size (int): side length of the square output mask
+
+    Returns:
+        Tensor: BoolTensor of shape (mask_size, mask_size)
+    """
+    # 1. Shift the polygons w.r.t the boxes
+    w, h = box[2] - box[0], box[3] - box[1]
+    # Copy first: the shifting/scaling below edits the arrays in place.
+    polygons = copy.deepcopy(polygons)
+    for p in polygons:
+        p[0::2] = p[0::2] - box[0]
+        p[1::2] = p[1::2] - box[1]
+
+    # 2. Rescale the polygons to the new box size; sides below 0.1 are treated as 0.1
+    ratio_h = mask_size / max(h, 0.1)
+    ratio_w = mask_size / max(w, 0.1)
+
+    if ratio_h == ratio_w:
+        for p in polygons:
+            p *= ratio_h
+    else:
+        for p in polygons:
+            p[0::2] *= ratio_w
+            p[1::2] *= ratio_h
+
+    # 3. Rasterize the polygons with coco api
+    mask = polygons_to_bitmask(polygons, mask_size, mask_size)
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class BitMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in
+    the form of bitmaps.
+
+    Attributes:
+        tensor: bool Tensor of N,H,W, representing N instances in the image.
+    """
+
+    def __init__(self, tensor: Union[torch.Tensor, np.ndarray]):
+        """
+        Args:
+            tensor: bool Tensor of N,H,W, representing N instances in the image.
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.bool, device=device)
+        assert tensor.dim() == 3, tensor.size()
+        self.image_size = tensor.shape[1:]
+        self.tensor = tensor
+
+    def to(self, device: str) -> "BitMasks":
+        """Return a new BitMasks wrapping `self.tensor.to(device)`."""
+        return BitMasks(self.tensor.to(device))
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks":
+        """
+        Returns:
+            BitMasks: Create a new :class:`BitMasks` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask.
+        2. `new_masks = masks[2:10]`: return a slice of masks.
+        3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor
+           with `length = len(masks)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned object might share storage with this object,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return BitMasks(self.tensor[item].unsqueeze(0))  # (1, H, W), not flattened
+        m = self.tensor[item]
+        assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format(
+            item, m.shape
+        )
+        return BitMasks(m)
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        yield from self.tensor
+
+    def __repr__(self) -> str:
+        return "{}(num_instances={})".format(self.__class__.__name__, len(self.tensor))
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor: a BoolTensor which represents
+                whether each mask is empty (False) or non-empty (True).
+        """
+        return self.tensor.flatten(1).any(dim=1)
+
+    @staticmethod
+    def from_polygon_masks(
+        polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int
+    ) -> "BitMasks":
+        """
+        Args:
+            polygon_masks (list[list[ndarray]] or PolygonMasks)
+            height, width (int)
+        """
+        if isinstance(polygon_masks, PolygonMasks):
+            polygon_masks = polygon_masks.polygons
+        masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks]
+        if not masks:  # torch.stack() rejects an empty list, so build the 0xHxW result directly
+            return BitMasks(torch.empty(0, height, width, dtype=torch.bool))
+        return BitMasks(torch.stack([torch.from_numpy(x) for x in masks]))
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each bitmask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+        It has less reconstruction error compared to rasterization with polygons.
+        However we observe no difference in accuracy,
+        but BitMasks requires more memory to store all the masks.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor:
+                A bool tensor of shape (N, mask_size, mask_size), where
+                N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+        device = self.tensor.device
+
+        batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None]
+        rois = torch.cat([batch_inds, boxes], dim=1)  # Nx5
+
+        bit_masks = self.tensor.to(dtype=torch.float32)
+        rois = rois.to(device=device)
+        output = (
+            ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True)
+            .forward(bit_masks[:, None, :, :], rois)
+            .squeeze(1)
+        )
+        return output >= 0.5
+
+    def get_bounding_boxes(self) -> None:
+        # not needed now
+        raise NotImplementedError
+
+
+class PolygonMasks:
+    """
+    This class stores the segmentation masks for all objects in one image, in the form of polygons.
+
+    Attributes:
+        polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon.
+    """
+
+    def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]):
+        """
+        Arguments:
+            polygons (list[list[Tensor[float] or ndarray]]): The first
+                level of the list correspond to individual instances,
+                the second level to all the polygons that compose the
+                instance, and the third level to the polygon coordinates.
+                The third level Tensor should have the format of
+                torch.Tensor([x0, y0, x1, y1, ..., xn, yn]) (n >= 3).
+        """
+        assert isinstance(polygons, list)
+
+        def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
+            # Use float64 for higher precision, because why not?
+            # Always put polygons on CPU (self.to is a no-op) since they
+            # are supposed to be small tensors.
+            # May need to change this assumption if GPU placement becomes useful
+            if isinstance(t, torch.Tensor):
+                t = t.cpu().numpy()
+            return np.asarray(t).astype("float64")
+
+        def process_polygons(
+            polygons_per_instance: List[Union[torch.Tensor, np.ndarray]]
+        ) -> List[np.ndarray]:
+            assert isinstance(polygons_per_instance, list), type(polygons_per_instance)
+            # transform each polygon to a float64 ndarray
+            polygons_per_instance = [_make_array(p) for p in polygons_per_instance]
+            for polygon in polygons_per_instance:
+                assert len(polygon) % 2 == 0 and len(polygon) >= 6
+            return polygons_per_instance
+
+        self.polygons: List[List[np.ndarray]] = [
+            process_polygons(polygons_per_instance) for polygons_per_instance in polygons
+        ]
+
+    def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks":
+        return self
+
+    def get_bounding_boxes(self) -> Boxes:
+        """
+        Returns:
+            Boxes: tight bounding boxes around polygon masks.
+        """
+        boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32)
+        for idx, polygons_per_instance in enumerate(self.polygons):
+            minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32)
+            maxxy = torch.zeros(2, dtype=torch.float32)  # assumes non-negative coordinates
+            for polygon in polygons_per_instance:
+                coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32)
+                minxy = torch.min(minxy, torch.min(coords, dim=0).values)
+                maxxy = torch.max(maxxy, torch.max(coords, dim=0).values)
+            boxes[idx, :2] = minxy
+            boxes[idx, 2:] = maxxy
+        return Boxes(boxes)
+
+    def nonempty(self) -> torch.Tensor:
+        """
+        Find masks that are non-empty.
+
+        Returns:
+            Tensor:
+                a BoolTensor which represents whether each mask is empty (False) or not (True).
+        """
+        keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons]
+        return torch.as_tensor(keep, dtype=torch.bool)
+
+    def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks":
+        """
+        Support indexing over the instances and return a `PolygonMasks` object.
+        `item` can be:
+
+        1. An integer. It will return an object with only one instance.
+        2. A slice. It will return an object with the selected instances.
+        3. A list[int]. It will return an object with the selected instances,
+           corresponding to the indices in the list.
+        4. A vector mask of type BoolTensor, whose length is num_instances.
+           It will return an object with the instances whose mask is nonzero.
+        """
+        if isinstance(item, int):
+            selected_polygons = [self.polygons[item]]
+        elif isinstance(item, slice):
+            selected_polygons = self.polygons[item]
+        elif isinstance(item, list):
+            selected_polygons = [self.polygons[i] for i in item]
+        elif isinstance(item, torch.Tensor):
+            # Polygons is a list, so we have to move the indices back to CPU.
+            if item.dtype == torch.bool:
+                assert item.dim() == 1, item.shape
+                item = item.nonzero().squeeze(1).cpu().numpy().tolist()
+            elif item.dtype in [torch.int32, torch.int64]:
+                item = item.cpu().numpy().tolist()
+            else:
+                raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype))
+            selected_polygons = [self.polygons[i] for i in item]
+        else:
+            raise TypeError("Unsupported index type {} for PolygonMasks!".format(type(item)))
+        return PolygonMasks(selected_polygons)
+
+    def __iter__(self) -> Iterator[List[np.ndarray]]:
+        """
+        Yields:
+            list[ndarray]: the polygons for one instance.
+            Each ndarray is a float64 vector representing a polygon.
+        """
+        return iter(self.polygons)
+
+    def __repr__(self) -> str:
+        s = self.__class__.__name__ + "("
+        s += "num_instances={})".format(len(self.polygons))
+        return s
+
+    def __len__(self) -> int:
+        return len(self.polygons)
+
+    def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor:
+        """
+        Crop each mask by the given box, and resize results to (mask_size, mask_size).
+        This can be used to prepare training targets for Mask R-CNN.
+
+        Args:
+            boxes (Tensor): Nx4 tensor storing the boxes for each mask
+            mask_size (int): the size of the rasterized mask.
+
+        Returns:
+            Tensor: A bool tensor of shape (N, mask_size, mask_size), where
+            N is the number of predicted boxes for this image.
+        """
+        assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self))
+
+        device = boxes.device
+        # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise
+        # (several small tensors for representing a single instance mask)
+        boxes = boxes.to(torch.device("cpu"))
+
+        # poly: list[ndarray], the polygons for one instance
+        # box: a tensor of shape (4,)
+        results = [
+            rasterize_polygons_within_box(poly, box.numpy(), mask_size)
+            for poly, box in zip(self.polygons, boxes)
+        ]
+        if len(results) == 0:
+            return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device)
+        return torch.stack(results, dim=0).to(device=device)
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes area of the mask.
+        Only works with Polygons, using the shoelace formula:
+        https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
+
+        Returns:
+            Tensor: a vector, area for each instance
+        """
+
+        area = []
+        for polygons_per_instance in self.polygons:
+            area_per_instance = 0
+            for p in polygons_per_instance:
+                area_per_instance += polygon_area(p[0::2], p[1::2])
+            area.append(area_per_instance)
+
+        return torch.tensor(area)
diff --git a/detectron2/structures/rotated_boxes.py b/detectron2/structures/rotated_boxes.py
new file mode 100644
index 0000000..689aefe
--- /dev/null
+++ b/detectron2/structures/rotated_boxes.py
@@ -0,0 +1,499 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import math
+from typing import Iterator, List, Union
+import torch
+
+from detectron2.layers import cat
+from detectron2.layers.rotated_boxes import pairwise_iou_rotated
+
+from .boxes import Boxes
+
+
+class RotatedBoxes(Boxes):
+    """
+    This structure stores a list of rotated boxes as a Nx5 torch.Tensor.
+    It supports some common methods about boxes
+    (`area`, `clip`, `nonempty`, etc),
+    and also behaves like a Tensor
+    (support indexing, `to(device)`, `.device`, and iteration over all boxes)
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        """
+        Args:
+            tensor (Tensor[float]): a Nx5 matrix.  Each row is
+                (x_center, y_center, width, height, angle),
+                in which angle is represented in degrees.
+                While there's no strict range restriction for it,
+                the recommended principal range is between (-180, 180] degrees.
+
+        Assume we have a horizontal box B = (x_center, y_center, width, height),
+        where width is along the x-axis and height is along the y-axis.
+        The rotated box B_rot (x_center, y_center, width, height, angle)
+        can be seen as:
+
+        1. When angle == 0:
+           B_rot == B
+        2. When angle > 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW;
+        3. When angle < 0:
+           B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW.
+
+        Mathematically, since the right-handed coordinate system for image space
+        is (y, x), where y is top->down and x is left->right, the 4 vertices of the
+        rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from
+        the vertices of the horizontal rectangle (y_i, x_i) (i = 1, 2, 3, 4)
+        in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians,
+        (y_c, x_c) is the center of the rectangle):
+
+        .. math::
+
+            yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c,
+
+            xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c,
+
+        which is the standard rigid-body rotation transformation.
+
+        Intuitively, the angle is
+        (1) the rotation angle from y-axis in image space
+        to the height vector (top->down in the box's local coordinate system)
+        of the box in CCW, and
+        (2) the rotation angle from x-axis in image space
+        to the width vector (left->right in the box's local coordinate system)
+        of the box in CCW.
+
+        More intuitively, consider the following horizontal box ABCD represented
+        in (x1, y1, x2, y2): (3, 2, 7, 4),
+        covering the [3, 7] x [2, 4] region of the continuous coordinate system
+        which looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  |   |
+            |  D---C
+            |
+            v y
+
+        Note that each capital letter represents one 0-dimensional geometric point
+        instead of a 'square pixel' here.
+
+        In the example above, using (x, y) to represent a point we have:
+
+        .. math::
+
+            O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4)
+
+        We name vector AB = vector DC as the width vector in box's local coordinate system, and
+        vector AD = vector BC as the height vector in box's local coordinate system. Initially,
+        when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis
+        in the image space, respectively.
+
+        For better illustration, we denote the center of the box as E,
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  A---B
+            |  | E |
+            |  D---C
+            |
+            v y
+
+        where the center E = ((3+7)/2, (2+4)/2) = (5, 3).
+
+        Also,
+
+        .. math::
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Therefore, the corresponding representation for the same shape in rotated box in
+        (x_center, y_center, width, height, angle) format is:
+
+        (5, 3, 4, 2, 0),
+
+        Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees
+        CCW (counter-clockwise) by definition. It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   B-C
+            |   | |
+            |   |E|
+            |   | |
+            |   A-D
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CCW with regard to E:
+        A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5)
+
+        Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to
+        vector AD or vector BC (the top->down height vector in box's local coordinate system),
+        or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right
+        width vector in box's local coordinate system).
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise)
+        by definition? It looks like this:
+
+        .. code:: none
+
+            O--------> x
+            |   D-A
+            |   | |
+            |   |E|
+            |   | |
+            |   C-B
+            v y
+
+        The center E is still located at the same point (5, 3), while the vertices
+        ABCD are rotated by 90 degrees CW with regard to E:
+        A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1)
+
+        .. math::
+
+            width = |AB| = |CD| = 5 - 1 = 4,
+            height = |AD| = |BC| = 6 - 4 = 2.
+
+        This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU
+        will be 1. However, these two will generate different RoI Pooling results and
+        should not be treated as an identical box.
+
+        On the other hand, it's easy to see that (X, Y, W, H, A) is identical to
+        (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be
+        identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is
+        equivalent to rotating the same shape 90 degrees CW.
+
+        We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180):
+
+        .. code:: none
+
+            O--------> x
+            |
+            |  C---D
+            |  | E |
+            |  B---A
+            |
+            v y
+
+        .. math::
+
+            A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2),
+
+            width = |AB| = |CD| = 7 - 3 = 4,
+            height = |AD| = |BC| = 4 - 2 = 2.
+
+        Finally, this is a very inaccurate (heavily quantized) illustration of
+        what (5, 3, 4, 2, 60) looks like in case anyone wonders:
+
+        .. code:: none
+
+            O--------> x
+            |     B\\
+            |    /  C
+            |   /E /
+            |  A  /
+            |   `D
+            v y
+
+        It's still a rectangle with center of (5, 3), width of 4 and height of 2,
+        but its angle (and thus orientation) is somewhere between
+        (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90).
+        """
+        device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu")
+        tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
+        if tensor.numel() == 0:
+            tensor = torch.zeros(0, 5, dtype=torch.float32, device=device)
+        assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size()
+
+        self.tensor = tensor
+
+    def clone(self) -> "RotatedBoxes":
+        """
+        Clone the RotatedBoxes.
+
+        Returns:
+            RotatedBoxes
+        """
+        return RotatedBoxes(self.tensor.clone())
+
+    def to(self, device: str) -> "RotatedBoxes":
+        return RotatedBoxes(self.tensor.to(device))
+
+    def area(self) -> torch.Tensor:
+        """
+        Computes the area of all the boxes.
+
+        Returns:
+            torch.Tensor: a vector with areas of each box.
+        """
+        box = self.tensor
+        area = box[:, 2] * box[:, 3]
+        return area
+
+    def normalize_angles(self) -> None:
+        """
+        Restrict angles (in place) to the range of (-180, 180] degrees
+        """
+        self.tensor[:, 4] = self.tensor[:, 4] % 360
+        self.tensor[:, 4][torch.where(self.tensor[:, 4] > 180)] -= 360
+
+    def clip(self, box_size: Boxes.BoxSizeType, clip_angle_threshold: float = 1.0) -> None:
+        """
+        Clip (in place) the boxes by limiting x coordinates to the range [0, width]
+        and y coordinates to the range [0, height].
+
+        For RRPN:
+        Only clip boxes that are almost horizontal with a tolerance of
+        clip_angle_threshold to maintain backward compatibility.
+
+        Rotated boxes beyond this threshold are not clipped for two reasons:
+
+        1. There are potentially multiple ways to clip a rotated box to make it
+           fit within the image.
+        2. It's tricky to make the entire rectangular box fit within the image
+           and still be able to not leave out pixels of interest.
+
+        Therefore we rely on ops like RoIAlignRotated to safely handle this.
+
+        Args:
+            box_size (height, width): The clipping box's size.
+            clip_angle_threshold:
+                Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees),
+                we do the clipping as horizontal boxes.
+        """
+        h, w = box_size
+
+        # normalize angles to be within (-180, 180] degrees
+        self.normalize_angles()
+
+        idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0]
+
+        # convert to (x1, y1, x2, y2)
+        x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0
+        y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0
+        x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0
+        y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0
+
+        # clip
+        x1.clamp_(min=0, max=w)
+        y1.clamp_(min=0, max=h)
+        x2.clamp_(min=0, max=w)
+        y2.clamp_(min=0, max=h)
+
+        # convert back to (xc, yc, w, h)
+        self.tensor[idx, 0] = (x1 + x2) / 2.0
+        self.tensor[idx, 1] = (y1 + y2) / 2.0
+        # make sure widths and heights do not increase due to numerical errors
+        self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1)
+        self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1)
+
+    def nonempty(self, threshold: int = 0) -> torch.Tensor:
+        """
+        Find boxes that are non-empty.
+        A box is considered empty, if either of its sides is no larger than threshold.
+
+        Returns:
+            Tensor: a binary vector which represents
+            whether each box is empty (False) or non-empty (True).
+        """
+        box = self.tensor
+        widths = box[:, 2]
+        heights = box[:, 3]
+        keep = (widths > threshold) & (heights > threshold)
+        return keep
+
+    def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "RotatedBoxes":
+        """
+        Returns:
+            RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing.
+
+        The following usage are allowed:
+
+        1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box.
+        2. `new_boxes = boxes[2:10]`: return a slice of boxes.
+        3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor
+           with `length = len(boxes)`. Nonzero elements in the vector will be selected.
+
+        Note that the returned RotatedBoxes might share storage with this RotatedBoxes,
+        subject to Pytorch's indexing semantics.
+        """
+        if isinstance(item, int):
+            return RotatedBoxes(self.tensor[item].view(1, -1))
+        b = self.tensor[item]
+        assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format(
+            item
+        )
+        return RotatedBoxes(b)
+
+    def __len__(self) -> int:
+        return self.tensor.shape[0]
+
+    def __repr__(self) -> str:
+        return "RotatedBoxes(" + str(self.tensor) + ")"
+
+    def inside_box(self, box_size: Boxes.BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor:
+        """
+        Args:
+            box_size (height, width): Size of the reference box covering
+                [0, width] x [0, height]
+            boundary_threshold (int): Boxes that extend beyond the reference box
+                boundary by more than boundary_threshold are considered "outside".
+
+        For RRPN, it might not be necessary to call this function since it's common
+        for rotated box to extend to outside of the image boundaries
+        (the clip function only clips the near-horizontal boxes)
+
+        Returns:
+            a binary vector, indicating whether each box is inside the reference box.
+        """
+        height, width = box_size
+
+        cnt_x = self.tensor[..., 0]
+        cnt_y = self.tensor[..., 1]
+        half_w = self.tensor[..., 2] / 2.0
+        half_h = self.tensor[..., 3] / 2.0
+        a = self.tensor[..., 4]
+        c = torch.abs(torch.cos(a * math.pi / 180.0))
+        s = torch.abs(torch.sin(a * math.pi / 180.0))
+        # This basically computes the horizontal bounding rectangle of the rotated box
+        max_rect_dx = c * half_w + s * half_h
+        max_rect_dy = c * half_h + s * half_w
+
+        inds_inside = (
+            (cnt_x - max_rect_dx >= -boundary_threshold)
+            & (cnt_y - max_rect_dy >= -boundary_threshold)
+            & (cnt_x + max_rect_dx < width + boundary_threshold)
+            & (cnt_y + max_rect_dy < height + boundary_threshold)
+        )
+
+        return inds_inside
+
+    def get_centers(self) -> torch.Tensor:
+        """
+        Returns:
+            The box centers in a Nx2 array of (x, y).
+        """
+        return self.tensor[:, :2]
+
+    def scale(self, scale_x: float, scale_y: float) -> None:
+        """
+        Scale the rotated box with horizontal and vertical scaling factors
+        Note: when scale_factor_x != scale_factor_y,
+        the rotated box does not preserve the rectangular shape when the angle
+        is not a multiple of 90 degrees under resize transformation.
+        Instead, the shape is a parallelogram (that has skew)
+        Here we make an approximation by fitting a rotated rectangle to the parallelogram.
+        """
+        self.tensor[:, 0] *= scale_x
+        self.tensor[:, 1] *= scale_y
+        theta = self.tensor[:, 4] * math.pi / 180.0
+        c = torch.cos(theta)
+        s = torch.sin(theta)
+
+        # In image space, y is top->down and x is left->right
+        # Consider the local coordinate system for the rotated box,
+        # where the box center is located at (0, 0), and the four vertices ABCD are
+        # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2)
+        # the midpoint of the left edge AD of the rotated box E is:
+        # E = (A+D)/2 = (-w / 2, 0)
+        # the midpoint of the top edge AB of the rotated box F is:
+        # F(0, -h / 2)
+        # To get the old coordinates in the global system, apply the rotation transformation
+        # (Note: the right-handed coordinate system for image space is yOx):
+        # (old_x, old_y) = (s * y + c * x, c * y - s * x)
+        # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2)
+        # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2)
+        # After applying the scaling factor (sfx, sfy):
+        # E(new) = (-sfx * c * w / 2, sfy * s * w / 2)
+        # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2)
+        # The new width after scaling transformation becomes:
+
+        # w(new) = |E(new) - O| * 2
+        #        = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2
+        #        = sqrt[(sfx * c)^2 + (sfy * s)^2] * w
+        # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y
+        self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2)
+
+        # h(new) = |F(new) - O| * 2
+        #        = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2
+        #        = sqrt[(sfx * s)^2 + (sfy * c)^2] * h
+        # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2]
+        #
+        # For example,
+        # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y;
+        # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x
+        self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2)
+
+        # The angle is the rotation angle from y-axis in image space to the height
+        # vector (top->down in the box's local coordinate system) of the box in CCW.
+        #
+        # angle(new) = angle_yOx(O - F(new))
+        #            = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) )
+        #            = atan2(sfx * s * h / 2, sfy * c * h / 2)
+        #            = atan2(sfx * s, sfy * c)
+        #
+        # For example,
+        # when sfx == sfy, angle(new) == atan2(s, c) == angle(old)
+        self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi
+
+    @staticmethod
+    def cat(boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes":  # type: ignore
+        """
+        Concatenates a list of RotatedBoxes into a single RotatedBoxes
+
+        Arguments:
+            boxes_list (list[RotatedBoxes])
+
+        Returns:
+            RotatedBoxes: the concatenated RotatedBoxes
+        """
+        assert isinstance(boxes_list, (list, tuple))
+        assert len(boxes_list) > 0
+        assert all(isinstance(box, RotatedBoxes) for box in boxes_list)
+
+        cat_boxes = type(boxes_list[0])(cat([b.tensor for b in boxes_list], dim=0))
+        return cat_boxes
+
+    @property
+    def device(self) -> torch.device:
+        return self.tensor.device
+
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        """
+        Yield a box as a Tensor of shape (5,) at a time.
+        """
+        yield from self.tensor
+
+
+def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> torch.Tensor:
+    """
+    Given two lists of rotated boxes of size N and M,
+    compute the IoU (intersection over union)
+    between __all__ N x M pairs of boxes.
+    The box order must be (x_center, y_center, width, height, angle).
+
+    Args:
+        boxes1, boxes2 (RotatedBoxes):
+            two `RotatedBoxes`. Contains N & M rotated boxes, respectively.
+
+    Returns:
+        Tensor: IoU, sized [N,M].
+    """
+
+    return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor)
diff --git a/detectron2/utils/README.md b/detectron2/utils/README.md
new file mode 100644
index 0000000..9765b24
--- /dev/null
+++ b/detectron2/utils/README.md
@@ -0,0 +1,5 @@
+# Utility functions
+
+This folder contains utility functions that are not used in the
+core library, but are useful for building models or training
+code using the config system.
diff --git a/detectron2/utils/__init__.py b/detectron2/utils/__init__.py
new file mode 100644
index 0000000..168f997
--- /dev/null
+++ b/detectron2/utils/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/detectron2/utils/collect_env.py b/detectron2/utils/collect_env.py
new file mode 100644
index 0000000..5e3b0e4
--- /dev/null
+++ b/detectron2/utils/collect_env.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import os
+import subprocess
+import sys
+from collections import defaultdict
+import PIL
+import torch
+import torchvision
+from tabulate import tabulate
+
+__all__ = ["collect_env_info"]
+
+
+def collect_torch_env():
+    """Return a string describing the PyTorch build/environment."""
+    try:
+        import torch.__config__ as torch_config
+
+        return torch_config.show()
+    except ImportError:
+        # Fallback for older PyTorch releases without `torch.__config__`.
+        from torch.utils.collect_env import get_pretty_env_info as pretty_env
+        return pretty_env()
+
+
+def get_env_module():
+    """Return the pair (variable name, value); value is "<not set>" if unset."""
+    return "DETECTRON2_ENV_MODULE", os.environ.get("DETECTRON2_ENV_MODULE", "<not set>")
+
+
+def collect_env_info():
+    """Return a printable report of library versions and CUDA/GPU details."""
+    data = [
+        ("sys.platform", sys.platform),
+        ("Python", sys.version.replace("\n", "")),
+        ("Numpy", np.__version__),
+    ]
+    try:
+        from detectron2 import _C
+    except ImportError:
+        data.append(("detectron2._C", "failed to import"))
+    else:
+        data.append(("Detectron2 Compiler", _C.get_compiler_version()))
+        data.append(("Detectron2 CUDA Compiler", _C.get_cuda_version()))
+
+    data.append(get_env_module())
+    data.append(("PyTorch", torch.__version__))
+    data.append(("PyTorch Debug Build", torch.version.debug))
+    # `__version__` may be missing from torchvision; report "unknown" in that case
+    data.append(("torchvision", getattr(torchvision, "__version__", "unknown")))
+
+    has_cuda = torch.cuda.is_available()
+    data.append(("CUDA available", has_cuda))
+    if has_cuda:
+        # group device indices by device name so identical GPUs share one row
+        devices = defaultdict(list)
+        for k in range(torch.cuda.device_count()):
+            devices[torch.cuda.get_device_name(k)].append(str(k))
+        for name, devids in devices.items():
+            data.append(("GPU " + ",".join(devids), name))
+
+        from torch.utils.cpp_extension import CUDA_HOME
+
+        data.append(("CUDA_HOME", str(CUDA_HOME)))
+        if CUDA_HOME is not None and os.path.isdir(CUDA_HOME):
+            try:
+                # Run nvcc directly (no shell, no `| tail`): a pipeline reports only
+                # the exit status of `tail`, so a missing nvcc was never detected.
+                nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
+                nvcc = subprocess.check_output([nvcc, "-V"]).decode("utf-8")
+                nvcc = nvcc.strip().split("\n")[-1]  # the line `tail -n1` used to pick
+            except (OSError, subprocess.SubprocessError):
+                nvcc = "Not Available"
+            data.append(("NVCC", nvcc))
+
+        cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+        if cuda_arch_list:
+            data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
+    data.append(("Pillow", PIL.__version__))
+
+    try:
+        import cv2
+
+        data.append(("cv2", cv2.__version__))
+    except ImportError:
+        pass  # cv2 is optional; omit its row when it is not installed
+    return tabulate(data) + "\n" + collect_torch_env()
+
+
+if __name__ == "__main__":  # when run as a script, print the collected report
+    print(collect_env_info())
diff --git a/detectron2/utils/colormap.py b/detectron2/utils/colormap.py
new file mode 100644
index 0000000..1bf1455
--- /dev/null
+++ b/detectron2/utils/colormap.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+"""
+An awesome colormap for really neat visualizations.
+Copied from Detectron, and removed gray colors.
+"""
+
+import numpy as np
+
+__all__ = ["colormap", "random_color"]
+
+# fmt: off
+# RGB triplets with values in [0, 1]; the flat list is reshaped to (N, 3) below.
+_COLORS = np.array(
+    [
+        0.000, 0.447, 0.741,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.857, 0.857, 0.857,
+        1.000, 1.000, 1.000
+    ]
+).astype(np.float32).reshape(-1, 3)
+# fmt: on
+
+
+def colormap(rgb=False, maximum=255):
+    """
+    Return the whole color table, scaled by `maximum`.
+
+    Args:
+        rgb (bool): if True keep RGB channel order, otherwise reverse it to BGR.
+        maximum (int): either 255 or 1; anything else fails the assertion.
+
+    Returns:
+        ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
+    """
+    assert maximum in [255, 1], maximum
+    scaled = _COLORS * maximum
+    return scaled if rgb else scaled[:, ::-1]
+
+
+def random_color(rgb=False, maximum=255):
+    """
+    Pick one color of the table at random (via `np.random.randint`).
+
+    Args:
+        rgb (bool): if True keep RGB channel order, otherwise reverse it to BGR.
+        maximum (int): either 255 or 1
+
+    Returns:
+        ndarray: a vector of 3 numbers
+    """
+    row = np.random.randint(0, len(_COLORS))
+    color = _COLORS[row] * maximum
+    return color if rgb else color[::-1]
+
+
+if __name__ == "__main__":
+    import cv2
+
+    # Visual check: paint every color as a `size` x `size` tile on an H x W grid.
+    size = 100
+    H, W = 10, 10
+    canvas = np.random.rand(H * size, W * size, 3).astype("float32")  # noise background
+    for idx, color in enumerate(_COLORS[: H * W]):
+        # tiles are filled row by row; tiles beyond the last color keep the noise
+        h, w = divmod(idx, W)
+        top, left = h * size, w * size
+        canvas[top : top + size, left : left + size] = color
+    cv2.imshow("a", canvas)
+    cv2.waitKey(0)
diff --git a/detectron2/utils/comm.py b/detectron2/utils/comm.py
new file mode 100644
index 0000000..8cc7b3d
--- /dev/null
+++ b/detectron2/utils/comm.py
@@ -0,0 +1,263 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+This file contains primitives for multi-gpu communication.
+This is useful when doing distributed training.
+"""
+
+import functools
+import logging
+import numpy as np
+import pickle
+import torch
+import torch.distributed as dist
+
+_LOCAL_PROCESS_GROUP = None
+"""
+A torch process group that only includes the processes on the same machine as the current process.
+This variable is set when processes are spawned by `launch()` in "engine/launch.py".
+"""
+
+
+def get_world_size() -> int:
+    """Return the size of the default process group, or 1 when not running distributed."""
+    # `and` short-circuits: is_initialized() is only queried if dist is available
+    if dist.is_available() and dist.is_initialized():
+        return dist.get_world_size()
+    return 1
+
+
+def get_rank() -> int:
+    """Return this process's rank in the default group, or 0 when not running distributed."""
+    distributed = dist.is_available() and dist.is_initialized()
+    if not distributed:
+        return 0
+    return dist.get_rank()
+
+
+def get_local_rank() -> int:
+    """
+    Returns:
+        The rank of the current process within the local (per-machine) process group;
+        0 when torch.distributed is unavailable or not initialized.
+    """
+    if not (dist.is_available() and dist.is_initialized()):
+        return 0
+    # the local group must have been assigned to the module global beforehand
+    assert _LOCAL_PROCESS_GROUP is not None
+    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+
+
+def get_local_size() -> int:
+    """
+    Returns:
+        The size of the per-machine process group, i.e. the number of processes
+        per machine; 1 when torch.distributed is unavailable or not initialized.
+    """
+    if not (dist.is_available() and dist.is_initialized()):
+        return 1
+    # Same guard as get_local_rank(): `group=None` would silently mean the global group.
+    assert _LOCAL_PROCESS_GROUP is not None
+    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+
+
+def is_main_process() -> bool:
+    return get_rank() == 0  # the process with rank 0 is treated as the main one
+
+
+def synchronize():
+    """
+    Block until every process of the default group reaches this call (a barrier).
+
+    Does nothing when torch.distributed is unavailable, not initialized,
+    or when the world size is 1.
+    """
+    if not (dist.is_available() and dist.is_initialized()):
+        return
+    # a barrier across a single process would be pointless
+    if dist.get_world_size() == 1:
+        return
+    dist.barrier()
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a process group covering all ranks for gloo-based communication.
+    A new gloo group is created if the default backend is "nccl"; otherwise the
+    default WORLD group is returned. The result is cached by `lru_cache`.
+    """
+    if dist.get_backend() != "nccl":
+        return dist.group.WORLD
+    return dist.new_group(backend="gloo")
+
+
+def _serialize_to_tensor(data, group):
+    """Pickle `data` into a byte tensor on the device that `group`'s backend uses."""
+    backend = dist.get_backend(group)
+    assert backend in ["gloo", "nccl"]
+    # the tensor is placed on cpu for gloo and on cuda for nccl
+    device = torch.device("cpu") if backend == "gloo" else torch.device("cuda")
+    payload = pickle.dumps(data)
+    num_bytes = len(payload)
+    if num_bytes > 1024 ** 3:  # warn about payloads larger than 1 GB
+        message = "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
+            get_rank(), num_bytes / (1024 ** 3), device
+        )
+        logging.getLogger(__name__).warning(message)
+    # wrap the pickled bytes in a ByteStorage that backs the resulting uint8 tensor
+    storage = torch.ByteStorage.from_buffer(payload)
+    return torch.ByteTensor(storage).to(device=device)
+
+
+def _pad_to_largest_tensor(tensor, group):
+    """
+    Exchange tensor lengths within `group` and zero-pad the local one to the maximum.
+
+    Returns:
+        list[int]: size of the tensor, on each rank
+        Tensor: padded tensor that has the max size
+    """
+    world_size = dist.get_world_size(group=group)
+    assert (
+        world_size >= 1
+    ), "comm.gather/all_gather must be called from ranks within the given group!"
+    device = tensor.device
+    num_local = tensor.numel()
+    local_size = torch.tensor([num_local], dtype=torch.int64, device=device)
+    gathered = [torch.zeros([1], dtype=torch.int64, device=device) for _ in range(world_size)]
+    dist.all_gather(gathered, local_size, group=group)
+    size_list = [int(size.item()) for size in gathered]
+    max_size = max(size_list)
+    # torch all_gather cannot gather tensors of different shapes, so every rank
+    # appends zero bytes until its tensor is as long as the largest one
+    if num_local != max_size:
+        padding = torch.zeros((max_size - num_local,), dtype=torch.uint8, device=device)
+        tensor = torch.cat((tensor, padding), dim=0)
+    return size_list, tensor
+
+
+def all_gather(data, group=None):
+    """
+    Gather an arbitrary picklable object (not necessarily a tensor) from every rank.
+
+    Args:
+        data: any picklable object
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: list of data gathered from each rank
+            (a one-element list when only one process takes part)
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group) == 1:
+        return [data]
+
+    # pickle -> byte tensor, padded so that all ranks send the same length
+    size_list, tensor = _pad_to_largest_tensor(_serialize_to_tensor(data, group), group)
+    max_size = max(size_list)
+
+    # one receive buffer per rank, each large enough for the longest payload
+    recv_buffers = [
+        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
+    ]
+    dist.all_gather(recv_buffers, tensor, group=group)
+
+    # strip the padding of each payload before unpickling it
+    # NOTE(review): pickle.loads runs whatever the peers sent; only safe among trusted ranks
+    return [
+        pickle.loads(received.cpu().numpy().tobytes()[:size])
+        for size, received in zip(size_list, recv_buffers)
+    ]
+
+
+def gather(data, dst=0, group=None):
+    """
+    Collect an arbitrary picklable object (not necessarily a tensor) from every
+    rank onto the rank `dst`.
+
+    Args:
+        data: any picklable object
+        dst (int): destination rank
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: on dst, a list of data gathered from each rank. Otherwise,
+            an empty list.
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group=group) == 1:
+        return [data]
+
+    # pickle -> byte tensor, padded so that all ranks send the same length
+    size_list, tensor = _pad_to_largest_tensor(_serialize_to_tensor(data, group), group)
+
+    if dist.get_rank(group=group) != dst:
+        # non-destination ranks only send and receive nothing
+        dist.gather(tensor, [], dst=dst, group=group)
+        return []
+
+    # destination rank: one receive buffer per rank, sized for the longest payload
+    max_size = max(size_list)
+    recv_buffers = [
+        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list
+    ]
+    dist.gather(tensor, recv_buffers, dst=dst, group=group)
+    # strip the padding of each payload before unpickling it
+    return [
+        pickle.loads(received.cpu().numpy().tobytes()[:size])
+        for size, received in zip(size_list, recv_buffers)
+    ]
+
+
+def shared_random_seed():
+    """
+    Agree on one random seed across all workers.
+
+    Returns:
+        int: a random number that is the same across all workers; workers
+            that need a shared RNG can use it to create one.
+
+    All workers must call this function, otherwise it will deadlock.
+    """
+    candidate = np.random.randint(2 ** 31)  # each worker draws its own candidate
+    return all_gather(candidate)[0]
+
+
+def reduce_dict(input_dict, average=True):
+    """
+    Sum (or average) every value of the dictionary over all processes; the
+    reduced result lands on the process with rank 0.
+
+    Args:
+        input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor.
+        average (bool): whether to do average or sum
+
+    Returns:
+        a dict with the same keys as input_dict, after reduction. With fewer than
+        two processes, `input_dict` itself is returned unchanged.
+    """
+    world_size = get_world_size()
+    if world_size < 2:
+        return input_dict
+    with torch.no_grad():
+        # sort the keys so that they are consistent across processes
+        names = sorted(input_dict.keys())
+        # stack into one tensor so that a single reduce call covers every entry
+        values = torch.stack([input_dict[name] for name in names], dim=0)
+        dist.reduce(values, dst=0)
+        # only the destination rank holds the accumulated result, so it is the
+        # only one that divides by world_size
+        is_dst = dist.get_rank() == 0
+        if is_dst and average:
+            values /= world_size
+        # split the reduced tensor back into one entry per key
+        reduced_dict = dict(zip(names, values))
+    return reduced_dict
diff --git a/detectron2/utils/env.py b/detectron2/utils/env.py
new file mode 100644
index 0000000..a05057f
--- /dev/null
+++ b/detectron2/utils/env.py
@@ -0,0 +1,105 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import importlib
+import importlib.util
+import logging
+import numpy as np
+import os
+import random
+import sys
+from datetime import datetime
+import torch
+
+__all__ = ["seed_all_rng"]
+
+
+def seed_all_rng(seed=None):
+    """
+    Seed the random number generators of numpy, torch and python's `random`.
+
+    Args:
+        seed (int): if None, a seed is generated from the process id, the current
+            time and two bytes of `os.urandom`, and logged at INFO level.
+    """
+    if seed is None:
+        pid = os.getpid()
+        clock = int(datetime.now().strftime("%S%f"))  # seconds + microseconds digits
+        entropy = int.from_bytes(os.urandom(2), "big")
+        seed = pid + clock + entropy
+        logging.getLogger(__name__).info("Using a generated random seed {}".format(seed))
+    np.random.seed(seed)
+    # manual_seed() returns the seeded default generator; install its state globally
+    torch.set_rng_state(torch.manual_seed(seed).get_state())
+    random.seed(seed)
+
+
+# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
+def _import_file(module_name, file_path, make_importable=False):
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # runs the file's top-level code inside `module`
+    if make_importable:
+        sys.modules[module_name] = module  # so a later import of `module_name` finds it
+    return module
+
+
+def _configure_libraries():
+    """
+    Apply process-wide settings for third-party libraries (currently only cv2).
+    """
+    # DETECTRON2_DISABLE_CV2 set to a non-zero integer disables `import cv2`
+    # globally, in case importing it leads to negative performance impact
+    if int(os.environ.get("DETECTRON2_DISABLE_CV2", False)):
+        sys.modules["cv2"] = None  # a None entry makes `import cv2` raise ImportError
+        return
+
+    # Disable opencl in opencv since its interaction with cuda often has negative effects
+    # This envvar is supported after OpenCV 3.4.0
+    os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
+    try:
+        import cv2
+        major_version = int(cv2.__version__.split(".")[0])
+        if major_version >= 3:
+            cv2.ocl.setUseOpenCL(False)
+    except ImportError:
+        pass  # cv2 is not installed: nothing more to configure
+
+
+_ENV_SETUP_DONE = False  # flipped by setup_environment() so that it runs only once
+
+
+def setup_environment():
+    """Perform environment setup work, at most once per process.
+
+    Library configuration always runs. In addition, the user may name a Python
+    source file or a module in the $DETECTRON2_ENV_MODULE environment variable;
+    it is loaded and its `setup_environment()` performs custom setup work that
+    may be necessary to their computing environment.
+    """
+    global _ENV_SETUP_DONE
+    if _ENV_SETUP_DONE:
+        return
+    # mark as done up front, before running any of the setup steps
+    _ENV_SETUP_DONE = True
+
+    _configure_libraries()
+
+    # an unset or empty variable means there is no custom setup to run
+    custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE")
+    if custom_module_path:
+        setup_custom_environment(custom_module_path)
+
+
+def setup_custom_environment(custom_module):
+    """
+    Import `custom_module` (a ".py" path or a module name) and call its `setup_environment()`.
+    """
+    if custom_module.endswith(".py"):
+        module = _import_file("detectron2.utils.env.custom_module", custom_module)
+    else:
+        module = importlib.import_module(custom_module)
+    setup_fn = getattr(module, "setup_environment", None)
+    assert callable(setup_fn), (
+        "Custom environment module defined in {} does not have the "
+        "required callable attribute 'setup_environment'."
+    ).format(custom_module)
+    setup_fn()
diff --git a/detectron2/utils/events.py b/detectron2/utils/events.py
new file mode 100644
index 0000000..f7f79bd
--- /dev/null
+++ b/detectron2/utils/events.py
@@ -0,0 +1,337 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import datetime
+import json
+import logging
+import os
+from collections import defaultdict
+from contextlib import contextmanager
+import torch
+from fvcore.common.file_io import PathManager
+from fvcore.common.history_buffer import HistoryBuffer
+
+_CURRENT_STORAGE_STACK = []  # storages entered via `with EventStorage(...)`, innermost last
+
+
+def get_event_storage():
+    """Return the innermost EventStorage whose `with` block is currently active."""
+    message = "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
+    assert len(_CURRENT_STORAGE_STACK), message
+    return _CURRENT_STORAGE_STACK[-1]
+
+
+class EventWriter:
+    """
+    Base class for writers that obtain events from :class:`EventStorage` and process them.
+    """
+
+    def write(self):
+        raise NotImplementedError  # every concrete writer must override this
+
+    def close(self):
+        pass  # no-op by default
+
+
+class JSONWriter(EventWriter):
+    """
+    Write scalars to a json file.
+
+    Scalars are saved as one json object per line (instead of one big json) for easy parsing.
+
+    Examples parsing such a json file:
+
+    .. code-block:: none
+
+        $ cat metrics.json | jq -s '.[0:2]'
+        [
+          {
+            "data_time": 0.008433341979980469,
+            "iteration": 20,
+            "loss": 1.9228371381759644,
+            "loss_box_reg": 0.050025828182697296,
+            "loss_classifier": 0.5316952466964722,
+            "loss_mask": 0.7236229181289673,
+            "loss_rpn_box": 0.0856662318110466,
+            "loss_rpn_cls": 0.48198649287223816,
+            "lr": 0.007173333333333333,
+            "time": 0.25401854515075684
+          },
+          {
+            "data_time": 0.007216215133666992,
+            "iteration": 40,
+            "loss": 1.282649278640747,
+            "loss_box_reg": 0.06222952902317047,
+            "loss_classifier": 0.30682939291000366,
+            "loss_mask": 0.6970193982124329,
+            "loss_rpn_box": 0.038663312792778015,
+            "loss_rpn_cls": 0.1471673548221588,
+            "lr": 0.007706666666666667,
+            "time": 0.2490077018737793
+          }
+        ]
+
+        $ cat metrics.json | jq '.loss_mask'
+        0.7126231789588928
+        0.689423680305481
+        0.6776131987571716
+        ...
+    """
+
+    def __init__(self, json_file, window_size=20):
+        """
+        Args:
+            json_file (str): path to the json file. It is opened in append mode, so
+                new data is added after any existing content.
+            window_size (int): median-smoothing window for scalars whose `smoothing_hint` is True.
+        """
+        self._window_size = window_size
+        self._file_handle = PathManager.open(json_file, "a")
+
+    def write(self):
+        """Append the scalars of the current iteration as one json line and flush them."""
+        storage = get_event_storage()
+        record = dict(iteration=storage.iter)
+        record.update(storage.latest_with_smoothing_hint(self._window_size))
+        self._file_handle.write(json.dumps(record, sort_keys=True) + "\n")
+        self._file_handle.flush()
+        try:
+            os.fsync(self._file_handle.fileno())
+        except AttributeError:
+            pass  # e.g. the handle has no `fileno()`; flush() above is all that is done
+
+    def close(self):
+        self._file_handle.close()
+
+
+class TensorboardXWriter(EventWriter):
+    """Write all scalars to a tensorboard file."""
+
+    def __init__(self, log_dir: str, window_size: int = 20, **kwargs):
+        """
+        Args:
+            log_dir (str): The directory to save the output events
+            window_size (int): the scalars will be median-smoothed by this window size
+            kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)`
+        """
+        self._window_size = window_size
+        # imported here rather than at module level; `close` copes with a failing import
+        from torch.utils.tensorboard import SummaryWriter
+        self._writer = SummaryWriter(log_dir, **kwargs)
+
+    def write(self):
+        """Add every latest (possibly median-smoothed) scalar at the current iteration."""
+        storage = get_event_storage()
+        smoothed = storage.latest_with_smoothing_hint(self._window_size)
+        for name, value in smoothed.items():
+            self._writer.add_scalar(name, value, storage.iter)
+
+    def close(self):
+        if hasattr(self, "_writer"):  # doesn't exist when the code fails at import
+            self._writer.close()
+
+
+class CommonMetricPrinter(EventWriter):
+    """
+    Print **common** metrics to the terminal, including
+    iteration time, ETA, memory, all losses, and the learning rate.
+
+    To print something different, please implement a similar printer by yourself.
+    """
+
+    def __init__(self, max_iter):
+        """
+        Args:
+            max_iter (int): the maximum number of iterations to train.
+                Used to compute ETA.
+        """
+        self.logger = logging.getLogger(__name__)
+        self._max_iter = max_iter
+
+    def write(self):
+        storage = get_event_storage()
+        iteration = storage.iter
+        # fallbacks that stay in effect while the timing histories do not exist yet
+        data_time, time = None, None
+        eta_string = "N/A"
+        try:
+            data_time = storage.history("data_time").avg(20)
+            time = storage.history("time").global_avg()
+            eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration)
+            storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False)
+            eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
+        except KeyError:  # they may not exist in the first few iterations (due to warmup)
+            pass
+        # the learning rate is looked up separately, so a missing "lr" only affects "lr"
+        try:
+            lr = "{:.6f}".format(storage.history("lr").latest())
+        except KeyError:
+            lr = "N/A"
+        # peak allocated CUDA memory, converted from bytes to MB; None without CUDA
+        if torch.cuda.is_available():
+            max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
+        else:
+            max_mem_mb = None
+
+        # NOTE: max_mem is parsed by grep in "dev/parse_results.sh"
+        self.logger.info(
+            """\
+eta: {eta}  iter: {iter}  {losses}  \
+{time}  {data_time}  \
+lr: {lr}  {memory}\
+""".format(
+                eta=eta_string,
+                iter=iteration,
+                losses="  ".join(
+                    [
+                        "{}: {:.3f}".format(k, v.median(20))
+                        for k, v in storage.histories().items()
+                        if "loss" in k
+                    ]
+                ),
+                time="time: {:.4f}".format(time) if time is not None else "",
+                data_time="data_time: {:.4f}".format(data_time) if data_time is not None else "",
+                lr=lr,
+                memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "",
+            )
+        )
+
+
+class EventStorage:
+    """
+    The user-facing class that provides metric storage functionalities.
+
+    In the future we may add support for storing / logging other types of data if needed.
+    """
+
+    def __init__(self, start_iter=0):
+        """
+        Args:
+            start_iter (int): the iteration number to start with
+        """
+        self._history = defaultdict(HistoryBuffer)
+        self._smoothing_hints = {}
+        self._latest_scalars = {}
+        self._iter = start_iter
+        self._current_prefix = ""
+
+    def put_scalar(self, name, value, smoothing_hint=True):
+        """
+        Add a scalar `value` to the `HistoryBuffer` associated with `name`.
+
+        Args:
+            smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be
+                smoothed when logged. The hint will be accessible through
+                :meth:`EventStorage.smoothing_hints`.  A writer may ignore the hint
+                and apply custom smoothing rule.
+
+                It defaults to True because most scalars we save need to be smoothed to
+                provide any useful signal.
+        """
+        name = self._current_prefix + name
+        history = self._history[name]
+        value = float(value)
+        history.update(value, self._iter)
+        self._latest_scalars[name] = value
+
+        existing_hint = self._smoothing_hints.get(name)
+        if existing_hint is not None:
+            assert (
+                existing_hint == smoothing_hint
+            ), "Scalar {} was put with a different smoothing_hint!".format(name)
+        else:
+            self._smoothing_hints[name] = smoothing_hint
+
+    def put_scalars(self, *, smoothing_hint=True, **kwargs):
+        """
+        Put multiple scalars from keyword arguments.
+
+        Examples:
+
+            storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True)
+        """
+        for k, v in kwargs.items():
+            self.put_scalar(k, v, smoothing_hint=smoothing_hint)
+
+    def history(self, name):
+        """
+        Returns:
+            HistoryBuffer: the scalar history for name
+        """
+        ret = self._history.get(name, None)
+        if ret is None:
+            raise KeyError("No history metric available for {}!".format(name))
+        return ret
+
+    def histories(self):
+        """
+        Returns:
+            dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars
+        """
+        return self._history
+
+    def latest(self):
+        """
+        Returns:
+            dict[name -> number]: the scalars that's added in the current iteration.
+        """
+        return self._latest_scalars
+
+    def latest_with_smoothing_hint(self, window_size=20):
+        """
+        Similar to :meth:`latest`, but the returned values
+        are either the un-smoothed original latest value,
+        or a median of the given window_size,
+        depend on whether the smoothing_hint is True.
+
+        This provides a default behavior that other writers can use.
+        """
+        result = {}
+        for k, v in self._latest_scalars.items():
+            result[k] = self._history[k].median(window_size) if self._smoothing_hints[k] else v
+        return result
+
+    def smoothing_hints(self):
+        """
+        Returns:
+            dict[name -> bool]: the user-provided hint on whether the scalar
+                is noisy and needs smoothing.
+        """
+        return self._smoothing_hints
+
+    def step(self):
+        """
+        User should call this function at the beginning of each iteration, to
+        notify the storage of the start of a new iteration.
+        The storage will then be able to associate the new data with the
+        correct iteration number.
+        """
+        self._iter += 1
+        self._latest_scalars = {}
+
+    @property
+    def iter(self):
+        return self._iter
+
+    @property
+    def iteration(self):
+        # for backward compatibility
+        return self._iter
+
+    def __enter__(self):
+        _CURRENT_STORAGE_STACK.append(self)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert _CURRENT_STORAGE_STACK[-1] == self
+        _CURRENT_STORAGE_STACK.pop()
+
+    @contextmanager
+    def name_scope(self, name):
+        """Yield a context in which names of added events get the scope name + "/" as prefix."""
+        old_prefix = self._current_prefix
+        # the new prefix replaces (it is not appended to) the enclosing scope's prefix
+        self._current_prefix = name.rstrip("/") + "/"
+        try:
+            yield
+        finally:
+            # restore even if the body raised, so later scalars are not mis-prefixed
+            self._current_prefix = old_prefix
diff --git a/detectron2/utils/logger.py b/detectron2/utils/logger.py
new file mode 100644
index 0000000..a167d7d
--- /dev/null
+++ b/detectron2/utils/logger.py
@@ -0,0 +1,196 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import functools
+import logging
+import os
+import sys
+from collections import Counter
+from fvcore.common.file_io import PathManager
+from tabulate import tabulate
+from termcolor import colored
+
+
+class _ColorfulFormatter(logging.Formatter):
+    """
+    Formatter that abbreviates the root logger name and prepends a colored
+    severity tag to warnings and errors.
+
+    On top of the regular ``logging.Formatter`` arguments, two keyword
+    arguments are consumed:
+
+    * ``root_name`` (required): ``root_name + "."`` is the text that gets
+      replaced inside ``record.name``.
+    * ``abbrev_name`` (optional, defaults to ""): the replacement text. When
+      it is non-empty a "." is appended to it.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self._root_name = kwargs.pop("root_name") + "."
+        abbrev = kwargs.pop("abbrev_name", "")
+        self._abbrev_name = abbrev + "." if len(abbrev) else abbrev
+        super().__init__(*args, **kwargs)
+
+    def formatMessage(self, record):
+        # NOTE: the record is modified in place, its name is rewritten.
+        record.name = record.name.replace(self._root_name, self._abbrev_name)
+        log = super().formatMessage(record)
+        level = record.levelno
+        if level == logging.WARNING:
+            return colored("WARNING", "red", attrs=["blink"]) + " " + log
+        if level in (logging.ERROR, logging.CRITICAL):
+            return colored("ERROR", "red", attrs=["blink", "underline"]) + " " + log
+        # all other levels are emitted without a tag
+        return log
+
+
+@functools.lru_cache()  # so that calling setup_logger multiple times won't add many handlers
+def setup_logger(
+    output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None
+):
+    """
+    Configure the logger called `name`: DEBUG level, no propagation, a stdout
+    handler on rank 0 and, optionally, a file handler on every rank.
+
+    The function is memoized, so a repeated call with the same arguments
+    returns the same logger without attaching handlers again (all arguments
+    therefore have to be hashable).
+
+    Args:
+        output (str): a file name or a directory to save log. If None, will not save log file.
+            If ends with ".txt" or ".log", assumed to be a file name.
+            Otherwise, logs will be saved to `output/log.txt`.
+        distributed_rank (int): rank of the current worker. Only rank 0 logs to
+            stdout; for ranks > 0 the log file name gets a ".rank{N}" suffix.
+        color (bool): whether the stdout handler uses the colorful formatter.
+        name (str): the root module name of this logger
+        abbrev_name (str): an abbreviation of the module, to avoid long names in logs.
+            Set to "" to not log the root module in logs.
+            By default, will abbreviate "detectron2" to "d2" and leave other
+            modules unchanged.
+
+    Returns:
+        logging.Logger: the configured logger.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = False
+
+    if abbrev_name is None:
+        abbrev_name = "d2" if name == "detectron2" else name
+
+    plain_formatter = logging.Formatter(
+        "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
+    )
+    # stdout logging: master only
+    if distributed_rank == 0:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(logging.DEBUG)
+        if color:
+            formatter = _ColorfulFormatter(
+                colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s",
+                datefmt="%m/%d %H:%M:%S",
+                root_name=name,
+                abbrev_name=str(abbrev_name),
+            )
+        else:
+            formatter = plain_formatter
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+
+    # file logging: all workers
+    if output is not None:
+        if output.endswith((".txt", ".log")):
+            filename = output
+        else:
+            filename = os.path.join(output, "log.txt")
+        if distributed_rank > 0:
+            filename = filename + ".rank{}".format(distributed_rank)
+        log_dir = os.path.dirname(filename)
+        if log_dir:
+            # A bare file name such as "log.txt" has an empty directory part:
+            # the file lives in the current directory and there is nothing to create.
+            PathManager.mkdirs(log_dir)
+
+        fh = logging.StreamHandler(_cached_log_stream(filename))
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(plain_formatter)
+        logger.addHandler(fh)
+
+    return logger
+
+
+# Memoize the opened file object per file name, so that different calls to
+# `setup_logger` with the same file name can safely write to the same file.
+@functools.lru_cache(maxsize=None)
+def _cached_log_stream(filename):
+    """
+    Open `filename` in append mode through `PathManager`.
+
+    Args:
+        filename (str): path of the log file.
+
+    Returns:
+        whatever `PathManager.open(filename, "a")` returns. Because of the
+        cache, repeated calls with the same `filename` get the same object.
+    """
+    stream = PathManager.open(filename, "a")
+    return stream
+
+
+"""
+Below are some other convenient logging methods.
+They are mainly adapted from
+https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py
+"""
+
+
+def _find_caller():
+    """
+    Walk up the call stack to the first frame located outside this logger module.
+
+    The walk starts two frames above this function. A frame is skipped when
+    the path of its source file contains "utils" + path separator + "logger.".
+
+    Returns:
+        str: module name of the caller ("__main__" is reported as "detectron2")
+        tuple: a hashable key (file name, line number, function name)
+            to be used to identify different callers
+    """
+    own_file_marker = os.path.join("utils", "logger.")
+    frame = sys._getframe(2)
+    while frame:
+        code = frame.f_code
+        if own_file_marker in code.co_filename:
+            # still inside the logging helpers, keep climbing
+            frame = frame.f_back
+            continue
+        mod_name = frame.f_globals["__name__"]
+        caller_key = (code.co_filename, frame.f_lineno, code.co_name)
+        return ("detectron2" if mod_name == "__main__" else mod_name), caller_key
+
+
+# Module-wide tally of how many times each log key has been seen;
+# shared by `log_first_n` and `log_every_n` below.
+_LOG_COUNTER = Counter()
+
+
+def log_first_n(lvl, msg, n=1, *, name=None, key="caller"):
+    """
+    Emit `msg` only the first `n` times a given key is seen.
+
+    Args:
+        lvl (int): the logging level
+        msg (str): the message to log
+        n (int): how many times the message may be logged per key
+        name (str): name of the logger to use. Will use the caller's module by default.
+        key (str or tuple[str]): the string(s) can be one of "caller" or
+            "message", which defines how to identify duplicated logs.
+            For example, if called with `n=1, key="caller"`, this function
+            will only log the first call from the same caller, regardless of
+            the message content.
+            If called with `n=1, key="message"`, this function will log the
+            same content only once, even if they are called from different places.
+            If called with `n=1, key=("caller", "message")`, this function
+            will not log only if the same caller has logged the same message before.
+    """
+    key_parts = (key,) if isinstance(key, str) else key
+    assert len(key_parts) > 0
+
+    # Must be called directly from this function: _find_caller counts stack frames.
+    caller_module, caller_key = _find_caller()
+
+    hash_key = ()
+    if "caller" in key_parts:
+        hash_key += caller_key
+    if "message" in key_parts:
+        hash_key += (msg,)
+
+    seen = _LOG_COUNTER[hash_key] + 1
+    _LOG_COUNTER[hash_key] = seen
+    if seen > n:
+        return
+    logging.getLogger(name or caller_module).log(lvl, msg)
+
+
+def log_every_n(lvl, msg, n=1, *, name=None):
+    """
+    Log once per n times: the 1st, (n+1)-th, (2n+1)-th, ... call from the
+    same caller is logged (every call when `n` is 1).
+
+    Args:
+        lvl (int): the logging level
+        msg (str): the message to log
+        n (int): the period, counted in calls from the same caller
+        name (str): name of the logger to use. Will use the caller's module by default.
+    """
+    # Must be called directly from this function: _find_caller counts stack frames.
+    caller_module, key = _find_caller()
+    _LOG_COUNTER[key] += 1
+    due = n == 1 or _LOG_COUNTER[key] % n == 1
+    if due:
+        target = logging.getLogger(name or caller_module)
+        target.log(lvl, msg)
+
+
+def create_small_table(small_dict):
+    """
+    Render a dictionary as a one-row table whose headers are the keys of
+    `small_dict` and whose single row holds the values. This is only
+    suitable for small dictionaries.
+
+    Args:
+        small_dict (dict): a result dictionary of only a few items.
+
+    Returns:
+        str: the table as a string.
+    """
+    headers, row = tuple(zip(*small_dict.items()))
+    return tabulate(
+        [row],
+        headers=headers,
+        tablefmt="pipe",
+        floatfmt=".3f",
+        stralign="center",
+        numalign="center",
+    )
diff --git a/detectron2/utils/registry.py b/detectron2/utils/registry.py
new file mode 100644
index 0000000..c9aef92
--- /dev/null
+++ b/detectron2/utils/registry.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+class Registry(object):
+    """
+    A name -> object mapping, meant to let third-party users plug in their own modules.
+
+    To create a registry (inside detectron2):
+
+    .. code-block:: python
+
+        BACKBONE_REGISTRY = Registry('BACKBONE')
+
+    To register an object:
+
+    .. code-block:: python
+
+        @BACKBONE_REGISTRY.register()
+        class MyBackbone():
+            ...
+
+    Or:
+
+    .. code-block:: python
+
+        BACKBONE_REGISTRY.register(MyBackbone)
+    """
+
+    def __init__(self, name):
+        """
+        Args:
+            name (str): the name of this registry
+        """
+        self._name = name
+        # registered name -> registered object
+        self._obj_map = {}
+
+    def _do_register(self, name, obj):
+        """Store `obj` under `name`; asserts that `name` is not taken yet."""
+        already_registered = name in self._obj_map
+        assert (
+            not already_registered
+        ), "An object named '{}' was already registered in '{}' registry!".format(name, self._name)
+        self._obj_map[name] = obj
+
+    def register(self, obj=None):
+        """
+        Register the given object under the name `obj.__name__`.
+        Can be used as either a decorator or not. See docstring of this class for usage.
+
+        Returns:
+            a decorator when called without an object; None when called
+            with the object to register.
+        """
+        if obj is not None:
+            # used as a function call
+            self._do_register(obj.__name__, obj)
+            return None
+
+        # used as a decorator
+        def deco(func_or_class):
+            self._do_register(func_or_class.__name__, func_or_class)
+            return func_or_class
+
+        return deco
+
+    def get(self, name):
+        """
+        Look up a registered object.
+
+        Raises:
+            KeyError: if nothing is found under `name`.
+        """
+        found = self._obj_map.get(name)
+        if found is not None:
+            return found
+        raise KeyError("No object named '{}' found in '{}' registry!".format(name, self._name))
diff --git a/detectron2/utils/serialize.py b/detectron2/utils/serialize.py
new file mode 100644
index 0000000..734a62c
--- /dev/null
+++ b/detectron2/utils/serialize.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import cloudpickle
+
+
+class PicklableWrapper(object):
+    """
+    Wrap an object to make it more picklable, note that it uses
+    heavy weight serialization libraries that are slower than pickle.
+    It's best to use it only on closures (which are usually not picklable).
+
+    Calls and attribute accesses are forwarded to the wrapped object.
+
+    This is a simplified version of
+    https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py
+    """
+
+    def __init__(self, obj):
+        """
+        Args:
+            obj: the object to wrap.
+        """
+        self._obj = obj
+
+    def __reduce__(self):
+        # The wrapped object is serialized with cloudpickle. On load,
+        # `cloudpickle.loads` is called on those bytes, i.e. unpickling
+        # yields the wrapped object itself rather than a PicklableWrapper.
+        s = cloudpickle.dumps(self._obj)
+        return cloudpickle.loads, (s,)
+
+    def __call__(self, *args, **kwargs):
+        return self._obj(*args, **kwargs)
+
+    def __getattr__(self, attr):
+        # Ensure that the wrapped object can be used seamlessly as the previous object.
+        # __getattr__ is only invoked after the normal lookup failed, so getting
+        # here with "_obj" means the wrapper has no wrapped object (e.g. it was
+        # created without __init__). Looking "_obj" up again would recurse forever.
+        if attr == "_obj":
+            raise AttributeError(
+                "'{}' object has no attribute '{}'".format(type(self).__name__, attr)
+            )
+        return getattr(self._obj, attr)
diff --git a/detectron2/utils/video_visualizer.py b/detectron2/utils/video_visualizer.py
new file mode 100644
index 0000000..0144b67
--- /dev/null
+++ b/detectron2/utils/video_visualizer.py
@@ -0,0 +1,235 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import pycocotools.mask as mask_util
+
+from detectron2.utils.visualizer import (
+    ColorMode,
+    Visualizer,
+    _create_text_labels,
+    _PanopticPrediction,
+)
+
+from .colormap import random_color
+
+
+class _DetectedInstance:
+    """
+    Record of one object detected in a video frame, kept so that its color
+    can be transferred to objects in the future frames.
+
+    Attributes:
+        label (int):
+        bbox (tuple[float]):
+        mask_rle (dict):
+        color (tuple[float]): RGB colors in range (0, 1)
+        ttl (int): time-to-live for the instance. For example, if ttl=2,
+            the instance color can be transferred to objects in the next two frames.
+    """
+
+    __slots__ = ["label", "bbox", "mask_rle", "color", "ttl"]
+
+    def __init__(self, label, bbox, mask_rle, color, ttl):
+        self.label, self.bbox, self.mask_rle = label, bbox, mask_rle
+        self.color, self.ttl = color, ttl
+
+
+class VideoVisualizer:
+    """
+    Draw predictions on the frames of a video, one frame per call.
+
+    Instance colors are kept stable across frames with a naive tracking
+    heuristic (see :meth:`_assign_colors`); the instances of recent frames
+    are remembered in ``self._old_instances``.
+    """
+
+    def __init__(self, metadata, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            metadata (MetadataCatalog): image metadata.
+            instance_mode (ColorMode): only `ColorMode.IMAGE` and
+                `ColorMode.IMAGE_BW` are accepted.
+        """
+        self.metadata = metadata
+        # instances remembered from previous frames, used to carry colors over
+        self._old_instances = []
+        assert instance_mode in [
+            ColorMode.IMAGE,
+            ColorMode.IMAGE_BW,
+        ], "Other mode not supported yet."
+        self._instance_mode = instance_mode
+
+    def draw_instance_predictions(self, frame, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            frame (ndarray): an RGB image of shape (H, W, C), in the range [0, 255].
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        frame_visualizer = Visualizer(frame, self.metadata)
+        num_instances = len(predictions)
+        if num_instances == 0:
+            return frame_visualizer.output
+
+        boxes = predictions.pred_boxes.tensor.numpy() if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.numpy() if predictions.has("pred_classes") else None
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+
+        # mask IOU is not yet enabled, so masks are only drawn, not used for tracking
+        masks = predictions.pred_masks if predictions.has("pred_masks") else None
+
+        # NOTE: tracking indexes `classes` and `boxes`, i.e. both fields have to be present.
+        detected = [
+            _DetectedInstance(classes[i], boxes[i], mask_rle=None, color=None, ttl=8)
+            for i in range(num_instances)
+        ]
+        colors = self._assign_colors(detected)
+
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            # any() returns uint8 tensor
+            frame_visualizer.output.img = frame_visualizer._create_grayscale_image(
+                (masks.any(dim=0) > 0).numpy() if masks is not None else None
+            )
+            alpha = 0.3
+        else:
+            alpha = 0.5
+
+        frame_visualizer.overlay_instances(
+            boxes=None if masks is not None else boxes,  # boxes are a bit distracting
+            masks=masks,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+
+        return frame_visualizer.output
+
+    def draw_sem_seg(self, frame, sem_seg, area_threshold=None):
+        """
+        Draw a semantic segmentation on a frame.
+
+        Args:
+            frame (ndarray): the image to draw on.
+            sem_seg (ndarray or Tensor): semantic segmentation of shape (H, W),
+                each value is the integer label.
+            area_threshold (Optional[int]): only draw segmentations larger than the threshold
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        # don't need to do anything special
+        frame_visualizer = Visualizer(frame, self.metadata)
+        # forward the caller's threshold (it used to be dropped and replaced by None)
+        frame_visualizer.draw_sem_seg(sem_seg, area_threshold=area_threshold)
+        return frame_visualizer.output
+
+    def draw_panoptic_seg_predictions(
+        self, frame, panoptic_seg, segments_info, area_threshold=None, alpha=0.5
+    ):
+        """
+        Draw a panoptic segmentation on a frame: "stuff" segments first, then
+        the instances, whose colors are tracked across frames by mask IoU.
+
+        Args:
+            frame (ndarray): the image to draw on.
+            panoptic_seg: the segment id of every pixel, passed to `_PanopticPrediction`.
+            segments_info (list[dict]): one description per segment, passed to
+                `_PanopticPrediction`; "category_id" is read here.
+            area_threshold (Optional[int]): forwarded to `draw_binary_mask` for "stuff" segments.
+            alpha (float): opacity used for all drawn masks.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        frame_visualizer = Visualizer(frame, self.metadata)
+        pred = _PanopticPrediction(panoptic_seg, segments_info)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            frame_visualizer.output.img = frame_visualizer._create_grayscale_image(
+                pred.non_empty_mask()
+            )
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                # the metadata defines no stuff colors
+                mask_color = None
+
+            frame_visualizer.draw_binary_mask(
+                mask,
+                color=mask_color,
+                text=self.metadata.stuff_classes[category_idx],
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return frame_visualizer.output
+        # draw mask for all instances second
+        masks, sinfo = list(zip(*all_instances))
+        num_instances = len(masks)
+        # stack the masks into an (H, W, N) Fortran-ordered uint8 array for RLE encoding
+        masks_rles = mask_util.encode(
+            np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F")
+        )
+        assert len(masks_rles) == num_instances
+
+        category_ids = [x["category_id"] for x in sinfo]
+        detected = [
+            _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i], color=None, ttl=8)
+            for i in range(num_instances)
+        ]
+        colors = self._assign_colors(detected)
+        labels = [self.metadata.thing_classes[k] for k in category_ids]
+
+        frame_visualizer.overlay_instances(
+            boxes=None,
+            masks=masks,
+            labels=labels,
+            keypoints=None,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return frame_visualizer.output
+
+    def _assign_colors(self, instances):
+        """
+        Naive tracking heuristics to assign same color to the same instance,
+        will update the internal state of tracked instances.
+
+        Args:
+            instances (list[_DetectedInstance]): the instances of the current
+                frame; must be non-empty. Their `color` is filled in place.
+
+        Returns:
+            list[tuple[float]]: list of colors.
+        """
+
+        # Compute iou with either boxes or masks:
+        # (the builtin `bool` is used: the `np.bool` alias no longer exists in recent NumPy)
+        is_crowd = np.zeros((len(instances),), dtype=bool)
+        if instances[0].bbox is None:
+            assert instances[0].mask_rle is not None
+            # use mask iou only when box iou is None
+            # because box seems good enough
+            rles_old = [x.mask_rle for x in self._old_instances]
+            rles_new = [x.mask_rle for x in instances]
+            ious = mask_util.iou(rles_old, rles_new, is_crowd)
+            threshold = 0.5
+        else:
+            boxes_old = [x.bbox for x in self._old_instances]
+            boxes_new = [x.bbox for x in instances]
+            ious = mask_util.iou(boxes_old, boxes_new, is_crowd)
+            threshold = 0.6
+        if len(ious) == 0:
+            # nothing to compare against: fall back to an all-zero (old x new) matrix
+            ious = np.zeros((len(self._old_instances), len(instances)), dtype="float32")
+
+        # Only allow matching instances of the same label:
+        for old_idx, old in enumerate(self._old_instances):
+            for new_idx, new in enumerate(instances):
+                if old.label != new.label:
+                    ious[old_idx, new_idx] = 0
+
+        matched_new_per_old = np.asarray(ious).argmax(axis=1)
+        max_iou_per_old = np.asarray(ious).max(axis=1)
+
+        # Try to find match for each old instance:
+        extra_instances = []
+        for idx, inst in enumerate(self._old_instances):
+            if max_iou_per_old[idx] > threshold:
+                newidx = matched_new_per_old[idx]
+                if instances[newidx].color is None:
+                    instances[newidx].color = inst.color
+                    continue
+            # If an old instance does not match any new instances,
+            # keep it for the next frame in case it is just missed by the detector
+            inst.ttl -= 1
+            if inst.ttl > 0:
+                extra_instances.append(inst)
+
+        # Assign random color to newly-detected instances:
+        for inst in instances:
+            if inst.color is None:
+                inst.color = random_color(rgb=True, maximum=1)
+        self._old_instances = instances[:] + extra_instances
+        return [d.color for d in instances]
diff --git a/detectron2/utils/visualizer.py b/detectron2/utils/visualizer.py
new file mode 100644
index 0000000..34274a5
--- /dev/null
+++ b/detectron2/utils/visualizer.py
@@ -0,0 +1,1125 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+
+from .colormap import random_color
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+
+
+# Module-level constants for the drawing code.
+# NOTE(review): the two area thresholds are presumably pixel counts and the
+# color tuples presumably RGB in [0, 1] — confirm at the use sites.
+_SMALL_OBJECT_AREA_THRESH = 1000
+_LARGE_MASK_AREA_THRESH = 120000
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
+_BLACK = (0, 0, 0)
+_RED = (1.0, 0, 0)
+
+# NOTE(review): presumably the minimum keypoint score for a keypoint to be drawn — confirm.
+_KEYPOINT_THRESHOLD = 0.05
+
+
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
+    `@unique` guarantees that no two members share a value.
+
+    Attributes:
+        IMAGE: Picks a random color for every instance and overlay segmentations with low opacity.
+        SEGMENTATION: Let instances of the same category have similar colors, and overlay them with
+            high opacity. This provides more attention on the quality of segmentation.
+        IMAGE_BW: same as IMAGE, but convert all areas without masks to gray-scale.
+            Only available for drawing per-instance mask predictions.
+    """
+
+    IMAGE = 0
+    SEGMENTATION = 1
+    IMAGE_BW = 2
+
+
+class GenericMask:
+    """
+    One mask that can be given as RLE, polygons or a binary array and is
+    converted lazily between the polygon and the binary representation.
+
+    Attribute:
+        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
+            Each ndarray has format [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+
+    def __init__(self, mask_or_polygons, height, width):
+        """
+        Args:
+            mask_or_polygons: an RLE dict (with "counts" and "size"), a list of
+                polygons, or an ndarray of shape (height, width).
+            height, width (int): size of the image the mask belongs to.
+
+        Raises:
+            ValueError: if `mask_or_polygons` is of none of the supported types.
+        """
+        self._mask = None
+        self._polygons = None
+        self._has_holes = None
+        self.height = height
+        self.width = width
+
+        m = mask_or_polygons
+        if isinstance(m, dict):
+            # RLEs
+            assert "counts" in m and "size" in m
+            if isinstance(m["counts"], list):  # uncompressed RLEs
+                h, w = m["size"]
+                assert h == height and w == width
+                m = mask_util.frPyObjects(m, h, w)
+            self._mask = mask_util.decode(m)[:, :]
+        elif isinstance(m, list):  # list[ndarray]
+            self._polygons = [np.asarray(poly).reshape(-1) for poly in m]
+        elif isinstance(m, np.ndarray):  # assumed to be a binary mask
+            assert m.shape[1] != 2, m.shape
+            assert m.shape == (height, width), m.shape
+            self._mask = m.astype("uint8")
+        else:
+            raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
+
+    @property
+    def mask(self):
+        """The binary mask; rasterized from the polygons on first access if needed."""
+        if self._mask is not None:
+            return self._mask
+        self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+
+    @property
+    def polygons(self):
+        """The polygons; traced from the binary mask on first access if needed."""
+        if self._polygons is not None:
+            return self._polygons
+        self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+
+    @property
+    def has_holes(self):
+        """Whether the mask has holes; always False when the mask was given as polygons."""
+        if self._has_holes is None:
+            if self._mask is None:
+                # if original format is polygon, does not have holes
+                self._has_holes = False
+            else:
+                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._has_holes
+
+    def mask_to_polygons(self, mask):
+        """
+        Trace the contours of a binary mask.
+
+        Returns:
+            list[ndarray]: flattened contours with at least 6 values (3 points) each
+            bool: whether any contour has a parent, i.e. is a hole
+        """
+        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
+        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
+        # Internal contours (holes) are placed in hierarchy-2.
+        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        found = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        # the last two items are (contours, hierarchy), whatever the length of the result
+        contours, hierarchy = found[-2], found[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        flat = [c.flatten() for c in contours]
+        polygons = [p for p in flat if len(p) >= 6]
+        return polygons, has_holes
+
+    def polygons_to_mask(self, polygons):
+        """Rasterize polygons into a single binary mask of size (height, width)."""
+        rles = mask_util.frPyObjects(polygons, self.height, self.width)
+        merged = mask_util.merge(rles)
+        return mask_util.decode(merged)[:, :]
+
+    def area(self):
+        """Sum of the binary mask, i.e. the number of set pixels."""
+        return self.mask.sum()
+
+    def bbox(self):
+        """Bounding box of the polygons, converted from (x, y, w, h) to (x0, y0, x1, y1)."""
+        rles = mask_util.frPyObjects(self.polygons, self.height, self.width)
+        merged = mask_util.merge(rles)
+        box = mask_util.toBbox(merged)
+        box[2] += box[0]
+        box[3] += box[1]
+        return box
+
+
+class _PanopticPrediction:
+    """
+    Helper that splits a panoptic segmentation into per-segment binary masks.
+
+    Segment ids are processed in order of decreasing area.
+    """
+
+    def __init__(self, panoptic_seg, segments_info):
+        """
+        Args:
+            panoptic_seg (Tensor): the segment id of every pixel.
+            segments_info (list[dict]): one dict per segment; "id" and "isthing"
+                are read. NOTE: an "area" entry (pixel count, as float) is
+                written into these dicts in place.
+        """
+        self._seg = panoptic_seg
+
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)  # largest segment first
+        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
+        self._seg_ids = self._seg_ids.tolist()
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) boolean array, a mask for all pixels that have a prediction
+        """
+        empty_ids = [sid for sid in self._seg_ids if sid not in self._sinfo]
+        if len(empty_ids) == 0:
+            # every id is described in segments_info, so every pixel has a prediction
+            return np.ones(self._seg.shape, dtype=bool)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        return (self._seg != empty_ids[0]).numpy().astype(bool)
+
+    def semantic_masks(self):
+        """
+        Yields:
+            (mask, segment info) for every segment that is not a "thing";
+            the mask is an (H, W) boolean array.
+        """
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(bool), sinfo
+
+    def instance_masks(self):
+        """
+        Yields:
+            (mask, segment info) for every "thing" segment with a non-empty
+            mask; the mask is an (H, W) boolean array.
+        """
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+
+
+def _create_text_labels(classes, scores, class_names):
+    """
+    Build one text label per instance from its class name and/or score.
+
+    Class names are only used when more than one class name is known.
+    Scores are rendered as whole percentages, e.g. "person 87%".
+
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+
+    Returns:
+        list[str] or None
+    """
+    use_names = classes is not None and class_names is not None and len(class_names) > 1
+    labels = [class_names[i] for i in classes] if use_names else None
+    if scores is None:
+        return labels
+    if labels is None:
+        return ["{:.0f}%".format(s * 100) for s in scores]
+    return ["{} {:.0f}%".format(text, s * 100) for text, s in zip(labels, scores)]
+
+
+class VisImage:
+    """
+    An image together with a matplotlib figure/axes of matching size, onto
+    which visualizations are drawn and from which the result is rendered.
+    """
+
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3).
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.height, self.width = img.shape[0], img.shape[1]
+        self._setup_figure(img)
+
+    def _setup_figure(self, img):
+        """
+        Create the figure, canvas and axes used for drawing and store them on
+        the instance as `self.fig`, `self.canvas`, `self.ax` (plus `self.dpi`).
+        Nothing is returned.
+
+        Args:
+            Same as in :meth:`__init__()`.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        width_in = (self.width * self.scale + 1e-2) / self.dpi
+        height_in = (self.height * self.scale + 1e-2) / self.dpi
+        fig.set_size_inches(width_in, height_in)
+        # (a cairo canvas, mpl.backends.backend_cairo.FigureCanvasCairo, is a possible alternative)
+        self.canvas = FigureCanvasAgg(fig)
+
+        # axes that fill the whole figure, in image coordinates
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        ax.set_xlim(0.0, self.width)
+        ax.set_ylim(self.height)
+
+        self.fig = fig
+        self.ax = ax
+
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        is_raster = filepath.lower().endswith((".jpg", ".png"))
+        if not is_raster:
+            # support general formats (e.g. pdf)
+            self.ax.imshow(self.img, interpolation="nearest")
+            self.fig.savefig(filepath)
+            return
+        # faster than matplotlib's imshow; channels are reversed (RGB -> BGR) for cv2
+        cv2.imwrite(filepath, self.get_image()[:, :, ::-1])
+
+    def get_image(self):
+        """
+        Returns:
+            ndarray: the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+              The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        s, (width, height) = self.canvas.print_to_buffer()
+        # resize the input image if the rendered canvas has a different size
+        same_size = (self.width, self.height) == (width, height)
+        img = self.img if same_size else cv2.resize(self.img, (width, height))
+
+        # imshow is slow. blend manually (still quite slow)
+        img_rgba = np.frombuffer(s, dtype="uint8").reshape(height, width, 4)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
+
+        # NOTE: the numexpr expression refers to the locals `img`, `rgb` and
+        # `alpha` by name, so these must not be renamed.
+        try:
+            import numexpr as ne  # fuse them with numexpr
+
+            visualized_image = ne.evaluate("img * (1 - alpha / 255.0) + rgb * (alpha / 255.0)")
+        except ImportError:
+            alpha = alpha.astype("float32") / 255.0
+            visualized_image = img * (1 - alpha) + rgb * alpha
+
+        return visualized_image.astype("uint8")
+
+
+class Visualizer:
+    def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255]; values outside are clipped.
+            metadata (MetadataCatalog): image metadata.
+            scale (float): passed on to the `VisImage` that is drawn onto.
+            instance_mode (ColorMode): the color mode used when drawing instances.
+        """
+        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+        self._instance_mode = instance_mode
+
+        # too small texts are useless, therefore never go below 10 // scale
+        size_from_image = np.sqrt(self.output.height * self.output.width) // 90
+        self._default_font_size = max(size_from_image, 10 // scale)
+
+    def draw_instance_predictions(self, predictions):
+        """
+        Draw instance-level prediction results on an image.
+
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+
+        def field(name):
+            # value of an optional prediction field, None when it is absent
+            return getattr(predictions, name) if predictions.has(name) else None
+
+        boxes = field("pred_boxes")
+        scores = field("scores")
+        classes = field("pred_classes")
+        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
+        keypoints = field("pred_keypoints")
+
+        masks = None
+        if predictions.has("pred_masks"):
+            height, width = self.output.height, self.output.width
+            masks = [GenericMask(x, height, width) for x in np.asarray(predictions.pred_masks)]
+
+        colors, alpha = None, 0.5
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
+            # per-category colors (scaled to [0, 1] and jittered), drawn with high opacity
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            ]
+            alpha = 0.8
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            assert predictions.has("pred_masks"), "ColorMode.IMAGE_BW requires segmentations"
+            # union of all instance masks
+            union = (predictions.pred_masks.any(dim=0) > 0).numpy()
+            self.output.img = self._create_grayscale_image(union)
+            alpha = 0.3
+
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+        )
+        return self.output
+
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
+        """
+        Draw semantic segmentation predictions/labels.
+
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+        return self.output
+
+    def draw_panoptic_seg_predictions(
+        self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7
+    ):
+        """
+        Draw panoptic prediction results on an image.
+
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+                Each dict contains keys "id", "category_id", "isthing".
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info)
+
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.img = self._create_grayscale_image(pred.non_empty_mask())
+
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+
+            text = self.metadata.stuff_classes[category_idx]
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+            )
+
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        labels = _create_text_labels(category_ids, scores, self.metadata.thing_classes)
+
+        try:
+            colors = [random_color(rgb=True, maximum=1) for k in category_ids]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
+
+        return self.output
+
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentaions in Detectron2 Dataset format.
+
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+
+            boxes = [BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) for x in annos]
+
+            labels = [x["category_id"] for x in annos]
+            names = self.metadata.get("thing_classes", None)
+            if names:
+                labels = [names[i] for i in labels]
+            labels = [
+                "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "")
+                for i, a in zip(labels, annos)
+            ]
+            self.overlay_instances(labels=labels, boxes=boxes, masks=masks, keypoints=keypts)
+
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            sem_seg = cv2.imread(dic["sem_seg_file_name"], cv2.IMREAD_GRAYSCALE)
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
+        return self.output
+
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5
+    ):
+        """
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+
+                * `structures.masks.PolygonMasks`, `structures.masks.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                    The first level of the list corresponds to individual instances. The second
+                    level to all the polygon that compose the instance, and the third level
+                    to the polygon coordinates. The third level should have the format of
+                    [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = None
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    x0, y0, x1, y1 = masks[i].bbox()
+
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+
+        return self.output
+
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+
+        num_instances = len(boxes)
+
+        if assigned_colors is None:
+            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+
+        # Display in largest to smallest order to reduce occlusion.
+        if boxes is not None:
+            areas = boxes[:, 2] * boxes[:, 3]
+
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+
+        return self.output
+
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draws keypoints of an instance and follows the rules for keypoint connections
+        to draw lines between appropriate keypoints. This follows color heuristics for
+        line color.
+
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        visible = {}
+        keypoint_names = self.metadata.get("keypoint_names")
+        for idx, keypoint in enumerate(keypoints):
+            # draw keypoint
+            x, y, prob = keypoint
+            if prob > _KEYPOINT_THRESHOLD:
+                self.draw_circle((x, y), color=_RED)
+                if keypoint_names:
+                    keypoint_name = keypoint_names[idx]
+                    visible[keypoint_name] = (x, y)
+
+        if self.metadata.get("keypoint_connection_rules"):
+            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
+                if kp0 in visible and kp1 in visible:
+                    x0, y0 = visible[kp0]
+                    x1, y1 = visible[kp1]
+                    color = tuple(x / 255.0 for x in color)
+                    self.draw_line([x0, x1], [y0, y1], color=color)
+
+        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
+        # Note that this strategy is specific to person keypoints.
+        # For other keypoints, it should just do nothing
+        try:
+            ls_x, ls_y = visible["left_shoulder"]
+            rs_x, rs_y = visible["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+        except KeyError:
+            pass
+        else:
+            # draw line from nose to mid-shoulder
+            nose_x, nose_y = visible.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+
+            try:
+                # draw line from mid-shoulder to mid-hip
+                lh_x, lh_y = visible["left_hip"]
+                rh_x, rh_y = visible["right_hip"]
+            except KeyError:
+                pass
+            else:
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+
+    """
+    Primitive drawing functions:
+    """
+
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+
+    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+
+        linewidth = max(self._default_font_size / 4, 1)
+
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+        area = w * h
+        # use thinner lines when the box is small
+        linewidth = self._default_font_size / (
+            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
+        )
+
+        theta = angle * math.pi / 180.0
+        c = math.cos(theta)
+        s = math.sin(theta)
+        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        # x: left->right ; y: top->down
+        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
+        for k in range(4):
+            j = (k + 1) % 4
+            self.draw_line(
+                [rotated_rect[k][0], rotated_rect[j][0]],
+                [rotated_rect[k][1], rotated_rect[j][1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+
+        if label is not None:
+            text_pos = rotated_rect[1]  # topleft corner
+
+            height_ratio = h / np.sqrt(self.output.height * self.output.width)
+            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+            font_size = (
+                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+            )
+            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+
+        return self.output
+
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x, y = circle_coord
+        self.output.ax.add_patch(
+            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        )
+        return self.output
+
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a default value will be computed and used.
+
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        if linewidth is None:
+            linewidth = self._default_font_size / 3
+        linewidth = max(linewidth, 1)
+        self.output.ax.add_line(
+            mpl.lines.Line2D(
+                x_data,
+                y_data,
+                linewidth=linewidth * self.output.scale,
+                color=color,
+                linestyle=linestyle,
+            )
+        )
+        return self.output
+
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=4096
+    ):
+        """
+        Draw one binary mask, either as polygons or (for masks with holes) as an
+        RGBA overlay, and optionally label it with `text`.
+
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. Each value in the array is either a 0 or 1 value of uint8
+                type.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if not None, it is drawn on the largest connected component of
+                the mask (and on other very large components), at the median position
+                of the component's pixels.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a polygon smaller than this will not be shown.
+                None is treated as 4096. Not applied to masks with holes.
+
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        if area_threshold is None:
+            area_threshold = 4096
+
+        # Becomes True once something was actually drawn; text is only added then.
+        has_valid_segment = False
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+
+        if not mask.has_holes:
+            # draw polygons for regular masks
+            for segment in mask.polygons:
+                # area of this single polygon, rasterized at the mask's resolution
+                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                if area < area_threshold:
+                    continue
+                has_valid_segment = True
+                segment = segment.reshape(-1, 2)
+                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+        else:
+            # Masks with holes are painted as an RGBA image instead: uniform `color`,
+            # with opacity `alpha` where the mask equals 1 and 0 elsewhere.
+            rgba = np.zeros(shape2d + (4,), dtype="float32")
+            rgba[:, :, :3] = color
+            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+            has_valid_segment = True
+            self.output.ax.imshow(rgba)
+
+        if text is not None and has_valid_segment:
+            # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
+            # Component 0 is skipped (background in OpenCV's labeling); the last stats
+            # column is used as the component size.
+            largest_component_id = np.argmax(stats[1:, -1]) + 1
+
+            # draw text on the largest component, as well as other very large components.
+            for cid in range(1, _num_cc):
+                if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
+                    # median is more stable than centroid
+                    # center = centroids[largest_component_id]
+                    # nonzero() yields (rows, cols); [::-1] turns the median into (x, y)
+                    center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
+                    self.draw_text(text, center, color=lighter_color)
+        return self.output
+
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, a darker shade
+                of the polygon color will be used instead.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # make edge color darker than the polygon color
+            if alpha > 0.8:
+                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
+            else:
+                edge_color = color
+        edge_color = mplc.to_rgb(edge_color) + (1,)
+
+        polygon = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=mplc.to_rgb(color) + (alpha,),
+            edgecolor=edge_color,
+            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
+        )
+        self.output.ax.add_patch(polygon)
+        return self.output
+
+    """
+    Internal methods:
+    """
+
+    def _jitter(self, color):
+        """
+        Randomly modifies given color to produce a slightly different color than the color given.
+
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        color = mplc.to_rgb(color)
+        vec = np.random.rand(3)
+        # better to do it in another color space
+        vec = vec / np.linalg.norm(vec) * 0.5
+        res = np.clip(vec + color, 0, 1)
+        return tuple(res)
+
+    def _create_grayscale_image(self, mask=None):
+        """
+        Create a grayscale version of the original image.
+        The colors in masked area, if given, will be kept.
+        """
+        img_bw = self.img.astype("f4").mean(axis=2)
+        img_bw = np.stack([img_bw] * 3, axis=2)
+        if mask is not None:
+            img_bw[mask] = self.img[mask]
+        return img_bw
+
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        less or more saturation than the original color.
+
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        color = mplc.to_rgb(color)
+        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+        return modified_color
+
+    def _convert_boxes(self, boxes):
+        """
+        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
+        """
+        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
+            return boxes.tensor.numpy()
+        else:
+            return np.asarray(boxes)
+
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Convert different format of masks or polygons to a tuple of masks and polygons.
+
+        Returns:
+            list[GenericMask]:
+        """
+
+        m = masks_or_polygons
+        if isinstance(m, PolygonMasks):
+            m = m.polygons
+        if isinstance(m, BitMasks):
+            m = m.tensor.numpy()
+        if isinstance(m, torch.Tensor):
+            m = m.numpy()
+        ret = []
+        for x in m:
+            if isinstance(x, GenericMask):
+                ret.append(x)
+            else:
+                ret.append(GenericMask(x, self.output.height, self.output.width))
+        return ret
+
+    def _convert_keypoints(self, keypoints):
+        if isinstance(keypoints, Keypoints):
+            keypoints = keypoints.tensor
+        keypoints = np.asarray(keypoints)
+        return keypoints
+
+    def get_output(self):
+        """
+        Return the canvas that the drawing methods of this object paint onto.
+
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        return self.output
diff --git a/dev/README.md b/dev/README.md
new file mode 100644
index 0000000..287f91f
--- /dev/null
+++ b/dev/README.md
@@ -0,0 +1,6 @@
+
+## Some scripts for developers to use, including:
+
+- `linter.sh`: lint the codebase before commit
+- `run_{inference,instant}_tests.sh`: run inference/training for a few iterations.
+- `parse_results.sh`: parse results from log file.
diff --git a/dev/linter.sh b/dev/linter.sh
new file mode 100644
index 0000000..cabae27
--- /dev/null
+++ b/dev/linter.sh
@@ -0,0 +1,35 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Run this script at project root by "./dev/linter.sh" before you commit
+
+# Refuse to run with any black release other than the pinned one.
+{
+	black --version | grep "19.3b0" > /dev/null
+} || {
+	echo "Linter requires black==19.3b0 !"
+	exit 1
+}
+
+# Echo each script line as it is read, so the log shows which tool is running.
+set -v
+
+echo "Running isort ..."
+isort -y --multi-line 3 --trailing-comma -sp . --skip datasets --skip docs --skip-glob '*/__init__.py' --atomic
+
+echo "Running black ..."
+black -l 100 .
+
+echo "Running flake8 ..."
+# Prefer the flake8-3 executable when it exists, otherwise run flake8 as a python3 module.
+if [ -x "$(command -v flake8-3)" ]; then
+  flake8-3 .
+else
+  python3 -m flake8 .
+fi
+
+# echo "Running mypy ..."
+# Pytorch does not have enough type annotations
+# mypy detectron2/solver detectron2/structures detectron2/config
+
+echo "Running clang-format ..."
+find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i
+
+# arc is optional. Use an explicit `if` instead of `command -v arc && arc lint`:
+# as the last command of the script, a failed `command -v` would otherwise become
+# the exit status of the whole script even though every linter above succeeded.
+if command -v arc > /dev/null; then
+  arc lint
+fi
diff --git a/dev/parse_results.sh b/dev/parse_results.sh
new file mode 100644
index 0000000..358cac8
--- /dev/null
+++ b/dev/parse_results.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# A shell script that parses metrics from the log file.
+# Make it easier for developers to track performance of models.
+# Usage: parse_results.sh /path/to/log/file
+
+LOG="$1"
+
+if [[ -z "$LOG" ]]; then
+	echo "Usage: $0 /path/to/log/file"
+	exit 1
+fi
+
+# [12/15 11:47:32] trainer INFO: Total training time: 12:15:04.446477 (0.4900 s / it)
+# [12/15 11:49:03] inference INFO: Total inference time: 0:01:25.326167 (0.13652186737060548 s / img per device, on 8 devices)
+
+# training time
+# NOTE(review): the grep below matches "Overall training...", while the sample line
+# above says "Total training time" - confirm which one the trainer really logs.
+trainspeed=$(grep -o 'Overall training.*' "$LOG" | grep -Eo '\(.*\)' | grep -o '[0-9\.]*')
+echo "Training speed: $trainspeed s/it"
+
+# inference time: there could be multiple inference during training
+# Keep the last "Total inference" line, then the first number inside its parentheses.
+inferencespeed=$(grep -o 'Total inference.*' "$LOG" | tail -n1 | grep -Eo '\(.*\)' | grep -o '[0-9\.]*' | head -n1)
+# The parsed number is seconds per image (see the sample line above), not per iteration.
+echo "Inference speed: $inferencespeed s/img"
+
+# [12/15 11:47:18] trainer INFO: eta: 0:00:00  iter: 90000  loss: 0.5407 (0.7256)  loss_classifier: 0.1744 (0.2446)  loss_box_reg: 0.0838 (0.1160)  loss_mask: 0.2159 (0.2722)  loss_objectness: 0.0244 (0.0429)  loss_rpn_box_reg: 0.0279 (0.0500)  time: 0.4487 (0.4899)  data: 0.0076 (0.0975) lr: 0.000200  max mem: 4161
+# Accepts both "max mem" and "max_mem"; the last occurrence in the log is reported.
+memory=$(grep -o 'max[_ ]mem: [0-9]*' "$LOG" | tail -n1 | grep -o '[0-9]*')
+echo "Training memory: $memory MB"
+
+echo "Easy to copypaste:"
+echo "$trainspeed","$inferencespeed","$memory"
+
+echo "------------------------------"
+
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: bbox
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0017,0.0024,0.0017,0.0005,0.0019,0.0011
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: segm
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl
+# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0014,0.0021,0.0016,0.0005,0.0016,0.0011
+
+echo "COCO Results:"
+# Number of distinct "Task: ..." lines, i.e. how many tasks were evaluated.
+num_tasks=$(grep -o 'copypaste:.*Task.*' "$LOG" | sort -u | wc -l)
+# each task has 3 lines
+grep -o 'copypaste:.*' "$LOG" | cut -d ' ' -f 2- | tail -n $((num_tasks * 3))
diff --git a/dev/run_inference_tests.sh b/dev/run_inference_tests.sh
new file mode 100644
index 0000000..17e422d
--- /dev/null
+++ b/dev/run_inference_tests.sh
@@ -0,0 +1,44 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Run tools/train_net.py in --eval-only mode on each config, then run demo/demo.py once.
+# Usage: run_inference_tests.sh [config.yaml ...]
+# Without arguments, every ./configs/quick_schedules/*inference_acc_test.yaml is used.
+# All paths are relative, so this has to be started from the project root.
+
+BIN="python tools/train_net.py"
+OUTPUT="inference_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "${@:1}" )
+
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    # $BIN is left unquoted on purpose: it holds a command plus its first argument.
+    $BIN \
+      --eval-only \
+      --num-gpus "$NUM_GPUS" \
+      --config-file "$cfg" \
+      OUTPUT_DIR "$OUTPUT"
+    # Separate command (not part of the invocation above): clean up after each config.
+    rm -rf "$OUTPUT"
+done
+
+
+echo "========================================================================"
+echo "Running demo.py ..."
+echo "========================================================================"
+DEMO_BIN="python demo/demo.py"
+COCO_DIR=datasets/coco/val2014
+mkdir -pv "$OUTPUT"
+
+set -v
+
+# The trailing * is kept outside the quotes so that the shell still expands the glob.
+$DEMO_BIN --config-file ./configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml \
+  --input "$COCO_DIR"/COCO_val2014_0000001933* --output "$OUTPUT"
+rm -rf "$OUTPUT"
diff --git a/dev/run_instant_tests.sh b/dev/run_instant_tests.sh
new file mode 100644
index 0000000..2c51de6
--- /dev/null
+++ b/dev/run_instant_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# Run tools/train_net.py once per config, removing the output directory after each run.
+# Usage: run_instant_tests.sh [config.yaml ...]
+# Without arguments, every ./configs/quick_schedules/*instant_test.yaml is used.
+
+BIN="python tools/train_net.py"
+OUTPUT="instant_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "$@" )
+if [[ ${#CFG_LIST[@]} -eq 0 ]]; then
+  CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    # The batch size is twice the number of GPUs.
+    $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \
+      SOLVER.IMS_PER_BATCH $((NUM_GPUS * 2)) \
+      OUTPUT_DIR "$OUTPUT"
+    rm -rf "$OUTPUT"
+done
+
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..e35d885
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+_build
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d537643
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# You can set these variables from the command line.
+# SPHINXOPTS:  extra options appended to every sphinx-build invocation below.
+# SPHINXBUILD: the sphinx-build executable to run.
+# SOURCEDIR:   directory passed to sphinx-build as the documentation source.
+# BUILDDIR:    directory passed to sphinx-build for the build output.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# "Makefile" is declared phony and is the prerequisite of the catch-all rule below.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+# $@ is the requested target name, e.g. "html" for "make html".
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..2af25c3
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,15 @@
+# Read the docs:
+
+The latest documentation built from this directory is available at [detectron2.readthedocs.io](https://detectron2.readthedocs.io/).
+
+# Build the docs:
+
+1. Install detectron2 according to [INSTALL.md](INSTALL.md).
+2. Install additional libraries required to build docs:
+  - docutils>=0.14
+  - Sphinx>=1.7
+  - recommonmark==0.4.0
+  - sphinx_rtd_theme
+  - mock
+
+3. Run `make html` from this directory.
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..a350a3d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,284 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# flake8: noqa
+
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+import mock
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+import sphinx_rtd_theme
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# to support markdown
+from recommonmark.parser import CommonMarkParser
+
+sys.path.insert(0, os.path.abspath("../"))
+
+DEPLOY = os.environ.get("READTHEDOCS") == "True"
+
+
+# -- Project information -----------------------------------------------------
+
+try:
+    import torch  # noqa
+except ImportError:
+    for m in [
+        "torch",
+        "torchvision",
+        "torch.nn",
+        "torch.nn.parallel",
+        "torch.distributed",
+        "torch.multiprocessing",
+        "torch.autograd",
+        "torch.autograd.function",
+        "torch.nn.modules",
+        "torch.nn.modules.utils",
+        "torch.utils",
+        "torch.utils.data",
+        "torchvision",
+        "torchvision.ops",
+    ]:
+        sys.modules[m] = mock.Mock(name=m)
+
+for m in [
+    "cv2",
+    "scipy",
+    "portalocker",
+    "detectron2._C",
+    "pycocotools",
+    "pycocotools.mask",
+    "pycocotools.coco",
+    "pycocotools.cocoeval",
+]:
+    sys.modules[m] = mock.Mock(name=m)
+sys.modules["cv2"].__version__ = "3.4"
+
+import detectron2  # isort: skip
+
+
+project = "detectron2"
+copyright = "2019, detectron2 contributors"
+author = "detectron2 contributors"
+
+# The short X.Y version
+version = detectron2.__version__
+# The full version, including alpha/beta/rc tags
+release = version
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+needs_sphinx = "1.7"
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.todo",
+    "sphinx.ext.coverage",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+]
+
+# -- Configurations for plugins ------------
+napoleon_google_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_special_with_doc = True
+napoleon_numpy_docstring = False
+napoleon_use_rtype = False
+autodoc_inherit_docstrings = False
+autodoc_member_order = "bysource"
+
+if DEPLOY:
+    intersphinx_timeout = 10
+else:
+    # skip this when building locally
+    intersphinx_timeout = 0.1
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3.6", None),
+    "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+    "torch": ("https://pytorch.org/docs/master/", None),
+}
+# -------------------------
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+source_parsers = {".md": CommonMarkParser}
+
+source_suffix = [".rst", ".md"]
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "build", "README.md"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself.  Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "detectron2doc"
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, "detectron2.tex", "detectron2 Documentation", "detectron2 contributors", "manual")
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "detectron2", "detectron2 Documentation", [author], 1)]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (
+        master_doc,
+        "detectron2",
+        "detectron2 Documentation",
+        author,
+        "detectron2",
+        "One line description of project.",
+        "Miscellaneous",
+    )
+]
+
+
+# -- Options for todo extension ----------------------------------------------
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = True
+
+
+_DEPRECATED_NAMES = set(["out_feature_channels", "out_feature_strides", "out_features"])
+
+
+def autodoc_skip_member(app, what, name, obj, skip, options):
+    # we hide something deliberately
+    if getattr(obj, "__HIDE_SPHINX_DOC__", False):
+        return True
+    # Hide some names that are deprecated or not intended to be used
+    if name in _DEPRECATED_NAMES:
+        return True
+    return None
+
+
+def url_resolver(url):
+    if ".html" not in url:
+        url = url.replace("../", "")
+        return "https://github.com/facebookresearch/detectron2/blob/master/" + url
+    else:
+        if DEPLOY:
+            return "http://detectron2.readthedocs.io/" + url
+        else:
+            return "/" + url
+
+
+def setup(app):
+    from recommonmark.transform import AutoStructify
+
+    app.connect("autodoc-skip-member", autodoc_skip_member)
+    # app.connect('autodoc-skip-member', autodoc_skip_member)
+    app.add_config_value(
+        "recommonmark_config",
+        {
+            "url_resolver": url_resolver,
+            "auto_toc_tree_section": "Contents",
+            "enable_math": True,
+            "enable_inline_math": True,
+            "enable_eval_rst": True,
+        },
+        True,
+    )
+    app.add_transform(AutoStructify)
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..8634b7b
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,14 @@
+.. detectron2 documentation master file, created by
+   sphinx-quickstart on Sat Sep 21 13:46:45 2019.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to detectron2's documentation!
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   tutorials/index
+   notes/index
+   modules/index
diff --git a/docs/modules/checkpoint.rst b/docs/modules/checkpoint.rst
new file mode 100644
index 0000000..616cb18
--- /dev/null
+++ b/docs/modules/checkpoint.rst
@@ -0,0 +1,7 @@
+detectron2.checkpoint package
+=============================
+
+.. automodule:: detectron2.checkpoint
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/config.rst b/docs/modules/config.rst
new file mode 100644
index 0000000..8324e61
--- /dev/null
+++ b/docs/modules/config.rst
@@ -0,0 +1,16 @@
+detectron2.config package
+=========================
+
+.. automodule:: detectron2.config
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Config References
+-----------------
+
+.. literalinclude:: ../../detectron2/config/defaults.py
+  :language: python
+  :linenos:
+  :lines: 4-
diff --git a/docs/modules/data.rst b/docs/modules/data.rst
new file mode 100644
index 0000000..3697f0e
--- /dev/null
+++ b/docs/modules/data.rst
@@ -0,0 +1,40 @@
+detectron2.data package
+=======================
+
+.. automodule:: detectron2.data
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.detection\_utils module
+---------------------------------------
+
+.. automodule:: detectron2.data.detection_utils
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.datasets module
+---------------------------------------
+
+.. automodule:: detectron2.data.datasets
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.data.samplers module
+---------------------------------------
+
+.. automodule:: detectron2.data.samplers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.data.transforms module
+---------------------------------------
+
+.. automodule:: detectron2.data.transforms
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/engine.rst b/docs/modules/engine.rst
new file mode 100644
index 0000000..bb8b533
--- /dev/null
+++ b/docs/modules/engine.rst
@@ -0,0 +1,25 @@
+detectron2.engine package
+=========================
+
+
+.. automodule:: detectron2.engine
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.engine.defaults module
+---------------------------------
+
+.. automodule:: detectron2.engine.defaults
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.engine.hooks module
+---------------------------------
+
+.. automodule:: detectron2.engine.hooks
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/evaluation.rst b/docs/modules/evaluation.rst
new file mode 100644
index 0000000..d9d34ff
--- /dev/null
+++ b/docs/modules/evaluation.rst
@@ -0,0 +1,7 @@
+detectron2.evaluation package
+=============================
+
+.. automodule:: detectron2.evaluation
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/index.rst b/docs/modules/index.rst
new file mode 100644
index 0000000..2e3cef7
--- /dev/null
+++ b/docs/modules/index.rst
@@ -0,0 +1,16 @@
+API Documentation
+==================
+
+.. toctree::
+
+    checkpoint
+    config
+    data
+    engine
+    evaluation
+    layers
+    model_zoo
+    modeling
+    solver
+    structures
+    utils
diff --git a/docs/modules/layers.rst b/docs/modules/layers.rst
new file mode 100644
index 0000000..6aeb521
--- /dev/null
+++ b/docs/modules/layers.rst
@@ -0,0 +1,7 @@
+detectron2.layers package
+=========================
+
+.. automodule:: detectron2.layers
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/model_zoo.rst b/docs/modules/model_zoo.rst
new file mode 100644
index 0000000..8b1c7d5
--- /dev/null
+++ b/docs/modules/model_zoo.rst
@@ -0,0 +1,7 @@
+detectron2.model_zoo package
+============================
+
+.. automodule:: detectron2.model_zoo
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/modeling.rst b/docs/modules/modeling.rst
new file mode 100644
index 0000000..f00daa5
--- /dev/null
+++ b/docs/modules/modeling.rst
@@ -0,0 +1,56 @@
+detectron2.modeling package
+===========================
+
+.. automodule:: detectron2.modeling
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.poolers module
+---------------------------------------
+
+.. automodule:: detectron2.modeling.poolers
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.sampling module
+------------------------------------
+
+.. automodule:: detectron2.modeling.sampling
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.modeling.box_regression module
+------------------------------------------
+
+.. automodule:: detectron2.modeling.box_regression
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+Model Registries
+-----------------
+
+These are different registries provided in modeling.
+Each registry provides you the ability to replace it with your customized component,
+without having to modify detectron2's code.
+
+Note that it is impossible to allow users to customize any line of code directly.
+Even just to add one line at some place,
+you'll likely need to find out the smallest registry which contains that line,
+and register your component to that registry.
+
+
+.. autodata:: detectron2.modeling.META_ARCH_REGISTRY
+.. autodata:: detectron2.modeling.BACKBONE_REGISTRY
+.. autodata:: detectron2.modeling.PROPOSAL_GENERATOR_REGISTRY
+.. autodata:: detectron2.modeling.ROI_HEADS_REGISTRY
+.. autodata:: detectron2.modeling.ROI_BOX_HEAD_REGISTRY
+.. autodata:: detectron2.modeling.ROI_MASK_HEAD_REGISTRY
+.. autodata:: detectron2.modeling.ROI_KEYPOINT_HEAD_REGISTRY
diff --git a/docs/modules/solver.rst b/docs/modules/solver.rst
new file mode 100644
index 0000000..7f4a49f
--- /dev/null
+++ b/docs/modules/solver.rst
@@ -0,0 +1,7 @@
+detectron2.solver package
+=========================
+
+.. automodule:: detectron2.solver
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/structures.rst b/docs/modules/structures.rst
new file mode 100644
index 0000000..5701c61
--- /dev/null
+++ b/docs/modules/structures.rst
@@ -0,0 +1,7 @@
+detectron2.structures package
+=============================
+
+.. automodule:: detectron2.structures
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/modules/utils.rst b/docs/modules/utils.rst
new file mode 100644
index 0000000..7693b3c
--- /dev/null
+++ b/docs/modules/utils.rst
@@ -0,0 +1,63 @@
+detectron2.utils package
+========================
+
+detectron2.utils.colormap module
+--------------------------------
+
+.. automodule:: detectron2.utils.colormap
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.utils.comm module
+----------------------------
+
+.. automodule:: detectron2.utils.comm
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.events module
+------------------------------
+
+.. automodule:: detectron2.utils.events
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.logger module
+------------------------------
+
+.. automodule:: detectron2.utils.logger
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.registry module
+--------------------------------
+
+.. automodule:: detectron2.utils.registry
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+
+detectron2.utils.video\_visualizer module
+-----------------------------------------
+
+.. automodule:: detectron2.utils.video_visualizer
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+detectron2.utils.visualizer module
+----------------------------------
+
+.. automodule:: detectron2.utils.visualizer
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
diff --git a/docs/notes/benchmarks.md b/docs/notes/benchmarks.md
new file mode 100644
index 0000000..1ca3474
--- /dev/null
+++ b/docs/notes/benchmarks.md
@@ -0,0 +1,215 @@
+
+# Benchmarks
+
+Here we benchmark the training speed of a Mask R-CNN in detectron2
+and compare it with some other popular open source Mask R-CNN implementations.
+
+
+### Settings
+
+* Hardware: 8 NVIDIA V100s with NVLink.
+* Software: Python 3.7, CUDA 10.0, cuDNN 7.6.4, PyTorch 1.3.0 (at
+  [this link](https://download.pytorch.org/whl/nightly/cu100/torch-1.3.0%2Bcu100-cp37-cp37m-linux_x86_64.whl)),
+  TensorFlow 1.5.0rc2, Keras 2.2.5, MxNet 1.6.0b20190820.
+* Model: an end-to-end R-50-FPN Mask-RCNN model, using the same hyperparameters as the
+	[Detectron baseline config](https://github.com/facebookresearch/Detectron/blob/master/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml).
+* Metrics: We use the average throughput in iterations 100-500 to skip GPU warmup time.
+	Note that for R-CNN-style models, the throughput of a model typically changes during training, because
+	it depends on the predictions of the model. Therefore this metric is not directly comparable with
+	"train speed" in model zoo, which is the average speed of the entire training run.
+
+
+### Main Results
+
+```eval_rst
++-----------------------------+--------------------+
+| Implementation              | Throughput (img/s) |
++=============================+====================+
+| Detectron2                  | 59                 |
++-----------------------------+--------------------+
+| maskrcnn-benchmark_         | 51                 |
++-----------------------------+--------------------+
+| tensorpack_                 | 50                 |
++-----------------------------+--------------------+
+| mmdetection_                | 41                 |
++-----------------------------+--------------------+
+| simpledet_                  | 39                 |
++-----------------------------+--------------------+
+| Detectron_                  | 19                 |
++-----------------------------+--------------------+
+| `matterport/Mask_RCNN`__    | 14                 |
++-----------------------------+--------------------+
+
+.. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark/
+.. _tensorpack: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN
+.. _mmdetection: https://github.com/open-mmlab/mmdetection/
+.. _simpledet: https://github.com/TuSimple/simpledet/
+.. _Detectron: https://github.com/facebookresearch/Detectron
+__ https://github.com/matterport/Mask_RCNN/
+```
+
+
+Details for each implementation:
+
+* __Detectron2__:
+  ```
+  python tools/train_net.py  --config-file configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml --num-gpus 8
+  ```
+
+* __maskrcnn-benchmark__: use commit `0ce8f6f` with `sed -i 's/torch.uint8/torch.bool/g' **/*.py` to make it compatible with latest PyTorch.
+  Then, run training with
+  ```
+  python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
+  ```
+  The speed we observed is faster than its model zoo, likely due to different software versions.
+
+* __tensorpack__: at commit `caafda`, `export TF_CUDNN_USE_AUTOTUNE=0`, then run
+  ```
+  mpirun -np 8 ./train.py --config DATA.BASEDIR=/data/coco TRAINER=horovod BACKBONE.STRIDE_1X1=True TRAIN.STEPS_PER_EPOCH=50 --load ImageNet-R50-AlignPadding.npz
+  ```
+
+* __mmdetection__: at commit `4d9a5f`, apply the following diff, then run
+	```
+	./tools/dist_train.sh configs/mask_rcnn_r50_fpn_1x.py 8
+	```
+
+    The speed we observed is faster than its model zoo, likely due to different software versions.
+
+	<details>
+	<summary>
+	(diff to make it use the same architecture - click to expand)
+	</summary>
+	```diff
+	diff --git i/configs/mask_rcnn_r50_fpn_1x.py w/configs/mask_rcnn_r50_fpn_1x.py
+	index 04f6d22..ed721f2 100644
+	--- i/configs/mask_rcnn_r50_fpn_1x.py
+	+++ w/configs/mask_rcnn_r50_fpn_1x.py
+	@@ -1,14 +1,15 @@
+	# model settings
+	model = dict(
+		type='MaskRCNN',
+	-    pretrained='torchvision://resnet50',
+	+    pretrained='open-mmlab://resnet50_caffe',
+		backbone=dict(
+			type='ResNet',
+			depth=50,
+			num_stages=4,
+			out_indices=(0, 1, 2, 3),
+			frozen_stages=1,
+	-        style='pytorch'),
+	+        norm_cfg=dict(type="BN", requires_grad=False),
+	+        style='caffe'),
+		neck=dict(
+			type='FPN',
+			in_channels=[256, 512, 1024, 2048],
+	@@ -115,7 +116,7 @@ test_cfg = dict(
+	dataset_type = 'CocoDataset'
+	data_root = 'data/coco/'
+	img_norm_cfg = dict(
+	-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+	+    mean=[123.675, 116.28, 103.53], std=[1.0, 1.0, 1.0], to_rgb=False)
+	train_pipeline = [
+		dict(type='LoadImageFromFile'),
+		dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
+	```
+	</details>
+
+* __SimpleDet__: at commit `9187a1`, run
+	```
+	python detection_train.py --config config/mask_r50v1_fpn_1x.py
+	```
+
+* __Detectron__: run
+  ```
+  python tools/train_net.py --cfg configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml
+  ```
+  Note that many of its ops run on CPUs, therefore the performance is limited.
+
+* __matterport/Mask_RCNN__: at commit `3deaec`, apply the following diff, `export TF_CUDNN_USE_AUTOTUNE=0`, then run
+	```
+	python coco.py train --dataset=/data/coco/ --model=imagenet
+	```
+  Note that many small details in this implementation might be different
+  from Detectron's standards.
+
+	<details>
+	<summary>
+	(diff to make it use the same hyperparameters - click to expand)
+	</summary>
+	```diff
+	diff --git i/mrcnn/model.py w/mrcnn/model.py
+	index 62cb2b0..61d7779 100644
+	--- i/mrcnn/model.py
+	+++ w/mrcnn/model.py
+	@@ -2367,8 +2367,8 @@ class MaskRCNN():
+				epochs=epochs,
+				steps_per_epoch=self.config.STEPS_PER_EPOCH,
+				callbacks=callbacks,
+	-            validation_data=val_generator,
+	-            validation_steps=self.config.VALIDATION_STEPS,
+	+            #validation_data=val_generator,
+	+            #validation_steps=self.config.VALIDATION_STEPS,
+				max_queue_size=100,
+				workers=workers,
+				use_multiprocessing=True,
+	diff --git i/mrcnn/parallel_model.py w/mrcnn/parallel_model.py
+	index d2bf53b..060172a 100644
+	--- i/mrcnn/parallel_model.py
+	+++ w/mrcnn/parallel_model.py
+	@@ -32,6 +32,7 @@ class ParallelModel(KM.Model):
+			keras_model: The Keras model to parallelize
+			gpu_count: Number of GPUs. Must be > 1
+			"""
+	+        super().__init__()
+			self.inner_model = keras_model
+			self.gpu_count = gpu_count
+			merged_outputs = self.make_parallel()
+	diff --git i/samples/coco/coco.py w/samples/coco/coco.py
+	index 5d172b5..239ed75 100644
+	--- i/samples/coco/coco.py
+	+++ w/samples/coco/coco.py
+	@@ -81,7 +81,10 @@ class CocoConfig(Config):
+		IMAGES_PER_GPU = 2
+
+		# Uncomment to train on 8 GPUs (default is 1)
+	-    # GPU_COUNT = 8
+	+    GPU_COUNT = 8
+	+    BACKBONE = "resnet50"
+	+    STEPS_PER_EPOCH = 50
+	+    TRAIN_ROIS_PER_IMAGE = 512
+
+		# Number of classes (including background)
+		NUM_CLASSES = 1 + 80  # COCO has 80 classes
+	@@ -496,29 +499,10 @@ if __name__ == '__main__':
+			# *** This training schedule is an example. Update to your needs ***
+
+			# Training - Stage 1
+	-        print("Training network heads")
+			model.train(dataset_train, dataset_val,
+						learning_rate=config.LEARNING_RATE,
+						epochs=40,
+	-                    layers='heads',
+	-                    augmentation=augmentation)
+	-
+	-        # Training - Stage 2
+	-        # Finetune layers from ResNet stage 4 and up
+	-        print("Fine tune Resnet stage 4 and up")
+	-        model.train(dataset_train, dataset_val,
+	-                    learning_rate=config.LEARNING_RATE,
+	-                    epochs=120,
+	-                    layers='4+',
+	-                    augmentation=augmentation)
+	-
+	-        # Training - Stage 3
+	-        # Fine tune all layers
+	-        print("Fine tune all layers")
+	-        model.train(dataset_train, dataset_val,
+	-                    learning_rate=config.LEARNING_RATE / 10,
+	-                    epochs=160,
+	-                    layers='all',
+	+                    layers='3+',
+						augmentation=augmentation)
+
+		elif args.command == "evaluate":
+	```
+	</details>
diff --git a/docs/notes/changelog.md b/docs/notes/changelog.md
new file mode 100644
index 0000000..b59163b
--- /dev/null
+++ b/docs/notes/changelog.md
@@ -0,0 +1,12 @@
+# Change Log
+
+
+### Notable Changes:
+
+* 2019-11-11: `detectron2.data.detection_utils.read_image` transposes images with exif information.
+* 2019-10-10: initial release.
+
+### Config Version Change Log
+
+* v1: Rename `RPN_HEAD.NAME` to `RPN.HEAD_NAME`.
+* v2: A batch of renames of many configurations before release.
diff --git a/docs/notes/compatibility.md b/docs/notes/compatibility.md
new file mode 100644
index 0000000..94bd4b0
--- /dev/null
+++ b/docs/notes/compatibility.md
@@ -0,0 +1,85 @@
+# Compatibility with Other Libraries
+
+## Compatibility with Detectron
+
+Detectron2 addresses some legacy issues left in Detectron. As a result, their models
+are not compatible:
+running inference with the same model weights will produce different results in the two code bases.
+
+The major differences regarding inference are:
+
+- The height and width of a box with corners (x1, y1) and (x2, y2) is now computed more naturally as
+	width = x2 - x1 and height = y2 - y1;
+	In Detectron, a "+ 1" was added to both height and width.
+
+	Note that the relevant ops in Caffe2 have [adopted this change of convention](https://github.com/pytorch/pytorch/pull/20550)
+	with an extra option.
+	So it is still possible to run inference with a Detectron2-trained model in Caffe2.
+
+	The change in height/width calculations most notably changes:
+	- encoding/decoding in bounding box regression.
+	- non-maximum suppression. The effect here is very negligible, though.
+
+- RPN now uses simpler anchors with fewer quantization artifacts.
+
+  In Detectron, the anchors were quantized and
+  [do not have accurate areas](https://github.com/facebookresearch/Detectron/issues/227).
+  In Detectron2, the anchors are center-aligned to feature grid points and not quantized.
+
+- Classification layers have a different ordering of class labels.
+
+	This involves any trainable parameter with shape (..., num_categories + 1, ...).
+	In Detectron2, integer labels [0, K-1] correspond to the K = num_categories object categories
+	and the label "K" corresponds to the special "background" category.
+	In Detectron, label "0" means background, and labels [1, K] correspond to the K categories.
+
+- ROIAlign is implemented differently. The new implementation is [available in Caffe2](https://github.com/pytorch/pytorch/pull/23706).
+
+  1. All the ROIs are shifted by half a pixel compared to Detectron in order to create better image-feature-map alignment.
+	   See `layers/roi_align.py` for details.
+     To enable the old behavior, use `ROIAlign(aligned=False)`, or `POOLER_TYPE=ROIAlign` instead of
+     `ROIAlignV2` (the default).
+
+  1. The ROIs are not required to have a minimum size of 1.
+     This will lead to tiny differences in the output, but should be negligible.
+
+- Mask inference function is different.
+
+	In Detectron2, the "paste_mask" function is different and should be more accurate than in Detectron. This change
+	can improve mask AP on COCO by ~0.5% absolute.
+
+There are some other differences in training as well, but they won't affect
+model-level compatibility. The major ones are:
+
+- We fixed a [bug](https://github.com/facebookresearch/Detectron/issues/459) in
+  Detectron, by making `RPN.POST_NMS_TOPK_TRAIN` per-image, rather than per-batch.
+  The fix may lead to a small accuracy drop for a few models (e.g. keypoint
+  detection) and will require some parameter tuning to match the Detectron results.
+- For simplicity, we change the default loss in bounding box regression to L1 loss, instead of smooth L1 loss.
+  We have observed that this tends to slightly decrease box AP50 while improving box AP for higher
+	overlap thresholds (and leading to a slight overall improvement in box AP).
+- We interpret the coordinates in COCO bounding box and segmentation annotations
+  as coordinates in range `[0, width]` or `[0, height]`, and the coordinates in
+  COCO keypoint annotations are pixel indices in range `[0, width - 1]` or `[0, height - 1]`.
+
+
+We will later share more details and rationale behind the above mentioned issues
+about pixels, coordinates, and "+1"s.
+
+
+## Compatibility with Caffe2
+
+As mentioned above, despite the incompatibilities with Detectron, the relevant
+ops have been implemented in Caffe2, in [PR1](https://github.com/pytorch/pytorch/pull/20550)
+and [PR2](https://github.com/pytorch/pytorch/pull/23706).
+Therefore, models trained with detectron2 can be used in Caffe2.
+
+A conversion script will be available later, for easier deployment.
+
+
+## Compatibility with TensorFlow
+
+Most ops are available in TensorFlow, although some tiny differences in
+the implementation of resize / ROIAlign / padding need to be addressed.
+A working conversion script is provided by [tensorpack FasterRCNN](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN/convert_d2)
+to run a standard Detectron2 model in TensorFlow.
diff --git a/docs/notes/index.rst b/docs/notes/index.rst
new file mode 100644
index 0000000..0c1b65d
--- /dev/null
+++ b/docs/notes/index.rst
@@ -0,0 +1,9 @@
+Notes
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   benchmarks
+   compatibility
+   changelog
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..886e611
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,17 @@
+termcolor
+numpy
+tqdm
+docutils>=0.14
+Sphinx>=1.7
+recommonmark==0.4.0
+sphinx_rtd_theme
+mock
+matplotlib
+termcolor
+yacs
+tabulate
+cloudpickle
+Pillow
+git+git://github.com/facebookresearch/fvcore.git
+https://download.pytorch.org/whl/nightly/cpu/torch-1.3.0.dev20191010%2Bcpu-cp37-cp37m-linux_x86_64.whl
+https://download.pytorch.org/whl/nightly/cpu/torchvision-0.5.0.dev20191008%2Bcpu-cp37-cp37m-linux_x86_64.whl
diff --git a/docs/tutorials/configs.md b/docs/tutorials/configs.md
new file mode 100644
index 0000000..6b7ba78
--- /dev/null
+++ b/docs/tutorials/configs.md
@@ -0,0 +1,43 @@
+# Use Configs
+
+Detectron2's config system uses yaml and [yacs](https://github.com/rbgirshick/yacs).
+In addition to the basic operations that access and update a config, we provide
+the following extra functionalities:
+
+1. The config can have `_BASE_: base.yaml` field, which will load a base config first.
+   Values in the base config will be overwritten in sub-configs, if there are any conflicts.
+   We provided several base configs for standard model architectures.
+2. We provide config versioning, for backward compatibility.
+   If your config file is versioned with a config line like `VERSION: 2`,
+   detectron2 will still recognize it even if we rename some keys in the future.
+
+### Use Configs
+
+Some basic usage of the `CfgNode` object is shown below:
+```python
+from detectron2.config import get_cfg
+cfg = get_cfg()    # obtain detectron2's default config
+cfg.xxx = yyy      # add new configs for your own custom components
+cfg.merge_from_file("my_cfg.yaml")   # load values from a file
+
+cfg.merge_from_list(["MODEL.WEIGHTS", "weights.pth"])   # can also load values from a list of str
+```
+
+To see a list of available configs in detectron2, see [Config References](../modules/config.html#config-references)
+
+
+### Best Practice with Configs
+
+1. Treat the configs you write as "code": avoid copying them or duplicating them; use "_BASE_"
+   instead to share common parts between configs.
+
+2. Keep the configs you write simple: don't include keys that do not affect the experimental setting.
+
+3. Keep a version number in your configs (or the base config), e.g., `VERSION: 2`,
+   for backward compatibility.
+   The builtin configs do not include version number because they are meant to
+   be always up-to-date.
+
+4. Save a full config together with a trained model, and use it to run inference.
+   This is more robust to changes that may happen to the config definition
+   (e.g., if a default value changed).
diff --git a/docs/tutorials/data_loading.md b/docs/tutorials/data_loading.md
new file mode 100644
index 0000000..a4ee720
--- /dev/null
+++ b/docs/tutorials/data_loading.md
@@ -0,0 +1,48 @@
+
+# Use Custom Dataloaders
+
+## How the Existing Dataloader Works
+
+Detectron2 contains a builtin data loading pipeline.
+It's good to understand how it works, in case you need to write a custom one.
+
+Detectron2 provides two functions
+[build_detection_{train,test}_loader](../modules/data.html#detectron2.data.build_detection_train_loader)
+that create a default data loader from a given config.
+Here is how `build_detection_{train,test}_loader` work:
+
+1. It takes the name of the dataset (e.g., "coco_2017_train") and loads a `list[dict]` representing the dataset items
+   in a lightweight, canonical format. These dataset items are not yet ready to be used by the model (e.g., images are
+   not loaded into memory, random augmentations have not been applied, etc.).
+   Details about the dataset format and dataset registration can be found in
+   [datasets](datasets.html).
+2. Each dict in this list is mapped by a function ("mapper"):
+	 * Users can customize this mapping function by specifying the "mapper" argument in
+        `build_detection_{train,test}_loader`. The default mapper is [DatasetMapper]( ../modules/data.html#detectron2.data.DatasetMapper)
+	 * The output format of such function can be arbitrary, as long as it is accepted by the consumer of this data loader (usually the model).
+   * The role of the mapper is to transform the lightweight, canonical representation of a dataset item into a format
+     that is ready for the model to consume (including, e.g., read images, perform random data augmentation and convert to torch Tensors).
+	 The output format of the default mapper is explained below.
+3. The outputs of the mapper are batched (simply into a list).
+4. This batched data is the output of the data loader. Typically, it's also the input of
+   `model.forward()`.
+
+
+## Write a Custom Dataloader
+
+Using a different "mapper" with `build_detection_{train,test}_loader(mapper=)` works for most use cases
+of custom data loading. Refer to [API documentation](../modules/data.html) for details.
+
+If you want to do something different (e.g., use different sampling or batching logic),
+you can write your own data loader. The data loader is simply a
+python iterator that produces [the format](models.html) your model accepts. 
+You can implement it using any tools you like.
+
+## Use a Custom Dataloader
+
+If you use [DefaultTrainer](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer), 
+you can overwrite its `build_{train,test}_loader` method to use your own dataloader.
+See the [densepose dataloader](/projects/DensePose/train_net.py)
+for an example.
+
+If you write your own training loop, you can plug in your data loader easily.
diff --git a/docs/tutorials/datasets.md b/docs/tutorials/datasets.md
new file mode 100644
index 0000000..f71ff64
--- /dev/null
+++ b/docs/tutorials/datasets.md
@@ -0,0 +1,179 @@
+# Use Custom Datasets
+
+If you want to use a custom dataset while also reusing detectron2's data loaders,
+you will need to
+
+1. Register your dataset (i.e., tell detectron2 how to obtain your dataset).
+2. Optionally, register metadata for your dataset.
+
+Next, we explain the above two concepts in detail.
+
+The [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has a working example of how to register and train on a dataset of custom formats.
+
+
+### Register a Dataset
+
+To let detectron2 know how to obtain a dataset named "my_dataset", you will implement
+a function that returns the items in your dataset and then tell detectron2 about this
+function:
+```python
+def get_dicts():
+  ...
+  return list[dict] in the following format
+
+from detectron2.data import DatasetCatalog
+DatasetCatalog.register("my_dataset", get_dicts)
+```
+
+Here, the snippet associates a dataset "my_dataset" with a function that returns the data.
+If you do not modify downstream code (i.e., you use the standard data loader and data mapper),
+then the function has to return a list of dicts in detectron2's standard dataset format, described
+next. You can also use arbitrary custom data format, as long as the
+downstream code (mainly the [custom data loader](data_loading.html)) supports it.
+
+For standard tasks
+(instance detection, instance/semantic/panoptic segmentation, keypoint detection),
+we use a format similar to COCO's json annotations
+as the basic dataset representation.
+
+The format uses one dict to represent the annotations of
+one image. The dict may have the following fields.
+The fields are often optional, and some functions may be able to
+infer certain fields from others if needed, e.g., the data loader
+will load the image from "file_name" and load "sem_seg" from "sem_seg_file_name".
+
++ `file_name`: the full path to the image file. Will apply rotation and flipping if the image has such exif information.
++ `sem_seg_file_name`: the full path to the ground truth semantic segmentation file.
++ `sem_seg`: semantic segmentation ground truth in a 2D `torch.Tensor`. Values in the array represent
+   category labels.
++ `height`, `width`: integer. The shape of image.
++ `image_id` (str or int): a unique id that identifies this image. Used
+	during evaluation to identify the images, but a dataset may use it for different purposes.
++ `annotations` (list[dict]): the per-instance annotations of every
+  instance in this image. Each annotation dict may contain:
+  + `bbox` (list[float]): list of 4 numbers representing the bounding box of the instance.
+  + `bbox_mode` (int): the format of bbox.
+    It must be a member of
+    [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode).
+    Currently supports: `BoxMode.XYXY_ABS`, `BoxMode.XYWH_ABS`.
+  + `category_id` (int): an integer in the range [0, num_categories) representing the category label.
+    The value num_categories is reserved to represent the "background" category, if applicable.
+  + `segmentation` (list[list[float]] or dict):
+    + If `list[list[float]]`, it represents a list of polygons, one for each connected component
+      of the object. Each `list[float]` is one simple polygon in the format of `[x1, y1, ..., xn, yn]`.
+      The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates,
+      depending on whether "bbox_mode" is relative.
+    + If `dict`, it represents the per-pixel segmentation mask in COCO's RLE format.
+  + `keypoints` (list[float]): in the format of [x1, y1, v1,..., xn, yn, vn].
+    v[i] means the [visibility](http://cocodataset.org/#format-data) of this keypoint.
+    `n` must be equal to the number of keypoint categories.
+    The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates,
+    depending on whether "bbox_mode" is relative.
+
+    Note that the coordinate annotations in COCO format are integers in range [0, H-1 or W-1].
+    By default, detectron2 adds 0.5 to absolute keypoint coordinates to convert them from discrete
+    pixel indices to floating point coordinates.
+  + `iscrowd`: 0 or 1. Whether this instance is labeled as COCO's "crowd region".
++ `proposal_boxes` (array): 2D numpy array with shape (K, 4) representing K precomputed proposal boxes for this image.
++ `proposal_objectness_logits` (array): numpy array with shape (K, ), which corresponds to the objectness
+  logits of proposals in 'proposal_boxes'.
++ `proposal_bbox_mode` (int): the format of the precomputed proposal bbox.
+  It must be a member of
+  [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode).
+  Default format is `BoxMode.XYXY_ABS`.
+
+
+If your dataset is already in the COCO format, you can simply register it by
+```python
+from detectron2.data.datasets import register_coco_instances
+register_coco_instances("my_dataset", {}, "json_annotation.json", "path/to/image/dir")
+```
+which will take care of everything (including metadata) for you.
+
+If your dataset is in COCO format with custom per-instance annotations,
+the [load_coco_json](../modules/data.html#detectron2.data.datasets.load_coco_json) function can be used.
+
+
+### "Metadata" for Datasets
+
+Each dataset is associated with some metadata, accessible through
+`MetadataCatalog.get(dataset_name).some_metadata`.
+Metadata is a key-value mapping that contains primitive information that helps interpret what's in the dataset, e.g.,
+names of classes, colors of classes, root of files, etc.
+This information will be useful for augmentation, evaluation, visualization, logging, etc.
+The structure of metadata depends on what is needed by the corresponding downstream code.
+
+
+If you register a new dataset through `DatasetCatalog.register`,
+you may also want to add its corresponding metadata through
+`MetadataCatalog.get(dataset_name).set(name, value)`, to enable any features that need metadata.
+You can do it like this (using the metadata field "thing_classes" as an example):
+
+```python
+from detectron2.data import MetadataCatalog
+MetadataCatalog.get("my_dataset").thing_classes = ["person", "dog"]
+```
+
+Here is a list of metadata keys that are used by builtin features in detectron2.
+If you add your own dataset without these metadata, some features may be
+unavailable to you:
+
+* `thing_classes` (list[str]): Used by all instance detection/segmentation tasks.
+  A list of names for each instance/thing category.
+  If you load a COCO format dataset, it will be automatically set by the function `load_coco_json`.
+
+* `stuff_classes` (list[str]): Used by semantic and panoptic segmentation tasks.
+  A list of names for each stuff category.
+
+* `stuff_colors` (list[tuple(r, g, b)]): Pre-defined color (in [0, 255]) for each stuff category.
+  Used for visualization. If not given, random colors are used.
+
+* `keypoint_names` (list[str]): Used by keypoint localization. A list of names for each keypoint.
+
+* `keypoint_flip_map` (list[tuple[str]]): Used by the keypoint localization task. A list of pairs of names,
+  where each pair are the two keypoints that should be flipped if the image is
+  flipped during augmentation.
+* `keypoint_connection_rules`: list[tuple(str, str, (r, g, b))]. Each tuple specifies a pair of keypoints
+  that are connected and the color to use for the line between them when visualized.
+
+Some additional metadata that are specific to the evaluation of certain datasets (e.g. COCO):
+
+* `thing_dataset_id_to_contiguous_id` (dict[int->int]): Used by all instance detection/segmentation tasks in the COCO format.
+  A mapping from instance class ids in the dataset to contiguous ids in range [0, #class).
+  Will be automatically set by the function `load_coco_json`.
+
+* `stuff_dataset_id_to_contiguous_id` (dict[int->int]): Used when generating prediction json files for
+  semantic/panoptic segmentation.
+  A mapping from semantic segmentation class ids in the dataset
+  to contiguous ids in [0, num_categories). It is useful for evaluation only.
+
+* `json_file`: The COCO annotation json file. Used by COCO evaluation for COCO-format datasets.
+* `panoptic_root`, `panoptic_json`: Used by panoptic evaluation.
+* `evaluator_type`: Used by the builtin main training script to select
+   evaluator. No need to use it if you write your own main script.
+   You can just provide the [DatasetEvaluator](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluator)
+   for your dataset directly in your main script.
+
+NOTE: For background on the difference between "thing" and "stuff" categories, see
+[On Seeing Stuff: The Perception of Materials by Humans and Machines](http://persci.mit.edu/pub_pdfs/adelson_spie_01.pdf).
+In detectron2, the term "thing" is used for instance-level tasks,
+and "stuff" is used for semantic segmentation tasks.
+Both are used in panoptic segmentation.
+
+
+### Update the Config for New Datasets
+
+Once you've registered the dataset, you can use the name of the dataset (e.g., "my_dataset" in
+example above) in `DATASETS.{TRAIN,TEST}`.
+There are other configs you might want to change to train or evaluate on new datasets:
+
+* `MODEL.ROI_HEADS.NUM_CLASSES` and `MODEL.RETINANET.NUM_CLASSES` are the number of thing classes
+	for R-CNN and RetinaNet models.
+* `MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS` sets the number of keypoints for Keypoint R-CNN.
+  You'll also need to set [Keypoint OKS](http://cocodataset.org/#keypoints-eval)
+	with `TEST.KEYPOINT_OKS_SIGMAS` for evaluation.
+* `MODEL.SEM_SEG_HEAD.NUM_CLASSES` sets the number of stuff classes for Semantic FPN & Panoptic FPN.
+* If you're training Fast R-CNN (with precomputed proposals), `DATASETS.PROPOSAL_FILES_{TRAIN,TEST}`
+	need to match the datasets. The format of proposal files is documented
+	[here](../modules/data.html#detectron2.data.load_proposals_into_dataset).
diff --git a/docs/tutorials/extend.md b/docs/tutorials/extend.md
new file mode 100644
index 0000000..0145513
--- /dev/null
+++ b/docs/tutorials/extend.md
@@ -0,0 +1,45 @@
+# Extend Detectron2's Defaults
+
+__Research is about doing things in new ways__.
+This brings a tension in how to create abstractions in code,
+which is a challenge for any research engineering project of a significant size:
+
+1. On one hand, it needs to have very thin abstractions to allow for the possibility of doing
+   everything in new ways. It should be reasonably easy to break existing
+   abstractions and replace them with new ones.
+
+2. On the other hand, such a project also needs reasonably high-level
+   abstractions, so that users can easily do things in standard ways,
+   without worrying too much about the details that only certain researchers care about.
+
+In detectron2, there are two types of interfaces that address this tension together:
+
+1. Functions and classes that take only a "config" argument (optionally with a minimal
+   set of extra arguments in cases of mature interfaces).
+
+   Such functions and classes implement
+   the "standard default" behavior: it will read what it needs from the
+   config and do the "standard" thing.
+   Users only need to load a standard config and pass it around, without having to worry about
+   which arguments are used and what they all mean.
+
+2. Functions and classes that have well-defined explicit arguments.
+
+   Each of these is a small building block of the entire system.
+   They require users' effort to stitch together, but can be stitched together in more flexible ways.
+   When you need to implement something different from the "standard defaults"
+   included in detectron2, these well-defined components can be reused.
+
+
+If you only need the standard behavior, the [Beginner's Tutorial](getting_started.html)
+should suffice. If you need to extend detectron2 to your own needs,
+see the following tutorials for more details:
+
+* Detectron2 includes a few standard datasets, but you can use custom ones. See
+  [Use Custom Datasets](datasets.html).
+* Detectron2 contains the standard logic that creates a data loader from a
+  dataset, but you can write your own as well. See [Use Custom Data Loaders](data_loading.html).
+* Detectron2 implements many standard detection models, and provides ways for you
+  to overwrite their behaviors. See [Use Models](models.html) and [Write Models](write-models.html).
+* Detectron2 provides a default training loop that is good for common training tasks.
+  You can customize it with hooks, or write your own loop instead. See [training](training.html).
diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md
new file mode 100644
index 0000000..e90bde7
--- /dev/null
+++ b/docs/tutorials/getting_started.md
@@ -0,0 +1 @@
+../../GETTING_STARTED.md
\ No newline at end of file
diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst
new file mode 100644
index 0000000..24fd1b8
--- /dev/null
+++ b/docs/tutorials/index.rst
@@ -0,0 +1,15 @@
+Tutorials
+======================================
+
+.. toctree::
+   :maxdepth: 2
+
+   install
+   getting_started
+   extend
+   datasets
+   data_loading
+   models
+   write-models
+   training
+   configs
diff --git a/docs/tutorials/install.md b/docs/tutorials/install.md
new file mode 100644
index 0000000..5f52b2b
--- /dev/null
+++ b/docs/tutorials/install.md
@@ -0,0 +1 @@
+../../INSTALL.md
\ No newline at end of file
diff --git a/docs/tutorials/models.md b/docs/tutorials/models.md
new file mode 100644
index 0000000..84beb48
--- /dev/null
+++ b/docs/tutorials/models.md
@@ -0,0 +1,90 @@
+# Use Models
+
+Models (and their sub-models) in detectron2 are built by
+functions such as `build_model`, `build_backbone`, `build_roi_heads`:
+```python
+from detectron2.modeling import build_model
+model = build_model(cfg)  # returns a torch.nn.Module
+```
+
+To load an existing checkpoint to the model, use
+`DetectionCheckpointer(model).load(file_path)`.
+Detectron2 recognizes models in pytorch's `.pth` format, as well as the `.pkl` files
+in our model zoo.
+
+You can use a model by just `outputs = model(inputs)`.
+Next, we explain the inputs/outputs format used by the builtin models in detectron2.
+
+
+### Model Input Format
+
+All builtin models take a `list[dict]` as the inputs. Each dict
+corresponds to information about one image.
+
+The dict may contain the following keys:
+
+* "image": `Tensor` in (C, H, W) format. The meaning of channels is defined by `cfg.INPUT.FORMAT`.
+* "instances": an `Instances` object, with the following fields:
+	+ "gt_boxes": `Boxes` object storing N boxes, one for each instance.
+	+ "gt_classes": `Tensor` of long type, a vector of N labels, in range [0, num_categories).
+	+ "gt_masks": a `PolygonMasks` object storing N masks, one for each instance.
+	+ "gt_keypoints": a `Keypoints` object storing N keypoint sets, one for each instance.
+* "proposals": an `Instances` object used in Fast R-CNN style models, with the following fields:
+	+ "proposal_boxes": `Boxes` object storing P proposal boxes.
+	+ "objectness_logits": `Tensor`, a vector of P scores, one for each proposal.
+* "height", "width": the *desired* output height and width of the image, not necessarily the same
+	as the height or width of the `image` when input into the model, which might be after resizing.
+	For example, it can be the *original* image height and width before resizing.
+
+	If provided, the model will produce output in this resolution,
+	rather than in the resolution of the `image` as input into the model. This is more efficient and accurate.
+* "sem_seg": `Tensor[int]` in (H, W) format. The semantic segmentation ground truth.
+
+
+#### How it connects to data loader:
+
+The output of the default [DatasetMapper]( ../modules/data.html#detectron2.data.DatasetMapper) is a dict
+that follows the above format.
+After the data loader performs batching, it becomes `list[dict]` which the builtin models support.
+
+
+### Model Output Format
+
+When in training mode, the builtin models output a `dict[str->ScalarTensor]` with all the losses.
+
+When in inference mode, the builtin models output a `list[dict]`, one dict for each image. Each dict may contain:
+
+* "instances": [Instances](../modules/structures.html#detectron2.structures.Instances)
+  object with the following fields:
+	* "pred_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing N boxes, one for each detected instance.
+	* "scores": `Tensor`, a vector of N scores.
+	* "pred_classes": `Tensor`, a vector of N labels in range [0, num_categories).
+	+ "pred_masks": a `Tensor` of shape (N, H, W), masks for each detected instance.
+	+ "pred_keypoints": a `Tensor` of shape (N, num_keypoint, 3).
+		Each row in the last dimension is (x, y, score).
+* "sem_seg": `Tensor` of (num_categories, H, W), the semantic segmentation prediction.
+* "proposals": [Instances](../modules/structures.html#detectron2.structures.Instances)
+	object with the following fields:
+	* "proposal_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes)
+		object storing N boxes.
+	* "objectness_logits": a torch vector of N scores.
+* "panoptic_seg": A tuple of (Tensor, list[dict]). The tensor has shape (H, W), where each element
+	represents the segment id of the pixel. Each dict describes one segment id and has the following fields:
+	* "id": the segment id
+	* "isthing": whether the segment is a thing or stuff
+	* "category_id": the category id of this segment. It represents the thing
+       class id when `isthing==True`, and the stuff class id otherwise.
+
+
+### How to use a model in your code:
+
+Construct your own `list[dict]`, with the necessary keys.
+For example, for inference, provide dicts with "image", and optionally "height" and "width".
+
+Note that when in training mode, all models are required to be used under an `EventStorage`.
+The training statistics will be put into the storage:
+```python
+from detectron2.utils.events import EventStorage
+with EventStorage() as storage:
+  losses = model(inputs)
+```
diff --git a/docs/tutorials/training.md b/docs/tutorials/training.md
new file mode 100644
index 0000000..98ff44f
--- /dev/null
+++ b/docs/tutorials/training.md
@@ -0,0 +1,18 @@
+# Training
+
+From the previous tutorials, you may now have a custom model and data loader.
+
+You are free to create your own optimizer, and write the training logic: it's
+usually easy with PyTorch, and allows researchers to see the entire training
+logic more clearly.
+One such example is provided in [tools/plain_train_net.py](https://github.com/facebookresearch/detectron2/blob/master/tools/plain_train_net.py).
+
+We also provide a standardized "trainer" abstraction with a
+[minimal hook system](../modules/engine.html#detectron2.engine.HookBase)
+that helps simplify the standard types of training.
+
+You can use
+[SimpleTrainer().train()](../modules/engine.html#detectron2.engine.SimpleTrainer)
+which does single-cost single-optimizer single-data-source training.
+Or use [DefaultTrainer().train()](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer)
+which includes more standard behavior that one might want to opt in.
diff --git a/docs/tutorials/write-models.md b/docs/tutorials/write-models.md
new file mode 100644
index 0000000..6384771
--- /dev/null
+++ b/docs/tutorials/write-models.md
@@ -0,0 +1,36 @@
+# Write Models
+
+If you are trying to do something completely new, you may wish to implement
+a model entirely from scratch within detectron2. However, in many situations you may
+be interested in modifying or extending some components of an existing model.
+Therefore, we also provide a registration mechanism that lets you override the
+behavior of certain internal components of standard models.
+
+For example, to add a new backbone, import this code in your code:
+```python
+from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
+
+@BACKBONE_REGISTRY.register()
+class ToyBackBone(Backbone):
+  def __init__(self, cfg, input_shape):
+    # create your own backbone
+    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=16, padding=3)
+
+  def forward(self, image):
+    return {"conv1": self.conv1(image)}
+
+  def output_shape(self):
+    return {"conv1": ShapeSpec(channels=64, stride=16)}
+```
+Then, you can use `cfg.MODEL.BACKBONE.NAME = 'ToyBackBone'` in your config object.
+`build_model(cfg)` will then call your `ToyBackBone` instead.
+
+As another example, to add new abilities to the ROI heads in the Generalized R-CNN meta-architecture,
+you can implement a new
+[ROIHeads](../modules/modeling.html#detectron2.modeling.ROIHeads) subclass and put it in the `ROI_HEADS_REGISTRY`.
+See [densepose in detectron2](https://github.com/facebookresearch/detectron2/tree/master/projects/DensePose)
+for an example that implements new ROIHeads.
+
+Other registries can be found in [API documentation](../modules/modeling.html#model-registries).
+You can register components in these registries to customize different parts of a model, or the
+entire model.
diff --git a/docs/windows_build.png b/docs/windows_build.png
new file mode 100644
index 0000000..1ab65e5
Binary files /dev/null and b/docs/windows_build.png differ
diff --git a/projects/DensePose/README.md b/projects/DensePose/README.md
new file mode 100644
index 0000000..68167b8
--- /dev/null
+++ b/projects/DensePose/README.md
@@ -0,0 +1,41 @@
+# DensePose in Detectron2
+**Dense Human Pose Estimation In The Wild**
+
+_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_
+
+[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)]
+
+Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body.
+
+<div align="center">
+  <img src="https://drive.google.com/uc?export=view&id=1qfSOkpueo1kVZbXOuQJJhyagKjMgepsz" width="700px" />
+</div>
+
+In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize
+DensePose annotation and results.
+
+# Quick Start
+
+See [ Getting Started ](doc/GETTING_STARTED.md)
+
+# Model Zoo and Baselines
+
+We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details.
+
+# License
+
+Detectron2 is released under the [Apache 2.0 license](../../LICENSE)
+
+## <a name="CitingDensePose"></a>Citing DensePose
+
+If you use DensePose, please use the following BibTeX entry.
+
+```
+@InProceedings{Guler2018DensePose,
+  title={DensePose: Dense Human Pose Estimation In The Wild},
+  author={R{\i}za Alp G{\"u}ler and Natalia Neverova and Iasonas Kokkinos},
+  booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2018}
+}
+```
+
diff --git a/projects/DensePose/apply_net.py b/projects/DensePose/apply_net.py
new file mode 100644
index 0000000..8e6c279
--- /dev/null
+++ b/projects/DensePose/apply_net.py
@@ -0,0 +1,309 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import logging
+import os
+import pickle
+import sys
+from typing import Any, ClassVar, Dict, List
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2.structures.boxes import BoxMode
+from detectron2.structures.instances import Instances
+from detectron2.utils.logger import setup_logger
+
+from densepose import add_densepose_config
+from densepose.utils.logger import verbosity_to_level
+from densepose.vis.base import CompoundVisualizer
+from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
+from densepose.vis.densepose import (
+    DensePoseResultsContourVisualizer,
+    DensePoseResultsFineSegmentationVisualizer,
+    DensePoseResultsUVisualizer,
+    DensePoseResultsVVisualizer,
+)
+from densepose.vis.extractor import CompoundExtractor, create_extractor
+
+DOC = """Apply Net - a tool to print / visualize DensePose results
+"""
+
+LOGGER_NAME = "apply_net"
+logger = logging.getLogger(LOGGER_NAME)
+
+_ACTION_REGISTRY: Dict[str, "Action"] = {}
+
+
+class Action(object):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "-v",
+            "--verbosity",
+            action="count",
+            help="Verbose mode. Multiple -v options increase the verbosity.",
+        )
+
+
+def register_action(cls: type):
+    """
+    Decorator for action classes to automate action registration
+    """
+    global _ACTION_REGISTRY
+    _ACTION_REGISTRY[cls.COMMAND] = cls
+    return cls
+
+
+class InferenceAction(Action):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(InferenceAction, cls).add_arguments(parser)
+        parser.add_argument("cfg", metavar="<config>", help="Config file")
+        parser.add_argument("model", metavar="<model>", help="Model file")
+        parser.add_argument("input", metavar="<input>", help="Input data")
+
+    @classmethod
+    def execute(cls: type, args: argparse.Namespace):
+        logger.info(f"Loading config from {args.cfg}")
+        opts = []
+        cfg = cls.setup_config(args.cfg, args.model, args, opts)
+        logger.info(f"Loading model from {args.model}")
+        predictor = DefaultPredictor(cfg)
+        logger.info(f"Loading data from {args.input}")
+        file_list = cls._get_input_file_list(args.input)
+        if len(file_list) == 0:
+            logger.warning(f"No input images for {args.input}")
+            return
+        context = cls.create_context(args)
+        for file_name in file_list:
+            img = read_image(file_name, format="BGR")  # predictor expects BGR image.
+            with torch.no_grad():
+                outputs = predictor(img)["instances"]
+                cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs)
+        cls.postexecute(context)
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        cfg = get_cfg()
+        add_densepose_config(cfg)
+        cfg.merge_from_file(config_fpath)
+        if opts:
+            cfg.merge_from_list(opts)
+        cfg.MODEL.WEIGHTS = model_fpath
+        cfg.freeze()
+        return cfg
+
+    @classmethod
+    def _get_input_file_list(cls: type, input_spec: str):
+        if os.path.isdir(input_spec):
+            file_list = [
+                os.path.join(input_spec, fname)
+                for fname in os.listdir(input_spec)
+                if os.path.isfile(os.path.join(input_spec, fname))
+            ]
+        elif os.path.isfile(input_spec):
+            file_list = [input_spec]
+        else:
+            file_list = glob.glob(input_spec)
+        return file_list
+
+
+@register_action
+class DumpAction(InferenceAction):
+    """
+    Dump action that outputs results to a pickle file
+    """
+
+    COMMAND: ClassVar[str] = "dump"
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(DumpAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "--output",
+            metavar="<dump_file>",
+            default="results.pkl",
+            help="File name to save dump to",
+        )
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        image_fpath = entry["file_name"]
+        logger.info(f"Processing {image_fpath}")
+        result = {"file_name": image_fpath}
+        if outputs.has("scores"):
+            result["scores"] = outputs.get("scores").cpu()
+        if outputs.has("pred_boxes"):
+            result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
+            if outputs.has("pred_densepose"):
+                boxes_XYWH = BoxMode.convert(
+                    result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+                )
+                result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH)
+        context["results"].append(result)
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace):
+        context = {"results": [], "out_fname": args.output}
+        return context
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        out_fname = context["out_fname"]
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        with open(out_fname, "wb") as hFile:
+            pickle.dump(context["results"], hFile)
+            logger.info(f"Output saved to {out_fname}")
+
+
+@register_action
+class ShowAction(InferenceAction):
+    """
+    Show action that visualizes selected entries on an image
+    """
+
+    COMMAND: ClassVar[str] = "show"
+    VISUALIZERS: ClassVar[Dict[str, object]] = {
+        "dp_contour": DensePoseResultsContourVisualizer,
+        "dp_segm": DensePoseResultsFineSegmentationVisualizer,
+        "dp_u": DensePoseResultsUVisualizer,
+        "dp_v": DensePoseResultsVVisualizer,
+        "bbox": ScoredBoundingBoxVisualizer,
+    }
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(ShowAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "visualizations",
+            metavar="<visualizations>",
+            help="Comma separated list of visualizations, possible values: "
+            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
+        )
+        parser.add_argument(
+            "--min_score",
+            metavar="<score>",
+            default=0.8,
+            type=float,
+            help="Minimum detection score to visualize",
+        )
+        parser.add_argument(
+            "--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
+        )
+        parser.add_argument(
+            "--output",
+            metavar="<image_file>",
+            default="outputres.png",
+            help="File name to save output to",
+        )
+
+    @classmethod
+    def setup_config(
+        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
+    ):
+        opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
+        opts.append(str(args.min_score))
+        if args.nms_thresh is not None:
+            opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
+            opts.append(str(args.nms_thresh))
+        cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
+        return cfg
+
+    @classmethod
+    def execute_on_outputs(
+        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
+    ):
+        import cv2
+        import numpy as np
+
+        visualizer = context["visualizer"]
+        extractor = context["extractor"]
+        image_fpath = entry["file_name"]
+        logger.info(f"Processing {image_fpath}")
+        image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
+        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
+        data = extractor(outputs)
+        image_vis = visualizer.visualize(image, data)
+        entry_idx = context["entry_idx"] + 1
+        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
+        out_dir = os.path.dirname(out_fname)
+        if len(out_dir) > 0 and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        cv2.imwrite(out_fname, image_vis)
+        logger.info(f"Output saved to {out_fname}")
+        context["entry_idx"] += 1
+
+    @classmethod
+    def postexecute(cls: type, context: Dict[str, Any]):
+        pass
+
+    @classmethod
+    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
+        base, ext = os.path.splitext(fname_base)
+        return base + ".{0:04d}".format(entry_idx) + ext
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        vis_specs = args.visualizations.split(",")
+        visualizers = []
+        extractors = []
+        for vis_spec in vis_specs:
+            vis = cls.VISUALIZERS[vis_spec]()
+            visualizers.append(vis)
+            extractor = create_extractor(vis)
+            extractors.append(extractor)
+        visualizer = CompoundVisualizer(visualizers)
+        extractor = CompoundExtractor(extractors)
+        context = {
+            "extractor": extractor,
+            "visualizer": visualizer,
+            "out_fname": args.output,
+            "entry_idx": 0,
+        }
+        return context
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=DOC,
+        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
+    )
+    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
+    subparsers = parser.add_subparsers(title="Actions")
+    for _, action in _ACTION_REGISTRY.items():
+        action.add_parser(subparsers)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    verbosity = args.verbosity if hasattr(args, "verbosity") else None
+    global logger
+    logger = setup_logger(name=LOGGER_NAME)
+    logger.setLevel(verbosity_to_level(verbosity))
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml b/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml
new file mode 100644
index 0000000..7942010
--- /dev/null
+++ b/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml
@@ -0,0 +1,46 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
+    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
+  RPN:
+    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
+    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
+    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
+    # Detectron1 uses 2000 proposals per-batch,
+    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
+    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
+    POST_NMS_TOPK_TRAIN: 1000
+    POST_NMS_TOPK_TEST: 1000
+
+  DENSEPOSE_ON: True
+  ROI_HEADS:
+    NAME: "DensePoseROIHeads"
+    IN_FEATURES: ["p2", "p3", "p4", "p5"]
+    NUM_CLASSES: 1
+  ROI_BOX_HEAD:
+    NAME: "FastRCNNConvFCHead"
+    NUM_FC: 2
+    POOLER_RESOLUTION: 7
+    POOLER_SAMPLING_RATIO: 2
+    POOLER_TYPE: "ROIAlign"
+  ROI_DENSEPOSE_HEAD:
+    NAME: "DensePoseV1ConvXHead"
+    POOLER_TYPE: "ROIAlign"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.002
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  WARMUP_FACTOR: 0.1
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml
new file mode 100644
index 0000000..2f2574d
--- /dev/null
+++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/MSRA/R-101"
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+
diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml
new file mode 100644
index 0000000..fa98434
--- /dev/null
+++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/MSRA/R-50"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  MAX_ITER: 130000
+  STEPS: (100000, 120000)
+
diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
new file mode 100644
index 0000000..6a1d651
--- /dev/null
+++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml
@@ -0,0 +1,8 @@
+_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
+MODEL:
+  WEIGHTS: "detectron2://e2e_densepose_R_50_FPN_s1x/124238535/model_final_5f3d7f.pkl"
+DATASETS:
+  TRAIN: ()
+  TEST: ("densepose_coco_2014_minival_100",)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 56.05, 0.025], ["densepose", "AP",  46.54, 0.02]]
diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
new file mode 100644
index 0000000..4221f72
--- /dev/null
+++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml
@@ -0,0 +1,9 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/MSRA/R-50"
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival_100",)
+  TEST: ("densepose_coco_2014_minival_100",)
+SOLVER:
+  MAX_ITER: 40
+  STEPS: (30,)
diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
new file mode 100644
index 0000000..4871e4e
--- /dev/null
+++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml
@@ -0,0 +1,14 @@
+_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
+MODEL:
+  WEIGHTS: "catalog://ImageNetPretrained/MSRA/R-50"
+  ROI_HEADS:
+    NUM_CLASSES: 1
+DATASETS:
+  TRAIN: ("densepose_coco_2014_minival",)
+  TEST: ("densepose_coco_2014_minival",)
+SOLVER:
+  MAX_ITER: 6000
+  STEPS: (5500, 5800)
+TEST:
+  EXPECTED_RESULTS: [["bbox", "AP", 58.27, 1.0], ["densepose", "AP", 42.47, 1.5]]
+
diff --git a/projects/DensePose/densepose/__init__.py b/projects/DensePose/densepose/__init__.py
new file mode 100644
index 0000000..48896fc
--- /dev/null
+++ b/projects/DensePose/densepose/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from . import dataset  # just to register data
+from .config import add_densepose_config
+from .dataset_mapper import DatasetMapper
+from .densepose_head import ROI_DENSEPOSE_HEAD_REGISTRY
+from .evaluator import DensePoseCOCOEvaluator
+from .roi_head import DensePoseROIHeads
+from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
diff --git a/projects/DensePose/densepose/config.py b/projects/DensePose/densepose/config.py
new file mode 100644
index 0000000..c23aa15
--- /dev/null
+++ b/projects/DensePose/densepose/config.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_densepose_config(cfg):
+    """
+    Add the DensePose options (with their default values) to `cfg`, in place.
+    """
+    _C = cfg
+
+    _C.MODEL.DENSEPOSE_ON = True
+
+    _C.MODEL.ROI_DENSEPOSE_HEAD = CN()
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
+    # Number of parts used for point labels
+    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
+    _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
+    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
+    _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
+    _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 56
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 14
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
+    # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
+    # Loss weights for annotation masks. (14 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 2.0
+    # Loss weights for surface parts. (24 Parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 0.3
+    # Loss weights for UV regression.
+    _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.1
diff --git a/projects/DensePose/densepose/dataset.py b/projects/DensePose/densepose/dataset.py
new file mode 100644
index 0000000..e0cd763
--- /dev/null
+++ b/projects/DensePose/densepose/dataset.py
@@ -0,0 +1,47 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import os
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets import load_coco_json
+
+
+def get_densepose_metadata():
+    meta = {
+        "thing_classes": ["person"],
+        "densepose_transform_src": "detectron2://densepose/UV_symmetry_transforms.mat",
+        "densepose_smpl_subdiv": "detectron2://densepose/SMPL_subdiv.mat",
+        "densepose_smpl_subdiv_transform": "detectron2://densepose/SMPL_SUBDIV_TRANSFORM.mat",
+    }
+    return meta
+
+
+SPLITS = {  # dataset name -> (image root, annotation json), both relative to "datasets"
+    "densepose_coco_2014_train": ("coco/train2014", "coco/annotations/densepose_train2014.json"),
+    "densepose_coco_2014_minival": ("coco/val2014", "coco/annotations/densepose_minival2014.json"),
+    "densepose_coco_2014_minival_100": (
+        "coco/val2014",
+        "coco/annotations/densepose_minival2014_100.json",
+    ),
+    "densepose_coco_2014_valminusminival": (
+        "coco/val2014",
+        "coco/annotations/densepose_valminusminival2014.json",
+    ),
+}
+
+DENSEPOSE_KEYS = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V", "dp_masks"]
+
+for key, (image_root, json_file) in SPLITS.items():
+    # Pre-defined datasets are assumed to live in `./datasets` (relative to the working dir).
+    json_file = os.path.join("datasets", json_file)
+    image_root = os.path.join("datasets", image_root)
+
+    DatasetCatalog.register(
+        key,
+        lambda key=key, json_file=json_file, image_root=image_root: load_coco_json(
+            json_file, image_root, key, extra_annotation_keys=DENSEPOSE_KEYS
+        ),
+    )
+
+    MetadataCatalog.get(key).set(
+        json_file=json_file, image_root=image_root, **get_densepose_metadata()
+    )
diff --git a/projects/DensePose/densepose/dataset_mapper.py b/projects/DensePose/densepose/dataset_mapper.py
new file mode 100644
index 0000000..3eadbe1
--- /dev/null
+++ b/projects/DensePose/densepose/dataset_mapper.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import copy
+import torch
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import MetadataCatalog
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+
+from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+
+
+class DatasetMapper:
+    """
+    A customized version of `detectron2.data.DatasetMapper` that also handles DensePose data
+    """
+
+    def __init__(self, cfg, is_train=True):
+        self.tfm_gens = utils.build_transform_gen(cfg, is_train)
+
+        # fmt: off
+        self.img_format     = cfg.INPUT.FORMAT
+        self.mask_on        = cfg.MODEL.MASK_ON
+        self.keypoint_on    = cfg.MODEL.KEYPOINT_ON
+        self.densepose_on   = cfg.MODEL.DENSEPOSE_ON
+        assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
+        # fmt: on
+        if self.keypoint_on and is_train:
+            # Flip only makes sense in training
+            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        else:
+            self.keypoint_hflip_indices = None
+
+        if self.densepose_on:
+            densepose_transform_srcs = [
+                MetadataCatalog.get(ds).densepose_transform_src
+                for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
+            ]
+            assert len(densepose_transform_srcs) > 0
+            # TODO: check that DensePose transformation data is the same for
+            # all the datasets. Otherwise one would have to pass DB ID with
+            # each entry to select proper transformation data. For now, since
+            # all DensePose annotated data uses the same data semantics, we
+            # omit this check.
+            densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
+            self.densepose_transform_data = DensePoseTransformData.load(
+                densepose_transform_data_fpath
+            )
+
+        self.is_train = is_train
+
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+
+        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+        image_shape = image.shape[:2]  # h, w
+        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+        if not self.is_train:
+            dataset_dict.pop("annotations", None)
+            return dataset_dict
+
+        for anno in dataset_dict["annotations"]:
+            if not self.mask_on:
+                anno.pop("segmentation", None)
+            if not self.keypoint_on:
+                anno.pop("keypoints", None)
+
+        # USER: Implement additional transformations if you have other types of data
+        # USER: Don't call _transform_densepose if you don't need it
+        annos = [
+            self._transform_densepose(
+                utils.transform_instance_annotations(
+                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+                ),
+                transforms,
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+        instances = utils.annotations_to_instances(annos, image_shape)
+
+        if len(annos) and "densepose" in annos[0]:
+            gt_densepose = [obj["densepose"] for obj in annos]
+            instances.gt_densepose = DensePoseList(gt_densepose, instances.gt_boxes, image_shape)
+
+        dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
+        return dataset_dict
+
+    def _transform_densepose(self, annotation, transforms):
+        if not self.densepose_on:
+            return annotation
+
+        # Handle densepose annotations
+        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+        if is_valid:
+            densepose_data = DensePoseDataRelative(annotation, cleanup=True)
+            densepose_data.apply_transform(transforms, self.densepose_transform_data)
+            annotation["densepose"] = densepose_data
+        else:
+            # logger = logging.getLogger(__name__)
+            # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
+            DensePoseDataRelative.cleanup_annotation(annotation)
+            # NOTE: annotations for certain instances may be unavailable.
+            # 'None' is accepted by the DensePoseList data structure.
+            annotation["densepose"] = None
+        return annotation
diff --git a/projects/DensePose/densepose/densepose_coco_evaluation.py b/projects/DensePose/densepose/densepose_coco_evaluation.py
new file mode 100644
index 0000000..2bf722a
--- /dev/null
+++ b/projects/DensePose/densepose/densepose_coco_evaluation.py
@@ -0,0 +1,953 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# This is a modified version of cocoeval.py where we also have the densepose evaluation.
+
+__author__ = "tsungyi"
+
+import copy
+import datetime
+import itertools
+import logging
+import numpy as np
+import pickle
+import time
+from collections import defaultdict
+import scipy.spatial.distance as ssd
+from fvcore.common.file_io import PathManager
+from pycocotools import mask as maskUtils
+from scipy.io import loadmat
+
+from .structures import DensePoseResult
+
+logger = logging.getLogger(__name__)
+
+
+class DensePoseCocoEval(object):
+    # Interface for evaluating detection on the Microsoft COCO dataset.
+    #
+    # The usage for CocoEval is as follows:
+    #  cocoGt=..., cocoDt=...       # load dataset and results
+    #  E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
+    #  E.params.recThrs = ...;      # set parameters as desired
+    #  E.evaluate();                # run per image evaluation
+    #  E.accumulate();              # accumulate per image results
+    #  E.summarize();               # display summary metrics of results
+    # For example usage see evalDemo.m and http://mscoco.org/.
+    #
+    # The evaluation parameters are as follows (defaults in brackets):
+    #  imgIds     - [all] N img ids to use for evaluation
+    #  catIds     - [all] K cat ids to use for evaluation
+    #  iouThrs    - [.5:.05:.95] T=10 IoU thresholds for evaluation
+    #  recThrs    - [0:.01:1] R=101 recall thresholds for evaluation
+    #  areaRng    - [...] A=4 object area ranges for evaluation
+    #  maxDets    - [1 10 100] M=3 thresholds on max detections per image
+    #  iouType    - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose'
+    #  iouType replaced the now DEPRECATED useSegm parameter.
+    #  useCats    - [1] if true use category labels for evaluation
+    # Note: if useCats=0 category labels are ignored as in proposal scoring.
+    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
+    #
+    # evaluate(): evaluates detections on every image and every category and
+    # concats the results into the "evalImgs" with fields:
+    #  dtIds      - [1xD] id for each of the D detections (dt)
+    #  gtIds      - [1xG] id for each of the G ground truths (gt)
+    #  dtMatches  - [TxD] matching gt id at each IoU or 0
+    #  gtMatches  - [TxG] matching dt id at each IoU or 0
+    #  dtScores   - [1xD] confidence of each dt
+    #  gtIgnore   - [1xG] ignore flag for each gt
+    #  dtIgnore   - [TxD] ignore flag for each dt at each IoU
+    #
+    # accumulate(): accumulates the per-image, per-category evaluation
+    # results in "evalImgs" into the dictionary "eval" with fields:
+    #  params     - parameters used for evaluation
+    #  date       - date evaluation was performed
+    #  counts     - [T,R,K,A,M] parameter dimensions (see above)
+    #  precision  - [TxRxKxAxM] precision for every evaluation setting
+    #  recall     - [TxKxAxM] max recall for every evaluation setting
+    # Note: precision and recall==-1 for settings with no gt objects.
+    #
+    # See also coco, mask, pycocoDemo, pycocoEvalDemo
+    #
+    # Microsoft COCO Toolbox.      version 2.0
+    # Data, paper, and tutorials available at:  http://mscoco.org/
+    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+    # Licensed under the Simplified BSD License [see coco/license.txt]
+    def __init__(self, cocoGt=None, cocoDt=None, iouType="densepose"):
+        """
+        Initialize CocoEval using coco APIs for gt and dt
+        :param cocoGt: coco object with ground truth annotations
+        :param cocoDt: coco object with detection results
+        :param iouType: evaluation type passed on to Params ("densepose" by default)
+        :return: None
+        """
+        self.cocoGt = cocoGt  # ground truth COCO API
+        self.cocoDt = cocoDt  # detections COCO API
+        self.evalImgs = defaultdict(list)  # per-image per-category eval results [KxAxI]
+        self.eval = {}  # accumulated evaluation results
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self.params = Params(iouType=iouType)  # evaluation parameters
+        self._paramsEval = {}  # parameters for evaluation
+        self.stats = []  # result summarization
+        self.ious = {}  # ious between all gts and dts
+        if cocoGt is not None:
+            self.params.imgIds = sorted(cocoGt.getImgIds())
+            self.params.catIds = sorted(cocoGt.getCatIds())
+        self.ignoreThrBB = 0.7  # box kept only if its ignore-region fraction is below this
+        self.ignoreThrUV = 0.9  # kept only if UV-mask IoU with the ignore region is below this
+
+    def _loadGEval(self):
+        """Load the SMPL mesh lookup tables used by the "densepose" evaluation onto self."""
+        smpl_subdiv_fpath = PathManager.get_local_path("detectron2://densepose/SMPL_subdiv.mat")
+        pdist_transform_fpath = PathManager.get_local_path(
+            "detectron2://densepose/SMPL_SUBDIV_TRANSFORM.mat"
+        )
+        pdist_matrix_fpath = PathManager.get_local_path("detectron2://densepose/Pdist_matrix.pkl")
+        SMPL_subdiv = loadmat(smpl_subdiv_fpath)
+        self.PDIST_transform = loadmat(pdist_transform_fpath)["index"].squeeze()
+        UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze()
+        ClosestVertInds = np.arange(UV.shape[1]) + 1
+        part_ids = SMPL_subdiv["Part_ID_subdiv"].squeeze()
+        # Group the UV coordinates and 1-based vertex indices by part label (1..24).
+        self.Part_UVs = []
+        self.Part_ClosestVertInds = []
+        for i in np.arange(24):
+            self.Part_UVs.append(UV[:, part_ids == (i + 1)])
+            self.Part_ClosestVertInds.append(ClosestVertInds[part_ids == (i + 1)])
+        # NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
+        with open(pdist_matrix_fpath, "rb") as hFile:
+            arrays = pickle.load(hFile, encoding="latin1")
+        self.Pdist_matrix = arrays["Pdist_matrix"]
+        self.Part_ids = np.array(part_ids)
+        # Mean geodesic distances for parts (indexed by coarse part label).
+        self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150])
+        # Coarse part label (index into Mean_Distances) for each fine part label 0..24.
+        self.CoarseParts = np.array(
+            [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8]
+        )
+
+    def _prepare(self):
+        """
+        Prepare ._gts and ._dts for evaluation based on params
+        (annotations rejected by their image's ignore regions are left out)
+        :return: None
+        """
+        def _toMask(anns, coco):
+            # modify ann['segmentation'] by reference
+            for ann in anns:
+                rle = coco.annToRLE(ann)
+                ann["segmentation"] = rle
+
+        def _getIgnoreRegion(iid, coco):
+            # decoded mask of the image's ignore regions, or None if it has none
+            img = coco.imgs[iid]
+            if "ignore_regions_x" not in img.keys():
+                return None
+
+            if len(img["ignore_regions_x"]) == 0:
+                return None
+
+            rgns_merged = []
+            for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"]):
+                # interleave the coordinates as [x0, y0, x1, y1, ...]
+                rgns_merged.append([v for xy in zip(region_x, region_y) for v in xy])
+            rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"])
+            rle = maskUtils.merge(rles)
+            return maskUtils.decode(rle)
+
+        def _checkIgnore(dt, iregion):
+            # True if the annotation is kept, False if the ignore region filters it out
+            if iregion is None:
+                return True
+            bb = np.array(dt["bbox"]).astype(int)  # builtin int: np.int was removed from NumPy
+            x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]
+            x2 = min([x2, iregion.shape[1]])
+            y2 = min([y2, iregion.shape[0]])
+
+            if bb[2] * bb[3] == 0:
+                return False
+
+            crop_iregion = iregion[y1:y2, x1:x2]
+
+            if crop_iregion.sum() == 0:
+                return True
+
+            if "densepose" not in dt.keys():  # filtering boxes
+                return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB
+
+            # filtering UVs; NOTE(review): assumes dt["densepose"][0] is an array - confirm
+            ignoremask = np.require(crop_iregion, requirements=["F"])
+            uvmask = np.require(
+                np.asarray(dt["densepose"][0] > 0), dtype=np.uint8, requirements=["F"]
+            )
+            uvmask_ = maskUtils.encode(uvmask)
+            ignoremask_ = maskUtils.encode(ignoremask)
+            uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0]
+            return uviou < self.ignoreThrUV
+
+        p = self.params
+
+        if p.useCats:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+        else:
+            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
+            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
+
+        # if iouType == 'densepose', load the mesh tables needed for point evaluation
+        if p.iouType == "densepose":
+            self._loadGEval()
+
+        # convert ground truth to mask if iouType == 'segm'
+        if p.iouType == "segm":
+            _toMask(gts, self.cocoGt)
+            _toMask(dts, self.cocoDt)
+
+        # set ignore flag (the "ignore" default is immediately superseded by iscrowd)
+        for gt in gts:
+            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
+            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
+            if p.iouType == "keypoints":
+                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
+            if p.iouType == "densepose":
+                gt["ignore"] = ("dp_x" in gt) == 0
+
+        self._gts = defaultdict(list)  # gt for evaluation
+        self._dts = defaultdict(list)  # dt for evaluation
+        self._igrgns = defaultdict(list)
+        # keep only the annotations that pass the ignore-region check of their image
+        for gt in gts:
+            iid = gt["image_id"]
+            if iid not in self._igrgns.keys():
+                self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt)
+            if _checkIgnore(gt, self._igrgns[iid]):
+                self._gts[iid, gt["category_id"]].append(gt)
+        for dt in dts:
+            if _checkIgnore(dt, self._igrgns.get(dt["image_id"])):  # None if image has no gt
+                self._dts[dt["image_id"], dt["category_id"]].append(dt)
+
+        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
+        self.eval = {}  # accumulated evaluation results
+
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+        :return: None
+        """
+        tic = time.time()
+        logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType))
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+            logger.info("useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType))
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+        # pick the similarity measure for the requested evaluation type
+        if p.iouType in ["segm", "bbox"]:
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":
+            computeIoU = self.computeOks
+        elif p.iouType == "densepose":
+            computeIoU = self.computeOgps
+        else:
+            raise ValueError("unknown iouType for evaluation: {}".format(p.iouType))
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+        }
+        evaluateImg = self.evaluateImg
+        maxDet = p.maxDets[-1]
+        self.evalImgs = [
+            evaluateImg(imgId, catId, areaRng, maxDet)
+            for catId in catIds
+            for areaRng in p.areaRng
+            for imgId in p.imgIds
+        ]
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic))
+
+    def computeIoU(self, imgId, catId):
+        """
+        Compute mask ("segm") or box ("bbox") IoUs between detections and ground truths
+        :param imgId: image id used to look up ._gts / ._dts
+        :param catId: category id (all of params.catIds are used when useCats is false)
+        :return: maskUtils.iou of the detections (by descending score, at most
+                 params.maxDets[-1] of them) vs. the gts, or [] if there are neither
+        """
+        params = self.params
+        if params.useCats:
+            gts = self._gts[imgId, catId]
+            dts = self._dts[imgId, catId]
+        else:
+            gts = [ann for cId in params.catIds for ann in self._gts[imgId, cId]]
+            dts = [ann for cId in params.catIds for ann in self._dts[imgId, cId]]
+        if not gts and not dts:
+            return []
+        # mergesort keeps the order of equally scored detections stable
+        order = np.argsort([-det["score"] for det in dts], kind="mergesort")
+        dts = [dts[idx] for idx in order][0 : params.maxDets[-1]]
+        if params.iouType not in ("segm", "bbox"):
+            raise Exception("unknown iouType for iou computation")
+        field = "segmentation" if params.iouType == "segm" else "bbox"
+        gt_regions = [ann[field] for ann in gts]
+        dt_regions = [ann[field] for ann in dts]
+        # per-gt crowd flags are passed through to maskUtils.iou
+        crowd_flags = [int(ann["iscrowd"]) for ann in gts]
+        return maskUtils.iou(dt_regions, gt_regions, crowd_flags)
+
+    def computeOks(self, imgId, catId):
+        """Return the (dts x gts) OKS matrix for one image/category, or [] if either is empty."""
+        p = self.params
+        gts = self._gts[imgId, catId]
+        dts = self._dts[imgId, catId]
+        inds = np.argsort([-d["score"] for d in dts], kind="mergesort")
+        dts = [dts[i] for i in inds]
+        if len(dts) > p.maxDets[-1]:
+            dts = dts[0 : p.maxDets[-1]]
+        # nothing can be scored unless both ground truths and detections are present
+        if len(gts) == 0 or len(dts) == 0:
+            return []
+        ious = np.zeros((len(dts), len(gts)))
+        sigmas = (
+            np.array(
+                [
+                    0.26,
+                    0.25,
+                    0.25,
+                    0.35,
+                    0.35,
+                    0.79,
+                    0.79,
+                    0.72,
+                    0.72,
+                    0.62,
+                    0.62,
+                    1.07,
+                    1.07,
+                    0.87,
+                    0.87,
+                    0.89,
+                    0.89,
+                ]
+            )
+            / 10.0
+        )
+        vars = (sigmas * 2) ** 2
+        k = len(sigmas)
+        # compute oks between each detection and ground truth object
+        for j, gt in enumerate(gts):
+            # create bounds for ignore regions(double the gt bbox)
+            g = np.array(gt["keypoints"])
+            xg = g[0::3]
+            yg = g[1::3]
+            vg = g[2::3]
+            k1 = np.count_nonzero(vg > 0)  # number of gt keypoints with a flag > 0
+            bb = gt["bbox"]
+            x0 = bb[0] - bb[2]
+            x1 = bb[0] + bb[2] * 2
+            y0 = bb[1] - bb[3]
+            y1 = bb[1] + bb[3] * 2
+            for i, dt in enumerate(dts):
+                d = np.array(dt["keypoints"])
+                xd = d[0::3]
+                yd = d[1::3]
+                if k1 > 0:
+                    # measure the per-keypoint distance if keypoints visible
+                    dx = xd - xg
+                    dy = yd - yg
+                else:
+                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
+                    z = np.zeros((k))
+                    dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0)
+                    dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0)
+                e = (dx ** 2 + dy ** 2) / vars / (gt["area"] + np.spacing(1)) / 2
+                if k1 > 0:
+                    e = e[vg > 0]  # only keypoints with a gt flag > 0 contribute
+                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
+        return ious
+
+    def computeOgps(self, imgId, catId):
+        """Return (OGPS matrix, box IoU matrix) for one image/category, or [] if no gts or dts."""
+        p = self.params
+        g = self._gts[imgId, catId]
+        d = self._dts[imgId, catId]
+        inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort")
+        d = [d[i] for i in inds]
+        if len(d) > p.maxDets[-1]:
+            d = d[0 : p.maxDets[-1]]
+        # nothing can be scored unless both ground truths and detections are present
+        if len(g) == 0 or len(d) == 0:
+            return []
+        ious = np.zeros((len(d), len(g)))
+        # compute opgs between each detection and ground truth object
+        # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5
+        # 1 # dist = 0.3m corresponds to ogps = 0.96
+        # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5)
+        for j, gt in enumerate(g):
+            if not gt["ignore"]:
+                g_ = gt["bbox"]
+                for i, dt in enumerate(d):
+                    # detection box height / width, truncated to integers
+                    dy = int(dt["bbox"][3])
+                    dx = int(dt["bbox"][2])
+                    dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0
+                    dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0
+                    py = (dp_y + g_[1] - dt["bbox"][1]).astype(int)
+                    px = (dp_x + g_[0] - dt["bbox"][0]).astype(int)
+                    # mark gt points that fall outside the detection box with -1
+                    pts = np.zeros(len(px))
+                    pts[px >= dx] = -1
+                    pts[py >= dy] = -1
+                    pts[px < 0] = -1
+                    pts[py < 0] = -1
+                    if len(pts) < 1:
+                        ogps = 0.0
+                    elif np.max(pts) == -1:
+                        ogps = 0.0
+                    else:
+                        px[pts == -1] = 0
+                        py[pts == -1] = 0
+                        (densepose_shape, densepose_data_encoded), densepose_bbox_xywh = dt[
+                            "densepose"
+                        ]
+                        densepose_data = DensePoseResult.decode_png_data(
+                            densepose_shape, densepose_data_encoded
+                        )
+                        assert densepose_data.shape[2] == dx, (
+                            "DensePoseData width {} should be equal to "
+                            "detection bounding box width {}".format(densepose_data.shape[2], dx)
+                        )
+                        assert densepose_data.shape[1] == dy, (
+                            "DensePoseData height {} should be equal to "
+                            "detection bounding box height {}".format(densepose_data.shape[1], dy)
+                        )
+                        ipoints = densepose_data[0, py, px]
+                        upoints = densepose_data[1, py, px] / 255.0  # convert from uint8 by /255.
+                        vpoints = densepose_data[2, py, px] / 255.0
+                        ipoints[pts == -1] = 0
+                        # Find closest vertices in subsampled mesh.
+                        cVerts, cVertsGT = self.findAllClosestVerts(gt, upoints, vpoints, ipoints)
+                        # Get pairwise geodesic distances between gt and estimated mesh points.
+                        dist = self.getDistances(cVertsGT, cVerts)
+                        # Compute the Ogps measure.
+                        # Find the mean geodesic normalization distance for
+                        # each GT point, based on which part it is on.
+                        Current_Mean_Distances = self.Mean_Distances[
+                            self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]]
+                        ]
+                        # Compute gps
+                        ogps_values = np.exp(-(dist ** 2) / (2 * (Current_Mean_Distances ** 2)))
+                        # mean over the gt points; 0.0 when there are none, so that no stale or
+                        # undefined ogps value is written into ious
+                        ogps = np.sum(ogps_values) / len(dist) if len(dist) > 0 else 0.0
+                    ious[i, j] = ogps
+
+        gbb = [gt["bbox"] for gt in g]
+        dbb = [dt["bbox"] for dt in d]
+
+        # compute iou between each dt and gt region
+        iscrowd = [int(o["iscrowd"]) for o in g]
+        ious_bb = maskUtils.iou(dbb, gbb, iscrowd)
+        return ious, ious_bb
+
+    def evaluateImg(self, imgId, catId, aRng, maxDet):
+        """
+        Match detections to ground truths for a single image, category and area range
+        (aRng is a [min, max] area pair; at most maxDet top-scoring detections are used)
+        :return: dict (single image results), or None if there are no gts and no dts
+        """
+        p = self.params
+        if p.useCats:
+            gt = self._gts[imgId, catId]
+            dt = self._dts[imgId, catId]
+        else:
+            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+        if len(gt) == 0 and len(dt) == 0:
+            return None
+
+        for g in gt:
+            # a gt is ignored if it is flagged "ignore" or its area lies outside aRng
+            if g["ignore"] or (g["area"] < aRng[0] or g["area"] > aRng[1]):
+                g["_ignore"] = True
+            else:
+                g["_ignore"] = False
+
+        # sort dt highest score first, sort gt ignore last
+        gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
+        gt = [gt[i] for i in gtind]
+        dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
+        dt = [dt[i] for i in dtind[0:maxDet]]
+        iscrowd = [int(o["iscrowd"]) for o in gt]
+        # load computed ious
+        if p.iouType == "densepose":
+            # for densepose, self.ious holds either [] or the (ogps, box iou) pair
+            # returned by computeOgps; the columns of both matrices are reordered
+            # with gtind so that they follow the sorted gt list
+            ious = (
+                self.ious[imgId, catId][0][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+            ioubs = (
+                self.ious[imgId, catId][1][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+        else:
+            ious = (
+                self.ious[imgId, catId][:, gtind]
+                if len(self.ious[imgId, catId]) > 0
+                else self.ious[imgId, catId]
+            )
+
+        T = len(p.iouThrs)
+        G = len(gt)
+        D = len(dt)
+        gtm = np.zeros((T, G))
+        dtm = np.zeros((T, D))
+        gtIg = np.array([g["_ignore"] for g in gt])
+        dtIg = np.zeros((T, D))
+        if np.all(gtIg) and p.iouType == "densepose":
+            dtIg = np.logical_or(dtIg, True)
+
+        if len(ious) > 0:  # first pass: match on the main similarity, for every iouType
+            for tind, t in enumerate(p.iouThrs):
+                for dind, d in enumerate(dt):
+                    # information about best match so far (m=-1 -> unmatched)
+                    iou = min([t, 1 - 1e-10])
+                    m = -1
+                    for gind, _g in enumerate(gt):
+                        # if this gt already matched, and not a crowd, continue
+                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                            continue
+                        # if dt matched to reg gt, and on ignore gt, stop
+                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+                            break
+                        # continue to next gt unless better match made
+                        if ious[dind, gind] < iou:
+                            continue
+                        if ious[dind, gind] == 0.0:
+                            continue
+                        # if match successful and best so far, store appropriately
+                        iou = ious[dind, gind]
+                        m = gind
+                    # if match made store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]["id"]
+                    gtm[tind, m] = d["id"]
+
+        if p.iouType == "densepose":  # second pass: match leftovers on box IoU >= 0.8
+            if not len(ioubs) == 0:
+                for dind, d in enumerate(dt):
+                    # NOTE(review): tind is the value left over from the loop above - confirm
+                    if dtm[tind, dind] == 0:
+                        ioub = 0.8
+                        m = -1
+                        for gind, _g in enumerate(gt):
+                            # if this gt already matched, and not a crowd, continue
+                            if gtm[tind, gind] > 0 and not iscrowd[gind]:
+                                continue
+                            # continue to next gt unless better match made
+                            if ioubs[dind, gind] < ioub:
+                                continue
+                            # if match successful and best so far, store appropriately
+                            ioub = ioubs[dind, gind]
+                            m = gind
+                            # if match made store id of match for both dt and gt
+                        if m > -1:
+                            dtIg[:, dind] = gtIg[m]
+                            if gtIg[m]:
+                                dtm[tind, dind] = gt[m]["id"]
+                                gtm[tind, m] = d["id"]
+        # set unmatched detections outside of area range to ignore
+        a = np.array([d["area"] < aRng[0] or d["area"] > aRng[1] for d in dt]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
+        # store results for given image and category
+        # (dt-related entries cover at most maxDet detections, in descending score order)
+        return {
+            "image_id": imgId,
+            "category_id": catId,
+            "aRng": aRng,
+            "maxDet": maxDet,
+            "dtIds": [d["id"] for d in dt],
+            "gtIds": [g["id"] for g in gt],
+            "dtMatches": dtm,
+            "gtMatches": gtm,
+            "dtScores": [d["score"] for d in dt],
+            "gtIgnore": gtIg,
+            "dtIgnore": dtIg,
+        }
+
+    def accumulate(self, p=None):
+        """
+        Accumulate per image evaluation results and store the result in self.eval
+        :param p: input params for evaluation (self.params is used when None)
+        :return: None
+        """
+        logger.info("Accumulating evaluation results...")
+        tic = time.time()
+        if not self.evalImgs:
+            logger.info("Please run evaluate() first")
+        # allows input customized parameters
+        if p is None:
+            p = self.params
+        p.catIds = p.catIds if p.useCats == 1 else [-1]
+        T = len(p.iouThrs)
+        R = len(p.recThrs)
+        K = len(p.catIds) if p.useCats else 1
+        A = len(p.areaRng)
+        M = len(p.maxDets)
+        precision = -np.ones((T, R, K, A, M))  # -1 for the precision of absent categories
+        recall = -np.ones((T, K, A, M))
+
+        # create dictionary for future indexing
+        logger.info("Categories: {}".format(p.catIds))
+        _pe = self._paramsEval
+        catIds = _pe.catIds if _pe.useCats else [-1]
+        setK = set(catIds)
+        setA = set(map(tuple, _pe.areaRng))
+        setM = set(_pe.maxDets)
+        setI = set(_pe.imgIds)
+        # get inds to evaluate
+        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
+        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
+        I0 = len(_pe.imgIds)
+        A0 = len(_pe.areaRng)
+        # retrieve E at each category, area range, and max number of detections
+        for k, k0 in enumerate(k_list):
+            Nk = k0 * A0 * I0
+            for a, a0 in enumerate(a_list):
+                Na = a0 * I0
+                for m, maxDet in enumerate(m_list):
+                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
+                    E = [e for e in E if e is not None]
+                    if len(E) == 0:
+                        continue
+                    dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E])
+
+                    # different sorting method generates slightly different results.
+                    # mergesort is used to be consistent as Matlab implementation.
+                    inds = np.argsort(-dtScores, kind="mergesort")
+
+                    dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+                    gtIg = np.concatenate([e["gtIgnore"] for e in E])
+                    npig = np.count_nonzero(gtIg == 0)
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and(dtm, np.logical_not(dtIg))
+                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp + tp + np.spacing(1))
+                        q = np.zeros((R,))
+
+                        if nd:
+                            recall[t, k, a, m] = rc[-1]
+                        else:
+                            recall[t, k, a, m] = 0
+
+                        # numpy is slow without cython optimization for accessing elements
+                        # use python array gets significant speed improvement
+                        pr = pr.tolist()
+                        q = q.tolist()
+
+                        for i in range(nd - 1, 0, -1):
+                            if pr[i] > pr[i - 1]:
+                                pr[i - 1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs, side="left")
+                        try:
+                            for ri, pi in enumerate(inds):
+                                q[ri] = pr[pi]
+                        except IndexError:
+                            pass  # recall thresholds beyond the reached recall keep precision 0
+                        precision[t, :, k, a, m] = np.array(q)
+        logger.info(
+            "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision))
+        )
+        self.eval = {
+            "params": p,
+            "counts": [T, R, K, A, M],
+            "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            "precision": precision,
+            "recall": recall,
+        }
+        toc = time.time()
+        logger.info("DONE (t={:0.2f}s).".format(toc - tic))
+
+    def summarize(self):
+        """
+        Compute and display summary metrics for evaluation results.
+        Note this function can *only* be applied on the default parameter setting
+        """
+
+        def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+            p = self.params
+            iStr = " {:<18} {} @[ {}={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+            titleStr = "Average Precision" if ap == 1 else "Average Recall"
+            typeStr = "(AP)" if ap == 1 else "(AR)"
+            measure = "IoU"
+            if self.params.iouType == "keypoints":
+                measure = "OKS"
+            elif self.params.iouType == "densepose":
+                measure = "OGPS"
+            iouStr = (
+                "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+                if iouThr is None
+                else "{:0.2f}".format(iouThr)
+            )
+
+            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval["precision"]
+                # IoU
+                if iouThr is not None:
+                    t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+                    s = s[t]
+                s = s[:, :, :, aind, mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval["recall"]
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                s = s[:, :, aind, mind]
+            if len(s[s > -1]) == 0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s > -1])
+            logger.info(iStr.format(titleStr, typeStr, measure, iouStr, areaRng, maxDets, mean_s))
+            return mean_s
+
+        def _summarizeDets():
+            stats = np.zeros((12,))
+            stats[0] = _summarize(1)
+            stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+            stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+            stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+            stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+            stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+            stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+            return stats
+
+        def _summarizeKps():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=20)
+            stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+            stats[4] = _summarize(1, maxDets=20, areaRng="large")
+            stats[5] = _summarize(0, maxDets=20)
+            stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+            stats[9] = _summarize(0, maxDets=20, areaRng="large")
+            return stats
+
+        def _summarizeUvs():
+            stats = np.zeros((10,))
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+            stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[3] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[4] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+            stats[5] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[6] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[7] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[8] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[9] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+            return stats
+
+        def _summarizeUvsOld():
+            stats = np.zeros((18,))
+            stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+            stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.55)
+            stats[3] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.60)
+            stats[4] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.65)
+            stats[5] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.70)
+            stats[6] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[7] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.80)
+            stats[8] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.85)
+            stats[9] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.90)
+            stats[10] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.95)
+            stats[11] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[12] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+            stats[13] = _summarize(0, maxDets=self.params.maxDets[0])
+            stats[14] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+            stats[15] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+            stats[16] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+            stats[17] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+            return stats
+
+        if not self.eval:
+            raise Exception("Please run accumulate() first")
+        iouType = self.params.iouType
+        if iouType in ["segm", "bbox"]:
+            summarize = _summarizeDets
+        elif iouType in ["keypoints"]:
+            summarize = _summarizeKps
+        elif iouType in ["densepose"]:
+            summarize = _summarizeUvs
+        self.stats = summarize()
+
+    def __str__(self):
+        self.summarize()
+
+    # ================ functions for dense pose ==============================
+    def findAllClosestVerts(self, gt, U_points, V_points, Index_points):
+        #
+        I_gt = np.array(gt["dp_I"])
+        U_gt = np.array(gt["dp_U"])
+        V_gt = np.array(gt["dp_V"])
+        #
+        # print(I_gt)
+        #
+        ClosestVerts = np.ones(Index_points.shape) * -1
+        for i in np.arange(24):
+            #
+            if sum(Index_points == (i + 1)) > 0:
+                UVs = np.array(
+                    [U_points[Index_points == (i + 1)], V_points[Index_points == (i + 1)]]
+                )
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVerts[Index_points == (i + 1)] = Current_Part_ClosestVertInds[
+                    np.argmin(D, axis=0)
+                ]
+        #
+        ClosestVertsGT = np.ones(Index_points.shape) * -1
+        for i in np.arange(24):
+            if sum(I_gt == (i + 1)) > 0:
+                UVs = np.array([U_gt[I_gt == (i + 1)], V_gt[I_gt == (i + 1)]])
+                Current_Part_UVs = self.Part_UVs[i]
+                Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+                D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+                ClosestVertsGT[I_gt == (i + 1)] = Current_Part_ClosestVertInds[np.argmin(D, axis=0)]
+        #
+        return ClosestVerts, ClosestVertsGT
+
+    def getDistances(self, cVertsGT, cVerts):
+
+        ClosestVertsTransformed = self.PDIST_transform[cVerts.astype(int) - 1]
+        ClosestVertsGTTransformed = self.PDIST_transform[cVertsGT.astype(int) - 1]
+        #
+        ClosestVertsTransformed[cVerts < 0] = 0
+        ClosestVertsGTTransformed[cVertsGT < 0] = 0
+        #
+        cVertsGT = ClosestVertsGTTransformed
+        cVerts = ClosestVertsTransformed
+        #
+        n = 27554
+        dists = []
+        for d in range(len(cVertsGT)):
+            if cVertsGT[d] > 0:
+                if cVerts[d] > 0:
+                    i = cVertsGT[d] - 1
+                    j = cVerts[d] - 1
+                    if j == i:
+                        dists.append(0)
+                    elif j > i:
+                        ccc = i
+                        i = j
+                        j = ccc
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                    else:
+                        i = n - i - 1
+                        j = n - j - 1
+                        k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+                        k = (n * n - n) / 2 - k - 1
+                        dists.append(self.Pdist_matrix[int(k)][0])
+                else:
+                    dists.append(np.inf)
+        return np.array(dists).squeeze()
+
+
+class Params:
+    """
+    Params for coco evaluation api
+    """
+
+    def setDetParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True)
+        self.maxDets = [1, 10, 100]
+        self.areaRng = [
+            [0 ** 2, 1e5 ** 2],
+            [0 ** 2, 32 ** 2],
+            [32 ** 2, 96 ** 2],
+            [96 ** 2, 1e5 ** 2],
+        ]
+        self.areaRngLbl = ["all", "small", "medium", "large"]
+        self.useCats = 1
+
+    def setKpParams(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def setUvParams(self):
+        self.imgIds = []
+        self.catIds = []
+        self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+        self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+        self.maxDets = [20]
+        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
+        self.areaRngLbl = ["all", "medium", "large"]
+        self.useCats = 1
+
+    def __init__(self, iouType="segm"):
+        if iouType == "segm" or iouType == "bbox":
+            self.setDetParams()
+        elif iouType == "keypoints":
+            self.setKpParams()
+        elif iouType == "densepose":
+            self.setUvParams()
+        else:
+            raise Exception("iouType not supported")
+        self.iouType = iouType
+        # useSegm is deprecated
+        self.useSegm = None
diff --git a/projects/DensePose/densepose/densepose_head.py b/projects/DensePose/densepose/densepose_head.py
new file mode 100644
index 0000000..8cb5bf6
--- /dev/null
+++ b/projects/DensePose/densepose/densepose_head.py
@@ -0,0 +1,626 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ConvTranspose2d, interpolate
+from detectron2.structures.boxes import matched_boxlist_iou
+from detectron2.utils.registry import Registry
+
+from .structures import DensePoseOutput
+
+ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")
+
+
+def initialize_module_params(module):
+    for name, param in module.named_parameters():
+        if "bias" in name:
+            nn.init.constant_(param, 0)
+        elif "weight" in name:
+            nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseV1ConvXHead(nn.Module):
+    def __init__(self, cfg, input_channels):
+        super(DensePoseV1ConvXHead, self).__init__()
+        # fmt: off
+        hidden_dim           = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+        kernel_size          = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+        # fmt: on
+        pad_size = kernel_size // 2
+        n_channels = input_channels
+        for i in range(self.n_stacked_convs):
+            layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
+            layer_name = self._get_layer_name(i)
+            self.add_module(layer_name, layer)
+            n_channels = hidden_dim
+        self.n_out_channels = n_channels
+        initialize_module_params(self)
+
+    def forward(self, features):
+        x = features
+        output = x
+        for i in range(self.n_stacked_convs):
+            layer_name = self._get_layer_name(i)
+            x = getattr(self, layer_name)(x)
+            x = F.relu(x)
+            output = x
+        return output
+
+    def _get_layer_name(self, i):
+        layer_name = "body_conv_fcn{}".format(i + 1)
+        return layer_name
+
+
+class DensePosePredictor(nn.Module):
+
+    NUM_ANN_INDICES = 15
+
+    def __init__(self, cfg, input_channels):
+        super(DensePosePredictor, self).__init__()
+        dim_in = input_channels
+        dim_out_ann_index = self.NUM_ANN_INDICES
+        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+        self.ann_index_lowres = ConvTranspose2d(
+            dim_in, dim_out_ann_index, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.index_uv_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.u_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.v_lowres = ConvTranspose2d(
+            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+        )
+        self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+        initialize_module_params(self)
+
+    def forward(self, head_outputs):
+        ann_index_lowres = self.ann_index_lowres(head_outputs)
+        index_uv_lowres = self.index_uv_lowres(head_outputs)
+        u_lowres = self.u_lowres(head_outputs)
+        v_lowres = self.v_lowres(head_outputs)
+
+        def interp2d(input):
+            return interpolate(
+                input, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+            )
+
+        ann_index = interp2d(ann_index_lowres)
+        index_uv = interp2d(index_uv_lowres)
+        u = interp2d(u_lowres)
+        v = interp2d(v_lowres)
+        return (ann_index, index_uv, u, v), (ann_index_lowres, index_uv_lowres, u_lowres, v_lowres)
+
+
+class DensePoseDataFilter(object):
+    def __init__(self, cfg):
+        self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
+
+    @torch.no_grad()
+    def __call__(self, proposals_with_targets):
+        """
+        Filters proposals with targets to keep only the ones relevant for
+        DensePose training
+        proposals: list(Instances), each element of the list corresponds to
+            various instances (proposals, GT for boxes and densepose) for one
+            image
+        """
+        proposals_filtered = []
+        for proposals_per_image in proposals_with_targets:
+            if not hasattr(proposals_per_image, "gt_densepose"):
+                continue
+            assert hasattr(proposals_per_image, "gt_boxes")
+            assert hasattr(proposals_per_image, "proposal_boxes")
+            gt_boxes = proposals_per_image.gt_boxes
+            est_boxes = proposals_per_image.proposal_boxes
+            # apply match threshold for densepose head
+            iou = matched_boxlist_iou(gt_boxes, est_boxes)
+            iou_select = iou > self.iou_threshold
+            proposals_per_image = proposals_per_image[iou_select]
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+            # filter out any target without densepose annotation
+            gt_densepose = proposals_per_image.gt_densepose
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose)
+            selected_indices = [
+                i for i, dp_target in enumerate(gt_densepose) if dp_target is not None
+            ]
+            if len(selected_indices) != len(gt_densepose):
+                proposals_per_image = proposals_per_image[selected_indices]
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose)
+            proposals_filtered.append(proposals_per_image)
+        return proposals_filtered
+
+
+def build_densepose_head(cfg, input_channels):
+    head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
+    return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
+
+
+def build_densepose_predictor(cfg, input_channels):
+    predictor = DensePosePredictor(cfg, input_channels)
+    return predictor
+
+
+def build_densepose_data_filter(cfg):
+    dp_filter = DensePoseDataFilter(cfg)
+    return dp_filter
+
+
+def densepose_inference(densepose_outputs, detections):
+    """
+    Infer dense pose estimate based on outputs from the DensePose head
+    and detections. The estimate for each detection instance is stored in its
+    "pred_densepose" attribute.
+
+    Args:
+        densepose_outputs (tuple(`torch.Tensor`)): iterable containing 4 elements:
+            - s (:obj: `torch.Tensor`): segmentation tensor of size (N, A, H, W),
+            - i (:obj: `torch.Tensor`): classification tensor of size (N, C, H, W),
+            - u (:obj: `torch.Tensor`): U coordinates for each class of size (N, C, H, W),
+            - v (:obj: `torch.Tensor`): V coordinates for each class of size (N, C, H, W),
+            where N is the total number of detections in a batch,
+                  A is the number of segmentations classes (e.g. 15 for coarse body parts),
+                  C is the number of labels (e.g. 25 for fine body parts),
+                  W is the resolution along the X axis
+                  H is the resolution along the Y axis
+        detections (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Instances are modified by this method: "pred_densepose" attribute
+            is added to each instance, the attribute contains the corresponding
+            DensePoseOutput object.
+    """
+
+    # DensePose outputs: segmentation, body part indices, U, V
+    s, index_uv, u, v = densepose_outputs
+    k = 0
+    for detection in detections:
+        n_i = len(detection)
+        s_i = s[k : k + n_i]
+        index_uv_i = index_uv[k : k + n_i]
+        u_i = u[k : k + n_i]
+        v_i = v[k : k + n_i]
+        densepose_output_i = DensePoseOutput(s_i, index_uv_i, u_i, v_i)
+        detection.pred_densepose = densepose_output_i
+        k += n_i
+
+
+def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
+    """
+    Computes utility values for linear interpolation at points v.
+    The points are given as normalized offsets in the source interval
+    (v0_src, v0_src + size_src), more precisely:
+        v = v0_src + v_norm * size_src / 256.0
+    The computed utilities include lower points v_lo, upper points v_hi,
+    interpolation weights v_w and flags j_valid indicating whether the
+    points falls into the destination interval (v0_dst, v0_dst + size_dst).
+
+    Args:
+        v_norm (:obj: `torch.Tensor`): tensor of size N containing
+            normalized point offsets
+        v0_src (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of source intervals for normalized points
+        size_src (:obj: `torch.Tensor`): tensor of size N containing
+            source interval sizes for normalized points
+        v0_dst (:obj: `torch.Tensor`): tensor of size N containing
+            left bounds of destination intervals
+        size_dst (:obj: `torch.Tensor`): tensor of size N containing
+            destination interval sizes
+        size_z (int): interval size for data to be interpolated
+
+    Returns:
+        v_lo (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of lower values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_hi (:obj: `torch.Tensor`): int tensor of size N containing
+            indices of upper values used for interpolation, all values are
+            integers from [0, size_z - 1]
+        v_w (:obj: `torch.Tensor`): float tensor of size N containing
+            interpolation weights
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
+            0 for points outside the estimation interval
+            (v0_est, v0_est + size_est) and 1 otherwise
+    """
+    v = v0_src + v_norm * size_src / 256.0
+    j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
+    v_grid = (v - v0_dst) * size_z / size_dst
+    v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
+    v_hi = (v_lo + 1).clamp(max=size_z - 1)
+    v_grid = torch.min(v_hi.float(), v_grid)
+    v_w = v_grid - v_lo.float()
+    return v_lo, v_hi, v_w, j_valid
+
+
+def _grid_sampling_utilities(
+    zh, zw, bbox_xywh_est, bbox_xywh_gt, index_gt, x_norm, y_norm, index_bbox
+):
+    """
+    Prepare tensors used in grid sampling.
+
+    Args:
+        z_est (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with estimated
+            values of Z to be extracted for the points X, Y and channel
+            indices I
+        bbox_xywh_est (:obj: `torch.Tensor`): tensor of size (N, 4) containing
+            estimated bounding boxes in format XYWH
+        bbox_xywh_gt (:obj: `torch.Tensor`): tensor of size (N, 4) containing
+            matched ground truth bounding boxes in format XYWH
+        index_gt (:obj: `torch.Tensor`): tensor of size K with point labels for
+            ground truth points
+        x_norm (:obj: `torch.Tensor`): tensor of size K with X normalized
+            coordinates of ground truth points. Image X coordinates can be
+            obtained as X = Xbbox + x_norm * Wbbox / 255
+        y_norm (:obj: `torch.Tensor`): tensor of size K with Y normalized
+            coordinates of ground truth points. Image Y coordinates can be
+            obtained as Y = Ybbox + y_norm * Hbbox / 255
+        index_bbox (:obj: `torch.Tensor`): tensor of size K with bounding box
+            indices for each ground truth point. The values are thus in
+            [0, N-1]
+
+    Returns:
+        j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
+            0 for points to be discarded and 1 for points to be selected
+        y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
+            in z_est for each point
+        y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
+            in z_est for each point
+        x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
+            in z_est for each point
+        x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
+            in z_est for each point
+        w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-left value weight for each point
+        w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains upper-right value weight for each point
+        w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-left value weight for each point
+        w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
+            contains lower-right value weight for each point
+    """
+
+    x0_gt, y0_gt, w_gt, h_gt = bbox_xywh_gt[index_bbox].unbind(dim=1)
+    x0_est, y0_est, w_est, h_est = bbox_xywh_est[index_bbox].unbind(dim=1)
+    x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
+        x_norm, x0_gt, w_gt, x0_est, w_est, zw
+    )
+    y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
+        y_norm, y0_gt, h_gt, y0_est, h_est, zh
+    )
+    j_valid = jx_valid * jy_valid
+
+    w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
+    w_ylo_xhi = x_w * (1.0 - y_w)
+    w_yhi_xlo = (1.0 - x_w) * y_w
+    w_yhi_xhi = x_w * y_w
+
+    return j_valid, y_lo, y_hi, x_lo, x_hi, w_ylo_xlo, w_ylo_xhi, w_yhi_xlo, w_yhi_xhi
+
+
+def _extract_at_points_packed(
+    z_est,
+    index_bbox_valid,
+    slice_index_uv,
+    y_lo,
+    y_hi,
+    x_lo,
+    x_hi,
+    w_ylo_xlo,
+    w_ylo_xhi,
+    w_yhi_xlo,
+    w_yhi_xhi,
+):
+    """
+    Extract ground truth values z_gt for valid point indices and estimated
+    values z_est using bilinear interpolation over top-left (y_lo, x_lo),
+    top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
+    (y_hi, x_hi) values in z_est with corresponding weights:
+    w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
+    Use slice_index_uv to slice dim=1 in z_est
+    """
+    z_est_sampled = (
+        z_est[index_bbox_valid, slice_index_uv, y_lo, x_lo] * w_ylo_xlo
+        + z_est[index_bbox_valid, slice_index_uv, y_lo, x_hi] * w_ylo_xhi
+        + z_est[index_bbox_valid, slice_index_uv, y_hi, x_lo] * w_yhi_xlo
+        + z_est[index_bbox_valid, slice_index_uv, y_hi, x_hi] * w_yhi_xhi
+    )
+    return z_est_sampled
+
+
+def _resample_data(
+    z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode="nearest", padding_mode="zeros"
+):
+    """
+    Args:
+        z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
+            resampled
+        bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            source bounding boxes in format XYWH
+        bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
+            destination bounding boxes in format XYWH
+    Return:
+        zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout)
+            with resampled values of z, where D is the discretization size
+    """
+    n = bbox_xywh_src.size(0)
+    assert n == bbox_xywh_dst.size(0), (
+        "The number of "
+        "source ROIs for resampling ({}) should be equal to the number "
+        "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
+    )
+    x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
+    x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
+    x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
+    y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
+    x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
+    y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
+    grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
+    grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
+    grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
+    grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
+    dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
+    dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
+    x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
+    y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
+    grid_x = grid_w_expanded * dx_expanded + x0_expanded
+    grid_y = grid_h_expanded * dy_expanded + y0_expanded
+    grid = torch.stack((grid_x, grid_y), dim=3)
+    # resample Z from (N, C, H, W) into (N, C, Hout, Wout)
+    zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+    return zresampled
+
+
+def _extract_single_tensors_from_matches_one_image(
+    proposals_targets, bbox_with_dp_offset, bbox_global_offset
+):
+    i_gt_all = []
+    x_norm_all = []
+    y_norm_all = []
+    u_gt_all = []
+    v_gt_all = []
+    s_gt_all = []
+    bbox_xywh_gt_all = []
+    bbox_xywh_est_all = []
+    # Ibbox_all == k should be true for all data that corresponds
+    # to bbox_xywh_gt[k] and bbox_xywh_est[k]
+    # index k here is global wrt images
+    i_bbox_all = []
+    # at offset k (k is global) contains index of bounding box data
+    # within densepose output tensor
+    i_with_dp = []
+
+    boxes_xywh_est = proposals_targets.proposal_boxes.clone()
+    boxes_xywh_gt = proposals_targets.gt_boxes.clone()
+    n_i = len(boxes_xywh_est)
+    assert n_i == len(boxes_xywh_gt)
+
+    if n_i:
+        boxes_xywh_est.tensor[:, 2] -= boxes_xywh_est.tensor[:, 0]
+        boxes_xywh_est.tensor[:, 3] -= boxes_xywh_est.tensor[:, 1]
+        boxes_xywh_gt.tensor[:, 2] -= boxes_xywh_gt.tensor[:, 0]
+        boxes_xywh_gt.tensor[:, 3] -= boxes_xywh_gt.tensor[:, 1]
+        if hasattr(proposals_targets, "gt_densepose"):
+            densepose_gt = proposals_targets.gt_densepose
+            for k, box_xywh_est, box_xywh_gt, dp_gt in zip(
+                range(n_i), boxes_xywh_est.tensor, boxes_xywh_gt.tensor, densepose_gt
+            ):
+                if (dp_gt is not None) and (len(dp_gt.x) > 0):
+                    i_gt_all.append(dp_gt.i)
+                    x_norm_all.append(dp_gt.x)
+                    y_norm_all.append(dp_gt.y)
+                    u_gt_all.append(dp_gt.u)
+                    v_gt_all.append(dp_gt.v)
+                    s_gt_all.append(dp_gt.segm.unsqueeze(0))
+                    bbox_xywh_gt_all.append(box_xywh_gt.view(-1, 4))
+                    bbox_xywh_est_all.append(box_xywh_est.view(-1, 4))
+                    i_bbox_k = torch.full_like(dp_gt.i, bbox_with_dp_offset + len(i_with_dp))
+                    i_bbox_all.append(i_bbox_k)
+                    i_with_dp.append(bbox_global_offset + k)
+    return (
+        i_gt_all,
+        x_norm_all,
+        y_norm_all,
+        u_gt_all,
+        v_gt_all,
+        s_gt_all,
+        bbox_xywh_gt_all,
+        bbox_xywh_est_all,
+        i_bbox_all,
+        i_with_dp,
+    )
+
+
+def _extract_single_tensors_from_matches(proposals_with_targets):
+    """
+    Merge per-image DensePose training data into batch-level tensors.
+
+    Images whose `proposal_boxes` tensor is empty are skipped. For every other
+    image `_extract_single_tensors_from_matches_one_image` is called with the
+    number of entries collected in `i_with_dp` so far and the number of
+    proposals seen so far; its outputs are accumulated per field.
+
+    Args:
+        proposals_with_targets: iterable of per-image objects that expose
+            `proposal_boxes.tensor`
+
+    Returns:
+        tuple of 11 elements in the following order:
+            i_img, i_with_dp, bbox_xywh_est, bbox_xywh_gt, i_gt, x_norm,
+            y_norm, u_gt, v_gt, s_gt, i_bbox
+        `i_img` and `i_with_dp` are Python lists, `i_img` holding the image
+        index repeated once per `i_with_dp` entry of that image. The other
+        elements are tensors concatenated along dimension 0 (`i_gt` and
+        `i_bbox` converted to long), or None if nothing was collected.
+    """
+    # Tensor-valued fields, in the order the per-image helper returns them
+    field_names = (
+        "i_gt",
+        "x_norm",
+        "y_norm",
+        "u_gt",
+        "v_gt",
+        "s_gt",
+        "bbox_xywh_gt",
+        "bbox_xywh_est",
+        "i_bbox",
+    )
+    chunks = {name: [] for name in field_names}
+    image_indices = []
+    dp_indices = []
+    num_proposals_seen = 0
+    for image_idx, per_image in enumerate(proposals_with_targets):
+        num_proposals = per_image.proposal_boxes.tensor.size(0)
+        if not num_proposals:
+            continue
+        (
+            i_gt_img,
+            x_norm_img,
+            y_norm_img,
+            u_gt_img,
+            v_gt_img,
+            s_gt_img,
+            bbox_xywh_gt_img,
+            bbox_xywh_est_img,
+            i_bbox_img,
+            i_with_dp_img,
+        ) = _extract_single_tensors_from_matches_one_image(
+            per_image, len(dp_indices), num_proposals_seen
+        )
+        per_image_chunks = (
+            i_gt_img,
+            x_norm_img,
+            y_norm_img,
+            u_gt_img,
+            v_gt_img,
+            s_gt_img,
+            bbox_xywh_gt_img,
+            bbox_xywh_est_img,
+            i_bbox_img,
+        )
+        for name, values in zip(field_names, per_image_chunks):
+            chunks[name].extend(values)
+        dp_indices.extend(i_with_dp_img)
+        image_indices.extend([image_idx] * len(i_with_dp_img))
+        num_proposals_seen += num_proposals
+
+    if not ((num_proposals_seen > 0) and (len(dp_indices) > 0)):
+        # nothing to concatenate: every tensor slot is reported as None
+        return (image_indices, dp_indices) + (None,) * len(field_names)
+
+    # concatenate all data into a single tensor per field
+    merged = {name: torch.cat(chunks[name], 0) for name in field_names}
+    return (
+        image_indices,
+        dp_indices,
+        merged["bbox_xywh_est"],
+        merged["bbox_xywh_gt"],
+        merged["i_gt"].long(),
+        merged["x_norm"],
+        merged["y_norm"],
+        merged["u_gt"],
+        merged["v_gt"],
+        merged["s_gt"],
+        merged["i_bbox"].long(),
+    )
+
+
+class DensePoseLosses(object):
+    """
+    Callable that computes the four DensePose training losses
+    (`loss_densepose_U`, `loss_densepose_V`, `loss_densepose_I` and
+    `loss_densepose_S`) from the head outputs and the matched ground truth.
+    """
+
+    def __init__(self, cfg):
+        """
+        Read the heatmap size and the loss weights from
+        `cfg.MODEL.ROI_DENSEPOSE_HEAD`.
+        """
+        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
+        self.heatmap_size = head_cfg.HEATMAP_SIZE
+        self.w_points = head_cfg.POINT_REGRESSION_WEIGHTS
+        self.w_part = head_cfg.PART_WEIGHTS
+        self.w_segm = head_cfg.INDEX_WEIGHTS
+
+    def __call__(self, proposals_with_gt, densepose_outputs):
+        """
+        Args:
+            proposals_with_gt: per-image proposals with ground truth, forwarded
+                to `_extract_single_tensors_from_matches`
+            densepose_outputs: tuple `(s, index_uv, u, v)` of outputs computed
+                for all images and all bounding boxes; i.e. if a batch has
+                4 images with (3, 1, 2, 1) proposals respectively, the outputs
+                will have size(0) == 3+1+2+1 == 7
+
+        Returns:
+            dict: loss name -> loss tensor
+        """
+        s, index_uv, u, v = densepose_outputs
+        # U, V and part index outputs must share their last two dimensions
+        assert u.size(2) == v.size(2)
+        assert u.size(3) == v.size(3)
+        assert u.size(2) == index_uv.size(2)
+        assert u.size(3) == index_uv.size(3)
+
+        with torch.no_grad():
+            (
+                index_uv_img,
+                i_with_dp,
+                bbox_xywh_est,
+                bbox_xywh_gt,
+                index_gt_all,
+                x_norm,
+                y_norm,
+                u_gt_all,
+                v_gt_all,
+                s_gt,
+                index_bbox,
+            ) = _extract_single_tensors_from_matches(proposals_with_gt)
+
+        # NOTE: we need to keep the same computation graph on all the GPUs to
+        # perform reduction properly. Hence even if we have no data on one
+        # of the GPUs, we still need to generate the computation graph.
+        # Add fake (zero) loss in the form Tensor.sum() * 0
+        if len(i_with_dp) == 0:
+            return {
+                "loss_densepose_U": u.sum() * 0,
+                "loss_densepose_V": v.sum() * 0,
+                "loss_densepose_I": index_uv.sum() * 0,
+                "loss_densepose_S": s.sum() * 0,
+            }
+
+        grid_h, grid_w = u.size(2), u.size(3)
+        (
+            j_valid,
+            y_lo,
+            y_hi,
+            x_lo,
+            x_hi,
+            w_ylo_xlo,
+            w_ylo_xhi,
+            w_yhi_xlo,
+            w_yhi_xhi,
+        ) = _grid_sampling_utilities(
+            grid_h, grid_w, bbox_xywh_est, bbox_xywh_gt, index_gt_all, x_norm, y_norm, index_bbox
+        )
+        # The same corner indices and weights are reused for U, V and part index
+        corners = (y_lo, y_hi, x_lo, x_hi)
+        weights = (w_ylo_xlo, w_ylo_xhi, w_yhi_xlo, w_yhi_xhi)
+
+        # valid points that carry a non-zero ground truth label
+        j_valid_fg = j_valid * (index_gt_all > 0)
+
+        u_gt = u_gt_all[j_valid_fg]
+        u_est = _extract_at_points_packed(
+            u[i_with_dp], index_bbox, index_gt_all, *corners, *weights
+        )[j_valid_fg]
+
+        v_gt = v_gt_all[j_valid_fg]
+        v_est = _extract_at_points_packed(
+            v[i_with_dp], index_bbox, index_gt_all, *corners, *weights
+        )[j_valid_fg]
+
+        index_uv_gt = index_gt_all[j_valid]
+        # all channels are extracted here, hence the extra weight dimension
+        index_uv_est = _extract_at_points_packed(
+            index_uv[i_with_dp],
+            index_bbox,
+            slice(None),
+            *corners,
+            *(weight[:, None] for weight in weights),
+        )[j_valid, :]
+
+        # Resample everything to the estimated data size, no need to resample
+        # S_est then:
+        s_est = s[i_with_dp]
+        with torch.no_grad():
+            s_gt_resampled = _resample_data(
+                s_gt.unsqueeze(1),
+                bbox_xywh_gt,
+                bbox_xywh_est,
+                self.heatmap_size,
+                self.heatmap_size,
+                mode="nearest",
+                padding_mode="zeros",
+            )
+            s_gt = s_gt_resampled.squeeze(1)
+
+        # point-based losses, followed by the segmentation loss
+        u_loss = self.w_points * F.smooth_l1_loss(u_est, u_gt, reduction="sum")
+        v_loss = self.w_points * F.smooth_l1_loss(v_est, v_gt, reduction="sum")
+        index_uv_loss = self.w_part * F.cross_entropy(index_uv_est, index_uv_gt.long())
+        s_loss = self.w_segm * F.cross_entropy(s_est, s_gt.long())
+        return {
+            "loss_densepose_U": u_loss,
+            "loss_densepose_V": v_loss,
+            "loss_densepose_I": index_uv_loss,
+            "loss_densepose_S": s_loss,
+        }
+
+
+def build_densepose_losses(cfg):
+    """
+    Create the DensePose loss computation object configured by `cfg`.
+    """
+    return DensePoseLosses(cfg)
diff --git a/projects/DensePose/densepose/evaluator.py b/projects/DensePose/densepose/evaluator.py
new file mode 100644
index 0000000..161c389
--- /dev/null
+++ b/projects/DensePose/densepose/evaluator.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import contextlib
+import copy
+import io
+import itertools
+import json
+import logging
+import os
+from collections import OrderedDict
+import torch
+from pycocotools.coco import COCO
+
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import all_gather, is_main_process, synchronize
+from detectron2.utils.logger import create_small_table
+
+from .densepose_coco_evaluation import DensePoseCocoEval
+
+
+class DensePoseCOCOEvaluator(DatasetEvaluator):
+    """
+    Dataset evaluator that collects DensePose predictions and scores them
+    against the COCO-format ground truth of a registered dataset.
+    """
+
+    def __init__(self, dataset_name, distributed, output_dir=None):
+        """
+        Args:
+            dataset_name (str): name of the dataset to look up in
+                `MetadataCatalog`; its metadata must provide `json_file`
+            distributed (bool): if True, predictions from all processes are
+                gathered and evaluated on the main process only
+            output_dir (str, optional): directory where the predictions are
+                dumped as "coco_densepose_results.json"; nothing is written
+                when it is empty or None
+        """
+        self._distributed = distributed
+        self._output_dir = output_dir
+
+        self._cpu_device = torch.device("cpu")
+        self._logger = logging.getLogger(__name__)
+        # Start with an empty buffer so that process() also works when the
+        # caller did not invoke reset() first
+        self._predictions = []
+
+        self._metadata = MetadataCatalog.get(dataset_name)
+        # Discard whatever the COCO constructor prints to stdout
+        with contextlib.redirect_stdout(io.StringIO()):
+            self._coco_api = COCO(self._metadata.json_file)
+
+    def reset(self):
+        """
+        Drop all predictions collected so far.
+        """
+        self._predictions = []
+
+    def process(self, inputs, outputs):
+        """
+        Args:
+            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+                It is a list of dict. Each dict corresponds to an image and
+                contains keys like "height", "width", "file_name", "image_id".
+            outputs: the outputs of a COCO model. It is a list of dicts with key
+                "instances" that contains :class:`Instances`.
+                The :class:`Instances` object needs to have `densepose` field.
+        """
+        # `image_input` rather than `input`, to avoid shadowing the builtin
+        for image_input, output in zip(inputs, outputs):
+            instances = output["instances"].to(self._cpu_device)
+
+            # results are stored relative to XYWH boxes
+            boxes = instances.pred_boxes.tensor.clone()
+            boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+            instances.pred_densepose = instances.pred_densepose.to_result(boxes)
+
+            json_results = prediction_to_json(instances, image_input["image_id"])
+            self._predictions.extend(json_results)
+
+    def evaluate(self):
+        """
+        Evaluate the collected predictions.
+
+        Returns:
+            A deep copy of the results dict produced by `_eval_predictions`.
+            In distributed mode every process takes part in gathering the
+            predictions, but only the main process evaluates; the other
+            processes return None.
+        """
+        if self._distributed:
+            synchronize()
+            self._predictions = all_gather(self._predictions)
+            self._predictions = list(itertools.chain(*self._predictions))
+            if not is_main_process():
+                return
+
+        return copy.deepcopy(self._eval_predictions())
+
+    def _eval_predictions(self):
+        """
+        Evaluate self._predictions on densepose.
+        Return results with the metrics of the tasks.
+        """
+        self._logger.info("Preparing results for COCO format ...")
+
+        if self._output_dir:
+            # The output directory may not exist yet; create it instead of
+            # failing with FileNotFoundError after inference has finished
+            os.makedirs(self._output_dir, exist_ok=True)
+            file_path = os.path.join(self._output_dir, "coco_densepose_results.json")
+            with open(file_path, "w") as f:
+                json.dump(self._predictions, f)
+                # push the data to disk before the evaluation starts
+                f.flush()
+                os.fsync(f.fileno())
+
+        self._logger.info("Evaluating predictions ...")
+        res = OrderedDict()
+        res["densepose"] = _evaluate_predictions_on_coco(self._coco_api, self._predictions)
+        return res
+
+
+def prediction_to_json(instances, img_id):
+    """
+    Convert the predictions for one image into a list of result records.
+
+    Args:
+        instances (Instances): the output of the model; must provide `scores`
+            and an indexable `pred_densepose`
+        img_id: the image id in COCO, stored unchanged under "image_id"
+
+    Returns:
+        list[dict]: the results in densepose evaluation format,
+            one dict per instance
+    """
+    scores = instances.scores.tolist()
+
+    def _record(index):
+        entry = instances.pred_densepose[index]
+        return {
+            "image_id": img_id,
+            "category_id": 1,  # densepose only has one class
+            "bbox": entry[1],
+            "score": scores[index],
+            "densepose": entry,
+        }
+
+    return [_record(index) for index in range(len(instances))]
+
+
+def _evaluate_predictions_on_coco(coco_gt, coco_results):
+    """
+    Run the DensePose COCO evaluation on the given results.
+
+    Args:
+        coco_gt: ground truth COCO API object; must provide `loadRes`
+        coco_results (list[dict]): predictions in densepose evaluation format
+
+    Returns:
+        dict: metric name ("AP", "AP50", "AP75", "APm", "APl") -> value.
+            Values are the corresponding evaluation stats multiplied by 100,
+            or -1 for every metric if `coco_results` is empty.
+    """
+    metrics = ["AP", "AP50", "AP75", "APm", "APl"]
+
+    logger = logging.getLogger(__name__)
+
+    if len(coco_results) == 0:  # cocoapi does not handle empty results very well
+        # Logger.warn is a deprecated alias of Logger.warning
+        logger.warning("No predictions from the model! Set scores to -1")
+        return {metric: -1 for metric in metrics}
+
+    coco_dt = coco_gt.loadRes(coco_results)
+    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose")
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+
+    # the standard metrics: stats are listed in the same order as `metrics`
+    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+    logger.info("Evaluation results for densepose: \n" + create_small_table(results))
+    return results
diff --git a/projects/DensePose/densepose/roi_head.py b/projects/DensePose/densepose/roi_head.py
new file mode 100644
index 0000000..4f3d272
--- /dev/null
+++ b/projects/DensePose/densepose/roi_head.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import torch
+
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads import select_foreground_proposals
+
+from .densepose_head import (
+    build_densepose_data_filter,
+    build_densepose_head,
+    build_densepose_losses,
+    build_densepose_predictor,
+    densepose_inference,
+)
+
+
+@ROI_HEADS_REGISTRY.register()
+class DensePoseROIHeads(StandardROIHeads):
+    """
+    A Standard ROIHeads which contains an addition of DensePose head.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+        self._init_densepose_head(cfg)
+
+    def _init_densepose_head(self, cfg):
+        """
+        Create the DensePose data filter, pooler, head, predictor and losses.
+        Nothing besides `densepose_on` is set if `cfg.MODEL.DENSEPOSE_ON`
+        is false.
+        """
+        # fmt: off
+        self.densepose_on          = cfg.MODEL.DENSEPOSE_ON
+        if not self.densepose_on:
+            return
+        self.densepose_data_filter = build_densepose_data_filter(cfg)
+        dp_pooler_resolution       = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
+        dp_pooler_scales           = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
+        dp_pooler_sampling_ratio   = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
+        dp_pooler_type             = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
+        # fmt: on
+        # the channel count of the first input feature is passed to the head
+        in_channels = [self.feature_channels[f] for f in self.in_features][0]
+        self.densepose_pooler = ROIPooler(
+            output_size=dp_pooler_resolution,
+            scales=dp_pooler_scales,
+            sampling_ratio=dp_pooler_sampling_ratio,
+            pooler_type=dp_pooler_type,
+        )
+        self.densepose_head = build_densepose_head(cfg, in_channels)
+        self.densepose_predictor = build_densepose_predictor(
+            cfg, self.densepose_head.n_out_channels
+        )
+        self.densepose_losses = build_densepose_losses(cfg)
+
+    def _forward_densepose(self, features, instances):
+        """
+        Forward logic of the densepose prediction branch.
+
+        Args:
+            features (list[Tensor]): #level input features for densepose prediction
+            instances (list[Instances]): the per-image instances to train/predict densepose.
+                In training, they can be the proposals.
+                In inference, they can be the predicted boxes.
+
+        Returns:
+            In training, a dict of losses; the dict is empty if the DensePose
+            head is disabled or if no proposals are left after filtering.
+            In inference, update `instances` with new fields "densepose" and return it.
+        """
+        if not self.densepose_on:
+            return {} if self.training else instances
+
+        if self.training:
+            proposals, _ = select_foreground_proposals(instances, self.num_classes)
+            proposals_dp = self.densepose_data_filter(proposals)
+            if len(proposals_dp) == 0:
+                # Nothing to supervise. Return an empty dict rather than
+                # falling through with None: the caller passes the result to
+                # `dict.update`, which raises TypeError for None.
+                return {}
+            proposal_boxes = [x.proposal_boxes for x in proposals_dp]
+            features_dp = self.densepose_pooler(features, proposal_boxes)
+            densepose_head_outputs = self.densepose_head(features_dp)
+            densepose_outputs, _ = self.densepose_predictor(densepose_head_outputs)
+            densepose_loss_dict = self.densepose_losses(proposals_dp, densepose_outputs)
+            return densepose_loss_dict
+        else:
+            pred_boxes = [x.pred_boxes for x in instances]
+            features_dp = self.densepose_pooler(features, pred_boxes)
+            if len(features_dp) > 0:
+                densepose_head_outputs = self.densepose_head(features_dp)
+                densepose_outputs, _ = self.densepose_predictor(densepose_head_outputs)
+            else:
+                # If no detection occurred instances
+                # set densepose_outputs to empty tensors
+                empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device)
+                densepose_outputs = tuple([empty_tensor] * 4)
+
+            densepose_inference(densepose_outputs, instances)
+            return instances
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        Run the standard ROI heads and then the DensePose branch.
+
+        Returns:
+            tuple `(instances, losses)` as produced by the parent class, with
+            DensePose losses merged into `losses` in training and DensePose
+            predictions attached to `instances` in inference.
+        """
+        # select the input feature maps before the parent forward is run
+        features_list = [features[f] for f in self.in_features]
+
+        instances, losses = super().forward(images, features, proposals, targets)
+        del targets, images
+
+        if self.training:
+            losses.update(self._forward_densepose(features_list, instances))
+        else:
+            instances = self._forward_densepose(features_list, instances)
+        return instances, losses
diff --git a/projects/DensePose/densepose/structures.py b/projects/DensePose/densepose/structures.py
new file mode 100644
index 0000000..9d3465b
--- /dev/null
+++ b/projects/DensePose/densepose/structures.py
@@ -0,0 +1,519 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import base64
+import numpy as np
+from io import BytesIO
+import torch
+from PIL import Image
+from torch.nn import functional as F
+
+
+class DensePoseTransformData(object):
+    """
+    Label and UV symmetry data used to adjust DensePose annotations
+    under horizontal flip.
+    """
+
+    # Horizontal symmetry label transforms used for horizontal flip
+    MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
+    # fmt: off
+    POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23]  # noqa
+    # fmt: on
+
+    def __init__(self, uv_symmetries):
+        """
+        Args:
+            uv_symmetries: mapping with "U_transforms" and "V_transforms"
+                entries, as built by `load`
+        """
+        cls = DensePoseTransformData
+        self.mask_label_symmetries = cls.MASK_LABEL_SYMMETRIES
+        self.point_label_symmetries = cls.POINT_LABEL_SYMMETRIES
+        self.uv_symmetries = uv_symmetries
+
+    @staticmethod
+    def load(fpath):
+        """
+        Load UV symmetry maps from the MATLAB file `fpath` and wrap them into
+        a DensePoseTransformData instance. The file must contain the variables
+        "U_transforms" and "V_transforms"; every element `[0, i]` of each
+        variable is converted to a float tensor.
+        """
+        import scipy.io
+
+        mat_contents = scipy.io.loadmat(fpath)
+        uv_symmetries = {
+            name: [
+                torch.from_numpy(mat_contents[name][0, idx]).to(dtype=torch.float)
+                for idx in range(mat_contents[name].shape[1])
+            ]
+            for name in ("U_transforms", "V_transforms")
+        }
+        return DensePoseTransformData(uv_symmetries)
+
+
+class DensePoseDataRelative(object):
+    """
+    Dense pose relative annotations that can be applied to any bounding box:
+        x - normalized X coordinates [0, 255] of annotated points
+        y - normalized Y coordinates [0, 255] of annotated points
+        i - body part labels 0,...,24 for annotated points
+        u - body part U coordinates [0, 1] for annotated points
+        v - body part V coordinates [0, 1] for annotated points
+        segm - 256x256 segmentation mask with values 0,...,14
+    To obtain absolute x and y data wrt some bounding box one needs to first
+    divide the data by 256, multiply by the respective bounding box size
+    and add bounding box offset:
+        x_img = x0 + x_norm * w / 256.0
+        y_img = y0 + y_norm * h / 256.0
+    Segmentation masks are typically sampled to get image-based masks.
+    """
+
+    # Key for normalized X coordinates in annotation dict
+    X_KEY = "dp_x"
+    # Key for normalized Y coordinates in annotation dict
+    Y_KEY = "dp_y"
+    # Key for U part coordinates in annotation dict
+    U_KEY = "dp_U"
+    # Key for V part coordinates in annotation dict
+    V_KEY = "dp_V"
+    # Key for I point labels in annotation dict
+    I_KEY = "dp_I"
+    # Key for segmentation mask in annotation dict
+    S_KEY = "dp_masks"
+    # Number of body parts in segmentation masks
+    N_BODY_PARTS = 14
+    # Number of parts in point labels
+    N_PART_LABELS = 24
+    # Side length of the square segmentation mask
+    MASK_SIZE = 256
+    # All DensePose annotation keys, in the order in which they are validated
+    _ANNOTATION_KEYS = (X_KEY, Y_KEY, I_KEY, U_KEY, V_KEY, S_KEY)
+
+    def __init__(self, annotation, cleanup=False):
+        """
+        Args:
+            annotation (dict): annotation that contains all DensePose keys
+            cleanup (bool): if True, the DensePose keys are removed from
+                `annotation` once the data has been extracted
+
+        Raises:
+            AssertionError: if a DensePose key is missing from `annotation`
+        """
+        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+        assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid)
+        self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
+        self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
+        self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
+        self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
+        self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
+        self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
+        self.device = torch.device("cpu")
+        if cleanup:
+            DensePoseDataRelative.cleanup_annotation(annotation)
+
+    def to(self, device):
+        """
+        Return the data on `device`: `self` if it is already there,
+        otherwise a new object with all tensors moved.
+        """
+        if self.device == device:
+            return self
+        # bypass __init__, which expects a raw annotation dict
+        new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
+        new_data.x = self.x.to(device)
+        new_data.y = self.y.to(device)
+        new_data.i = self.i.to(device)
+        new_data.u = self.u.to(device)
+        new_data.v = self.v.to(device)
+        new_data.segm = self.segm.to(device)
+        new_data.device = device
+        return new_data
+
+    @staticmethod
+    def extract_segmentation_mask(annotation):
+        """
+        Decode the per-part masks stored under `S_KEY` into a single
+        MASK_SIZE x MASK_SIZE float tensor, where pixels of part `i`
+        (0-based) get the value `i + 1` and all other pixels stay 0.
+        Later parts overwrite earlier ones where masks overlap.
+        """
+        import pycocotools.mask as mask_utils
+
+        poly_specs = annotation[DensePoseDataRelative.S_KEY]
+        segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
+        for i in range(DensePoseDataRelative.N_BODY_PARTS):
+            poly_i = poly_specs[i]
+            if poly_i:
+                mask_i = mask_utils.decode(poly_i)
+                segm[mask_i > 0] = i + 1
+        return segm
+
+    @staticmethod
+    def validate_annotation(annotation):
+        """
+        Check that all DensePose keys are present in `annotation`.
+
+        Returns:
+            tuple `(True, None)` if the annotation is valid, otherwise
+            `(False, reason)` with the reason naming the first missing key
+        """
+        for key in DensePoseDataRelative._ANNOTATION_KEYS:
+            if key not in annotation:
+                return False, "no {key} data in the annotation".format(key=key)
+        return True, None
+
+    @staticmethod
+    def cleanup_annotation(annotation):
+        """
+        Remove all DensePose keys from `annotation` in place;
+        keys that are not present are ignored.
+        """
+        for key in DensePoseDataRelative._ANNOTATION_KEYS:
+            annotation.pop(key, None)
+
+    def apply_transform(self, transforms, densepose_transform_data):
+        """
+        Update points and segmentation in place according to `transforms`.
+        Only horizontal flips have an effect.
+        """
+        self._transform_pts(transforms, densepose_transform_data)
+        self._transform_segm(transforms, densepose_transform_data)
+
+    def _transform_pts(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        # An even number of flips cancels out
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.x = self.segm.size(1) - self.x
+            self._flip_iuv_semantics(dp_transform_data)
+
+    def _flip_iuv_semantics(self, dp_transform_data):
+        """
+        Replace point labels by their symmetric counterparts and look up the
+        corresponding U / V values in the symmetry maps.
+        """
+        # labels are rewritten in place, so match against the original values
+        i_old = self.i.clone()
+        uv_symmetries = dp_transform_data.uv_symmetries
+        pt_label_symmetries = dp_transform_data.point_label_symmetries
+        for i in range(self.N_PART_LABELS):
+            if pt_label_symmetries[i + 1] != i + 1:
+                annot_indices_i = i_old == i + 1
+                self.i[annot_indices_i] = pt_label_symmetries[i + 1]
+                # U / V values are scaled by 255 to index the symmetry maps
+                u_loc = (self.u[annot_indices_i] * 255).long()
+                v_loc = (self.v[annot_indices_i] * 255).long()
+                self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc]
+                self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc]
+
+    def _transform_segm(self, transforms, dp_transform_data):
+        import detectron2.data.transforms as T
+
+        # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+        # An even number of flips cancels out
+        do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+        if do_hflip:
+            self.segm = torch.flip(self.segm, [1])
+            self._flip_segm_semantics(dp_transform_data)
+
+    def _flip_segm_semantics(self, dp_transform_data):
+        """
+        Replace segmentation labels by their symmetric counterparts.
+        """
+        # labels are rewritten in place, so match against the original values
+        old_segm = self.segm.clone()
+        mask_label_symmetries = dp_transform_data.mask_label_symmetries
+        for i in range(self.N_BODY_PARTS):
+            if mask_label_symmetries[i + 1] != i + 1:
+                self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
+
+
+def normalized_coords_transform(x0, y0, w, h):
+    """
+    Build a function that maps a point `p = (x, y, ...)` from the box with
+    top left corner `(x0, y0)` and size `(w, h)` to normalized coordinates:
+    the top left corner goes to (-1, -1) and the bottom right corner
+    to (1, 1). Used to initialize the grid for grid sampling.
+    """
+
+    def to_normalized(p):
+        point_x, point_y = p[0], p[1]
+        x_normalized = 2 * (point_x - x0) / w - 1
+        y_normalized = 2 * (point_y - y0) / h - 1
+        return (x_normalized, y_normalized)
+
+    return to_normalized
+
+
+class DensePoseOutput(object):
+    """
+    Container for the raw DensePose head outputs:
+        S - segmentation output
+        I - part index output
+        U - U coordinates output
+        V - V coordinates output
+    All four are 4-dimensional (NCHW) tensors; I, U and V have the same shape,
+    S has the same number of entries and the same plane size as I.
+    """
+
+    def __init__(self, S, I, U, V):
+        self.S = S
+        self.I = I  # noqa: E741
+        self.U = U
+        self.V = V
+        self._check_output_dims(S, I, U, V)
+
+    def _check_output_dims(self, S, I, U, V):
+        """
+        Assert that the outputs are 4-dimensional and have consistent shapes.
+
+        Raises:
+            AssertionError: if any of the checks fails
+        """
+        # Each message names the tensor that actually failed the check
+        assert len(S.size()) == 4, (
+            "Segmentation output should have 4 "
+            "dimensions (NCHW), but has size {}".format(S.size())
+        )
+        assert len(I.size()) == 4, (
+            "Part index output should have 4 "
+            "dimensions (NCHW), but has size {}".format(I.size())
+        )
+        assert len(U.size()) == 4, (
+            "U coordinates output should have 4 "
+            "dimensions (NCHW), but has size {}".format(U.size())
+        )
+        assert len(V.size()) == 4, (
+            "V coordinates output should have 4 "
+            "dimensions (NCHW), but has size {}".format(V.size())
+        )
+        assert len(S) == len(I), (
+            "Number of output segmentation planes {} "
+            "should be equal to the number of output part index "
+            "planes {}".format(len(S), len(I))
+        )
+        assert S.size()[2:] == I.size()[2:], (
+            "Output segmentation plane size {} "
+            "should be equal to the output part index "
+            "plane size {}".format(S.size()[2:], I.size()[2:])
+        )
+        assert I.size() == U.size(), (
+            "Part index output shape {} "
+            "should be the same as U coordinates output shape {}".format(I.size(), U.size())
+        )
+        assert I.size() == V.size(), (
+            "Part index output shape {} "
+            "should be the same as V coordinates output shape {}".format(I.size(), V.size())
+        )
+
+    def resize(self, image_size_hw):
+        # do nothing - outputs are invariant to resize
+        pass
+
+    def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh):
+        """
+        Resample S, I, U, V from bbox_old to the cropped bbox_new
+
+        Args:
+            S, I, U, V: per-instance tensors whose dimensions 1 and 2 are
+                treated as height and width
+            bbox_old_xywh: box (x0, y0, w, h) the tensors currently refer to
+            bbox_new_xywh: box (x0, y0, w, h) to resample to
+
+        Returns:
+            tuple (S_new, I_new, U_new, V_new) of resampled tensors
+        """
+        x0old, y0old, wold, hold = bbox_old_xywh
+        x0new, y0new, wnew, hnew = bbox_new_xywh
+        # express the corners of the new box in coordinates normalized wrt the old box
+        tr_coords = normalized_coords_transform(x0old, y0old, wold, hold)
+        topleft = (x0new, y0new)
+        bottomright = (x0new + wnew, y0new + hnew)
+        topleft_norm = tr_coords(topleft)
+        bottomright_norm = tr_coords(bottomright)
+        hsize = S.size(1)
+        wsize = S.size(2)
+        # arange may yield one extra element due to rounding, hence the slicing
+        ys = torch.arange(
+            topleft_norm[1], bottomright_norm[1], (bottomright_norm[1] - topleft_norm[1]) / hsize
+        )[:hsize]
+        xs = torch.arange(
+            topleft_norm[0], bottomright_norm[0], (bottomright_norm[0] - topleft_norm[0]) / wsize
+        )[:wsize]
+        grid_y, grid_x = torch.meshgrid(ys, xs)
+        # grid_sample reads the last grid dimension as (x, y), in this order
+        grid = torch.stack((grid_x, grid_y), dim=2).to(S.device)
+        assert (
+            grid.size(0) == hsize
+        ), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0))
+        assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format(
+            wsize, grid.size(1)
+        )
+        batched_grid = torch.unsqueeze(grid, 0)
+
+        def _resample(planes):
+            return F.grid_sample(
+                planes.unsqueeze(0),
+                batched_grid,
+                mode="bilinear",
+                padding_mode="border",
+                align_corners=True,
+            ).squeeze(0)
+
+        return _resample(S), _resample(I), _resample(U), _resample(V)
+
+    def crop(self, indices_cropped, bboxes_old, bboxes_new):
+        """
+        Crop outputs for selected bounding boxes to the new bounding boxes.
+        """
+        # VK: cropping is ignored for now
+        # for i, ic in enumerate(indices_cropped):
+        #    self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \
+        #        self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic],
+        #        bboxes_old[i], bboxes_new[i])
+        pass
+
+    def to_result(self, boxes_xywh):
+        """
+        Convert DensePose outputs to results format. Results are more compact,
+        but cannot be resampled any more
+        """
+        result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V)
+        return result
+
+    def __getitem__(self, item):
+        """
+        Select outputs by index; the result is always a DensePoseOutput
+        with 4-dimensional tensors, also for an integer index.
+        """
+        if isinstance(item, int):
+            # integer indexing drops the first dimension, restore it
+            S_selected = self.S[item].unsqueeze(0)
+            I_selected = self.I[item].unsqueeze(0)
+            U_selected = self.U[item].unsqueeze(0)
+            V_selected = self.V[item].unsqueeze(0)
+        else:
+            S_selected = self.S[item]
+            I_selected = self.I[item]
+            U_selected = self.U[item]
+            V_selected = self.V[item]
+        return DensePoseOutput(S_selected, I_selected, U_selected, V_selected)
+
+    def __str__(self):
+        s = "DensePoseOutput S {}, I {}, U {}, V {}".format(
+            list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size())
+        )
+        return s
+
+    def __len__(self):
+        return self.S.size(0)
+
+
+class DensePoseResult(object):
+    """
+    Compact DensePose results: for every bounding box a uint8 array of size
+    (3, H, W) with part labels, U and V values (scaled by 255), stored as
+    a pair (array shape, Base64-encoded PNG data).
+    """
+
+    def __init__(self, boxes_xywh, S, I, U, V):
+        """
+        Args:
+            boxes_xywh (Tensor): boxes of size (N, 4) in XYWH format
+            S, I, U, V (Tensor): DensePose outputs; entry `i` along the first
+                dimension corresponds to box `i`
+        """
+        assert len(boxes_xywh.size()) == 2
+        assert boxes_xywh.size(1) == 4
+        self.results = []
+        self.boxes_xywh = boxes_xywh.cpu().tolist()
+        for i, box_xywh in enumerate(boxes_xywh):
+            # list indexing keeps the first dimension of the selected outputs
+            result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]])
+            result_numpy_i = result_i.cpu().numpy()
+            result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i)
+            result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i)
+            self.results.append(result_encoded_with_shape_i)
+
+    def __str__(self):
+        s = "DensePoseResult: N={} [{}]".format(
+            len(self.results), ", ".join([str(list(r[0])) for r in self.results])
+        )
+        return s
+
+    def _output_to_result(self, box_xywh, S, I, U, V):
+        """
+        Resample the outputs of a single instance to the box size and pack
+        them into one uint8 tensor of size (3, h, w):
+            plane 0 - part label (0 where the segmentation label is 0)
+            plane 1 - U value of the labeled part, scaled to [0, 255]
+            plane 2 - V value of the labeled part, scaled to [0, 255]
+        Box width and height are truncated to integers and are at least 1.
+        """
+        x, y, w, h = box_xywh
+        w = max(int(w), 1)
+        h = max(int(h), 1)
+        result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device)
+        assert (
+            len(S.size()) == 4
+        ), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
+        s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+        assert (
+            len(I.size()) == 4
+        ), "IndexUV tensor size should have {} " "dimensions but has {}".format(4, len(I.size()))
+        # part labels are kept only where the segmentation label is non-zero
+        i_bbox = (
+            F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+            * (s_bbox > 0).long()
+        ).squeeze(0)
+        assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format(
+            4, len(U.size())
+        )
+        u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False)
+        assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format(
+            4, len(V.size())
+        )
+        v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False)
+        result[0] = i_bbox
+        for part_id in range(1, u_bbox.size(1)):
+            # the mask is shared by the U and V planes, compute it once
+            part_mask = i_bbox == part_id
+            result[1][part_mask] = (
+                (u_bbox[0, part_id][part_mask] * 255).clamp(0, 255).to(torch.uint8)
+            )
+            result[2][part_mask] = (
+                (v_bbox[0, part_id][part_mask] * 255).clamp(0, 255).to(torch.uint8)
+            )
+        assert (
+            result.size(1) == h
+        ), "Results height {} should be equal " "to bounding box height {}".format(result.size(1), h)
+        assert (
+            result.size(2) == w
+        ), "Results width {} should be equal " "to bounding box width {}".format(result.size(2), w)
+        return result
+
+    @staticmethod
+    def encode_png_data(arr):
+        """
+        Encode array data as a PNG image using the highest compression rate
+        @param arr [in] Data stored in an array of size (3, M, N) of type uint8
+        @return Base64-encoded string containing PNG-compressed data
+        """
+        assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format(
+            len(arr.shape)
+        )
+        assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format(
+            arr.shape[0]
+        )
+        assert arr.dtype == np.uint8, "Expected an array of type np.uint8," " got {0}".format(
+            arr.dtype
+        )
+        # channels go last for the image representation
+        data = np.moveaxis(arr, 0, -1)
+        im = Image.fromarray(data)
+        fstream = BytesIO()
+        im.save(fstream, format="png", optimize=True)
+        s = base64.encodebytes(fstream.getvalue()).decode()
+        return s
+
+    @staticmethod
+    def decode_png_data(shape, s):
+        """
+        Decode array data from a string that contains PNG-compressed data
+        @param shape [in] Shape (3, M, N) of the array to restore
+        @param s [in] Base64-encoded string containing PNG-compressed data
+        @return Data stored in an array of size (3, M, N) of type uint8
+        """
+        fstream = BytesIO(base64.decodebytes(s.encode()))
+        im = Image.open(fstream)
+        # pixel data comes with channels last, move them to the front
+        data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0)
+        return data.reshape(shape)
+
+    def __len__(self):
+        return len(self.results)
+
+    def __getitem__(self, item):
+        """
+        Return the pair (encoded result with shape, XYWH box) for `item`.
+        """
+        result_encoded = self.results[item]
+        bbox_xywh = self.boxes_xywh[item]
+        return result_encoded, bbox_xywh
+
+
+class DensePoseList(object):
+    """
+    List of per-instance DensePose annotations (DensePoseDataRelative or None)
+    together with the matching absolute XYXY boxes and the image size.
+    """
+
+    _TORCH_DEVICE_CPU = torch.device("cpu")
+
+    def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
+        """
+        Args:
+            densepose_datas: sequence of DensePoseDataRelative objects or None
+            boxes_xyxy_abs: boxes, one per entry of `densepose_datas`
+            image_size_hw: image size as (height, width)
+            device: device to which the data and the boxes are moved
+        """
+        num_datas, num_boxes = len(densepose_datas), len(boxes_xyxy_abs)
+        assert num_datas == num_boxes, (
+            "Attempt to initialize DensePoseList with {} DensePose datas "
+            "and {} boxes".format(num_datas, num_boxes)
+        )
+        self.densepose_datas = []
+        for densepose_data in densepose_datas:
+            assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
+                "Attempt to initialize DensePoseList with DensePose datas "
+                "of type {}, expected DensePoseDataRelative".format(type(densepose_data))
+            )
+            if densepose_data is None:
+                # instances without DensePose data are kept as placeholders
+                data_on_device = None
+            else:
+                data_on_device = densepose_data.to(device)
+            self.densepose_datas.append(data_on_device)
+        self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
+        self.image_size_hw = image_size_hw
+        self.device = device
+
+    def to(self, device):
+        """
+        Return the list on `device`: `self` if it is already there,
+        otherwise a new DensePoseList.
+        """
+        if self.device != device:
+            return DensePoseList(
+                self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device
+            )
+        return self
+
+    def __iter__(self):
+        return iter(self.densepose_datas)
+
+    def __len__(self):
+        return len(self.densepose_datas)
+
+    def __repr__(self):
+        return "{}(num_instances={}, image_width={}, image_height={})".format(
+            self.__class__.__name__,
+            len(self.densepose_datas),
+            self.image_size_hw[1],
+            self.image_size_hw[0],
+        )
+
+    def __getitem__(self, item):
+        """
+        An integer index returns the single data entry; a slice, a boolean
+        mask tensor or an iterable of indices returns a new DensePoseList
+        with the selected entries and boxes.
+        """
+        if isinstance(item, int):
+            return self.densepose_datas[item]
+
+        if isinstance(item, slice):
+            selected_datas = self.densepose_datas[item]
+        elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
+            selected_datas = [
+                self.densepose_datas[idx] for idx, keep in enumerate(item) if keep > 0
+            ]
+        else:
+            selected_datas = [self.densepose_datas[idx] for idx in item]
+        selected_boxes = self.boxes_xyxy_abs[item]
+        return DensePoseList(selected_datas, selected_boxes, self.image_size_hw, self.device)
diff --git a/projects/DensePose/densepose/utils/dbhelper.py b/projects/DensePose/densepose/utils/dbhelper.py
new file mode 100644
index 0000000..ef7e316
--- /dev/null
+++ b/projects/DensePose/densepose/utils/dbhelper.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from typing import Any, Dict, List, Optional, Tuple
+
+
+class EntrySelector(object):
+    """
+    Base class for entry selectors
+    """
+
+    @staticmethod
+    def from_string(spec: str) -> "EntrySelector":
+        if spec == "*":
+            return AllEntrySelector()
+        return FieldEntrySelector(spec)
+
+
+class AllEntrySelector(EntrySelector):
+    """
+    Selector that accepts all entries
+    """
+
+    SPECIFIER = "*"
+
+    def __call__(self, entry):
+        return True
+
+
+class FieldEntrySelector(EntrySelector):
+    """
+    Selector that accepts only entries that match provided field
+    specifier(s). Only a limited set of specifiers is supported for now:
+      <specifiers>::=<specifier>[<comma><specifiers>]
+      <specifier>::=<field_name>[<type_delim><type>]<equal><value_or_range>
+      <field_name> is a valid identifier
+      <type> ::= "int" | "str"
+      <equal> ::= "="
+      <comma> ::= ","
+      <type_delim> ::= ":"
+      <value_or_range> ::= <value> | <range>
+      <range> ::= <value><range_delim><value>
+      <range_delim> ::= "-"
+      <value> is a string without spaces and special symbols
+        (e.g. <comma>, <equal>, <type_delim>, <range_delim>)
+    """
+
+    _SPEC_DELIM = ","
+    _TYPE_DELIM = ":"
+    _RANGE_DELIM = "-"
+    _EQUAL = "="
+    _ERROR_PREFIX = "Invalid field selector specifier"
+
+    class _FieldEntryValuePredicate(object):
+        """
+        Predicate that checks strict equality for the specified entry field
+        """
+
+        def __init__(self, name: str, typespec: str, value: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.value = value
+
+        def __call__(self, entry):
+            return entry[self.name] == self.type(self.value)
+
+    class _FieldEntryRangePredicate(object):
+        """
+        Predicate that checks whether an entry field falls into the specified range
+        """
+
+        def __init__(self, name: str, typespec: str, vmin: str, vmax: str):
+            import builtins
+
+            self.name = name
+            self.type = getattr(builtins, typespec) if typespec is not None else str
+            self.vmin = vmin
+            self.vmax = vmax
+
+        def __call__(self, entry):
+            return (entry[self.name] >= self.type(self.vmin)) and (
+                entry[self.name] <= self.type(self.vmax)
+            )
+
+    def __init__(self, spec: str):
+        self._predicates = self._parse_specifier_into_predicates(spec)
+
+    def __call__(self, entry: Dict[str, Any]):
+        # An entry is accepted only when every parsed predicate holds for it;
+        # `all` stops at the first failing predicate, and an empty predicate
+        # list accepts any entry.
+        return all(predicate(entry) for predicate in self._predicates)
+
+    def _parse_specifier_into_predicates(self, spec: str) -> List["_FieldEntryPredicate"]:
+        predicates = []
+        specs = spec.split(self._SPEC_DELIM)
+        for subspec in specs:
+            eq_idx = subspec.find(self._EQUAL)
+            if eq_idx > 0:
+                field_name_with_type = subspec[:eq_idx]
+                field_name, field_type = self._parse_field_name_type(field_name_with_type)
+                field_value_or_range = subspec[eq_idx + 1 :]
+                if self._is_range_spec(field_value_or_range):
+                    vmin, vmax = self._get_range_spec(field_value_or_range)
+                    predicate = FieldEntrySelector._FieldEntryRangePredicate(
+                        field_name, field_type, vmin, vmax
+                    )
+                else:
+                    predicate = FieldEntrySelector._FieldEntryValuePredicate(
+                        field_name, field_type, field_value_or_range
+                    )
+                predicates.append(predicate)
+            elif eq_idx == 0:
+                self._parse_error(f'"{subspec}", field name is empty!')
+            else:
+                self._parse_error(f'"{subspec}", should have format ' "<field>=<value_or_range>!")
+        return predicates
+
+    def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
+        # Split "<field_name>[:<type>]" at the first type delimiter.
+        name, delim, typespec = field_name_with_type.partition(self._TYPE_DELIM)
+        if not delim:
+            # no delimiter present: the whole string is the name, type is unspecified
+            return field_name_with_type, None
+        if not name:
+            # the delimiter is the very first character (raises ValueError)
+            self._parse_error(f'"{field_name_with_type}", field name is empty!')
+        # everything after the first delimiter is the type name (possibly empty)
+        return name, typespec
+
+    def _is_range_spec(self, field_value_or_range):
+        # a delimiter at position 0 (e.g. a leading minus sign) does not make a range
+        return field_value_or_range.find(self._RANGE_DELIM) > 0
+
+    def _get_range_spec(self, field_value_or_range):
+        # Split "<vmin>-<vmax>" at the first range delimiter; both parts are
+        # returned as strings. Raises ValueError if no delimiter is found
+        # past position 0 (same criterion as `_is_range_spec`).
+        vmin, delim, vmax = field_value_or_range.partition(self._RANGE_DELIM)
+        if not (delim and vmin):
+            self._parse_error(f'"{field_value_or_range}", range of values expected!')
+        return vmin, vmax
+
+    def _parse_error(self, msg):
+        raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
diff --git a/projects/DensePose/densepose/utils/logger.py b/projects/DensePose/densepose/utils/logger.py
new file mode 100644
index 0000000..e3fa45e
--- /dev/null
+++ b/projects/DensePose/densepose/utils/logger.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+
+
+def verbosity_to_level(verbosity):
+    # Map a verbosity count to a logging level:
+    #   1 -> INFO, 2 and above -> DEBUG,
+    #   anything else (including None and 0) -> WARNING.
+    if verbosity is None or verbosity == 0:
+        return logging.WARNING
+    if verbosity == 1:
+        return logging.INFO
+    return logging.DEBUG if verbosity >= 2 else logging.WARNING
diff --git a/projects/DensePose/densepose/vis/base.py b/projects/DensePose/densepose/vis/base.py
new file mode 100644
index 0000000..b3f1ae5
--- /dev/null
+++ b/projects/DensePose/densepose/vis/base.py
@@ -0,0 +1,190 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+import cv2
+import torch
+
+Image = np.ndarray
+Boxes = torch.Tensor
+
+
+class MatrixVisualizer(object):
+    """
+    Base visualizer for matrix data
+    """
+
+    def __init__(
+        self,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        val_scale=1.0,
+        alpha=0.7,
+        interp_method_matrix=cv2.INTER_LINEAR,
+        interp_method_mask=cv2.INTER_NEAREST,
+    ):
+        self.inplace = inplace
+        self.cmap = cmap
+        self.val_scale = val_scale
+        self.alpha = alpha
+        self.interp_method_matrix = interp_method_matrix
+        self.interp_method_mask = interp_method_mask
+
+    def visualize(self, image_bgr, mask, matrix, bbox_xywh):
+        self._check_image(image_bgr)
+        self._check_mask_matrix(mask, matrix)
+        if self.inplace:
+            image_target_bgr = image_bgr
+        else:
+            image_target_bgr = image_bgr * 0
+        x, y, w, h = [int(v) for v in bbox_xywh]
+        if w <= 0 or h <= 0:
+            return image_bgr
+        mask, matrix = self._resize(mask, matrix, w, h)
+        mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
+        matrix_scaled = matrix.astype(np.float32) * self.val_scale
+        _EPSILON = 1e-6
+        if np.any(matrix_scaled > 255 + _EPSILON):
+            logger = logging.getLogger(__name__)
+            logger.warning(
+                f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
+            )
+        matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
+        matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
+        matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
+        image_target_bgr[y : y + h, x : x + w, :] = (
+            image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
+        )
+        return image_target_bgr.astype(np.uint8)
+
+    def _resize(self, mask, matrix, w, h):
+        if (w != mask.shape[1]) or (h != mask.shape[0]):  # NB: resize's 3rd positional arg is dst
+            mask = cv2.resize(mask, (w, h), interpolation=self.interp_method_mask)
+        if (w != matrix.shape[1]) or (h != matrix.shape[0]):  # so interpolation goes by keyword
+            matrix = cv2.resize(matrix, (w, h), interpolation=self.interp_method_matrix)
+        return mask, matrix
+
+    def _check_image(self, image_rgb):
+        assert len(image_rgb.shape) == 3
+        assert image_rgb.shape[2] == 3
+        assert image_rgb.dtype == np.uint8
+
+    def _check_mask_matrix(self, mask, matrix):
+        assert len(matrix.shape) == 2
+        assert len(mask.shape) == 2
+        assert mask.dtype == np.uint8
+
+
+class RectangleVisualizer(object):
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color=_COLOR_GREEN, thickness=1):
+        self.color = color
+        self.thickness = thickness
+
+    def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None):
+        x, y, w, h = bbox_xywh
+        color = color or self.color
+        thickness = thickness or self.thickness
+        cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness)
+        return image_bgr
+
+
+class PointsVisualizer(object):
+
+    _COLOR_GREEN = (18, 127, 15)
+
+    def __init__(self, color_bgr=_COLOR_GREEN, r=5):
+        self.color_bgr = color_bgr
+        self.r = r
+
+    def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None):
+        for j, pt_xy in enumerate(pts_xy):
+            x, y = pt_xy
+            color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr
+            r = rs[j] if rs is not None else self.r
+            cv2.circle(image_bgr, (x, y), r, color_bgr, -1)
+        return image_bgr
+
+
+class TextVisualizer(object):
+
+    _COLOR_GRAY = (218, 227, 218)
+    _COLOR_WHITE = (255, 255, 255)
+
+    def __init__(
+        self,
+        font_face=cv2.FONT_HERSHEY_SIMPLEX,
+        font_color_bgr=_COLOR_GRAY,
+        font_scale=0.35,
+        font_line_type=cv2.LINE_AA,
+        font_line_thickness=1,
+        fill_color_bgr=_COLOR_WHITE,
+        fill_color_transparency=1.0,
+        frame_color_bgr=_COLOR_WHITE,
+        frame_color_transparency=1.0,
+        frame_thickness=1,
+    ):
+        self.font_face = font_face
+        self.font_color_bgr = font_color_bgr
+        self.font_scale = font_scale
+        self.font_line_type = font_line_type
+        self.font_line_thickness = font_line_thickness
+        self.fill_color_bgr = fill_color_bgr
+        self.fill_color_transparency = fill_color_transparency
+        self.frame_color_bgr = frame_color_bgr
+        self.frame_color_transparency = frame_color_transparency
+        self.frame_thickness = frame_thickness
+
+    def visualize(self, image_bgr, txt, topleft_xy):
+        txt_w, txt_h = self.get_text_size_wh(txt)
+        topleft_xy = tuple(map(int, topleft_xy))
+        x, y = topleft_xy
+        if self.frame_color_transparency < 1.0:  # blend the frame color into the padded text box
+            t = self.frame_thickness
+            image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = (
+                image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :]
+                * self.frame_color_transparency
+                + np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency)
+            ).astype(float)  # builtin float: the np.float alias was removed in NumPy 1.24
+        if self.fill_color_transparency < 1.0:  # blend the fill color into the text box itself
+            image_bgr[y : y + txt_h, x : x + txt_w, :] = (
+                image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency
+                + np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency)
+            ).astype(float)  # builtin float: the np.float alias was removed in NumPy 1.24
+        cv2.putText(
+            image_bgr,
+            txt,
+            topleft_xy,
+            self.font_face,
+            self.font_scale,
+            self.font_color_bgr,
+            self.font_line_thickness,
+            self.font_line_type,
+        )
+        return image_bgr
+
+    def get_text_size_wh(self, txt):
+        ((txt_w, txt_h), _) = cv2.getTextSize(
+            txt, self.font_face, self.font_scale, self.font_line_thickness
+        )
+        return txt_w, txt_h
+
+
+class CompoundVisualizer(object):
+    def __init__(self, visualizers):
+        self.visualizers = visualizers
+
+    def visualize(self, image_bgr, data):
+        assert len(data) == len(self.visualizers), (
+            "The number of datas {} should match the number of visualizers"
+            " {}".format(len(data), len(self.visualizers))
+        )
+        # chain the visualizers: each one receives the image returned by the previous one
+        for visualizer, visualizer_data in zip(self.visualizers, data):
+            image_bgr = visualizer.visualize(image_bgr, visualizer_data)
+        return image_bgr
+
+    def __str__(self):
+        visualizer_str = ", ".join([str(v) for v in self.visualizers])
+        return "Compound Visualizer [{}]".format(visualizer_str)
diff --git a/projects/DensePose/densepose/vis/bounding_box.py b/projects/DensePose/densepose/vis/bounding_box.py
new file mode 100644
index 0000000..7fc6efd
--- /dev/null
+++ b/projects/DensePose/densepose/vis/bounding_box.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .base import RectangleVisualizer, TextVisualizer
+
+
+class BoundingBoxVisualizer(object):
+    def __init__(self):
+        self.rectangle_visualizer = RectangleVisualizer()
+
+    def visualize(self, image_bgr, boxes_xywh):
+        for bbox_xywh in boxes_xywh:
+            image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh)
+        return image_bgr
+
+
+class ScoredBoundingBoxVisualizer(object):
+    def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None):
+        if bbox_visualizer_params is None:
+            bbox_visualizer_params = {}
+        if score_visualizer_params is None:
+            score_visualizer_params = {}
+        self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params)
+        self.visualizer_score = TextVisualizer(**score_visualizer_params)
+
+    def visualize(self, image_bgr, scored_bboxes):
+        boxes_xywh, box_scores = scored_bboxes
+        assert len(boxes_xywh) == len(box_scores), (
+            "Number of bounding boxes {} should be equal to the number of "
+            "scores {}".format(len(boxes_xywh), len(box_scores))
+        )
+        # draw every box, then print its score at the box's top-left corner
+        for box_xywh, score in zip(boxes_xywh, box_scores):
+            image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh)
+            score_txt = "{0:6.4f}".format(score)
+            topleft_xy = box_xywh[0], box_xywh[1]
+            image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy)
+        return image_bgr
diff --git a/projects/DensePose/densepose/vis/densepose.py b/projects/DensePose/densepose/vis/densepose.py
new file mode 100644
index 0000000..ba561ca
--- /dev/null
+++ b/projects/DensePose/densepose/vis/densepose.py
@@ -0,0 +1,581 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import numpy as np
+from typing import Iterable, Optional, Tuple
+import cv2
+
+from ..structures import DensePoseDataRelative, DensePoseOutput, DensePoseResult
+from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer
+
+
+class DensePoseResultsVisualizer(object):
+    def visualize(self, image_bgr: Image, densepose_result: Optional[DensePoseResult]) -> Image:
+        if densepose_result is None:
+            return image_bgr
+        context = self.create_visualization_context(image_bgr)
+        for i, result_encoded_w_shape in enumerate(densepose_result.results):
+            iuv_arr = DensePoseResult.decode_png_data(*result_encoded_w_shape)
+            bbox_xywh = densepose_result.boxes_xywh[i]
+            self.visualize_iuv_arr(context, iuv_arr, bbox_xywh)
+        image_bgr = self.context_to_image_bgr(context)
+        return image_bgr
+
+
+class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
+    def __init__(
+        self,
+        data_extractor,
+        segm_extractor,
+        inplace=True,
+        cmap=cv2.COLORMAP_PARULA,
+        alpha=0.7,
+        val_scale=1.0,
+    ):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+        )
+        self.data_extractor = data_extractor
+        self.segm_extractor = segm_extractor
+
+    def create_visualization_context(self, image_bgr: Image):
+        return image_bgr
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def get_image_bgr_from_context(self, context):
+        return context
+
+    def visualize_iuv_arr(self, context, iuv_arr, bbox_xywh):
+        image_bgr = self.get_image_bgr_from_context(context)
+        matrix = self.data_extractor(iuv_arr)
+        segm = self.segm_extractor(iuv_arr)
+        mask = np.zeros(matrix.shape, dtype=np.uint8)
+        mask[segm > 0] = 1
+        image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+        return image_bgr
+
+
+def _extract_i_from_iuvarr(iuv_arr):
+    return iuv_arr[0, :, :]
+
+
+def _extract_u_from_iuvarr(iuv_arr):
+    return iuv_arr[1, :, :]
+
+
+def _extract_v_from_iuvarr(iuv_arr):
+    return iuv_arr[2, :, :]
+
+
+class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
+    def __init__(self, levels=10, **kwargs):
+        self.levels = levels
+        self.plot_args = kwargs
+
+    def create_visualization_context(self, image_bgr: Image):
+        import matplotlib.pyplot as plt
+        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+
+        context = {}
+        context["image_bgr"] = image_bgr
+        dpi = 100
+        height_inches = float(image_bgr.shape[0]) / dpi
+        width_inches = float(image_bgr.shape[1]) / dpi
+        fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
+        plt.axes([0, 0, 1, 1])
+        plt.axis("off")
+        context["fig"] = fig
+        canvas = FigureCanvas(fig)
+        context["canvas"] = canvas
+        extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
+        plt.imshow(image_bgr[:, :, ::-1], extent=extent)
+        return context
+
+    def context_to_image_bgr(self, context):
+        fig = context["fig"]
+        w, h = map(int, fig.get_size_inches() * fig.get_dpi())
+        canvas = context["canvas"]
+        canvas.draw()
+        # frombuffer replaces the deprecated binary mode of np.fromstring
+        image_rgb = np.frombuffer(canvas.tostring_rgb(), dtype="uint8").reshape(h, w, 3)
+        # reverse the channel order; copy() yields a writable, contiguous array
+        return image_rgb[:, :, ::-1].copy()
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
+        import matplotlib.pyplot as plt
+
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        extent = (
+            bbox_xywh[0],
+            bbox_xywh[0] + bbox_xywh[2],
+            bbox_xywh[1],
+            bbox_xywh[1] + bbox_xywh[3],
+        )
+        plt.contour(u, self.levels, extent=extent, **self.plot_args)
+        plt.contour(v, self.levels, extent=extent, **self.plot_args)
+
+
+class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer):
+    """
+    Contour visualization using marching squares
+    """
+
+    def __init__(self, levels=10, **kwargs):
+        # TODO: colormap is hardcoded
+        cmap = cv2.COLORMAP_PARULA
+        if isinstance(levels, int):
+            self.levels = np.linspace(0, 1, levels)
+        else:
+            self.levels = levels
+        if "linewidths" in kwargs:
+            self.linewidths = kwargs["linewidths"]
+        else:
+            self.linewidths = [1] * len(self.levels)
+        self.plot_args = kwargs
+        img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap)
+        self.level_colors_bgr = [
+            [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+        ]
+
+    def create_visualization_context(self, image_bgr: Image):
+        return image_bgr
+
+    def context_to_image_bgr(self, context):
+        return context
+
+    def get_image_bgr_from_context(self, context):
+        return context
+
+    def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
+        image_bgr = self.get_image_bgr_from_context(context)
+        segm = _extract_i_from_iuvarr(iuv_arr)
+        u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+        v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+        self._contours(image_bgr, u, segm, bbox_xywh)
+        self._contours(image_bgr, v, segm, bbox_xywh)
+
+    def _contours(self, image_bgr, arr, segm, bbox_xywh):
+        for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+            mask = segm == part_idx
+            if not np.any(mask):
+                continue
+            arr_min = np.amin(arr[mask])
+            arr_max = np.amax(arr[mask])
+            I, J = np.nonzero(mask)
+            i0 = np.amin(I)
+            i1 = np.amax(I) + 1
+            j0 = np.amin(J)
+            j1 = np.amax(J) + 1
+            if (j1 == j0 + 1) or (i1 == i0 + 1):
+                continue
+            Nw = arr.shape[1] - 1
+            Nh = arr.shape[0] - 1
+            for level_idx, level in enumerate(self.levels):
+                if (level < arr_min) or (level > arr_max):
+                    continue
+                vp = arr[i0:i1, j0:j1] >= level
+                bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8
+                mp = mask[i0:i1, j0:j1]
+                bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8
+                it = np.nditer(bin_codes, flags=["multi_index"])
+                color_bgr = self.level_colors_bgr[level_idx]
+                linewidth = self.linewidths[level_idx]
+                while not it.finished:
+                    if (it[0] != 0) and (it[0] != 15):
+                        i, j = it.multi_index
+                        if bin_mask_codes[i, j] != 0:
+                            self._draw_line(
+                                image_bgr,
+                                arr,
+                                mask,
+                                level,
+                                color_bgr,
+                                linewidth,
+                                it[0],
+                                it.multi_index,
+                                bbox_xywh,
+                                Nw,
+                                Nh,
+                                (i0, j0),
+                            )
+                    it.iternext()
+
+    def _draw_line(
+        self,
+        image_bgr,
+        arr,
+        mask,
+        v,
+        color_bgr,
+        linewidth,
+        bin_code,
+        multi_idx,
+        bbox_xywh,
+        Nw,
+        Nh,
+        offset,
+    ):
+        lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset)
+        x0, y0, w, h = bbox_xywh
+        x1 = x0 + w
+        y1 = y0 + h
+        for line in lines:
+            x0r, y0r = line[0]
+            x1r, y1r = line[1]
+            pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0)))
+            pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0)))
+            cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth)
+
+    def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
+        i0, j0 = offset
+        i, j = multi_idx
+        i += i0
+        j += j0
+        v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1]
+        x0i = float(j) / Nw
+        y0j = float(i) / Nh
+        He = 1.0 / Nh
+        We = 1.0 / Nw
+        if (bin_code == 1) or (bin_code == 14):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v0) / (v3 - v0)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j)
+            return [(pt1, pt2)]
+        elif (bin_code == 2) or (bin_code == 13):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 3) or (bin_code == 12):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v1) / (v2 - v1)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + b * We, y0j + He)
+            return [(pt1, pt2)]
+        elif (bin_code == 4) or (bin_code == 11):
+            a = (v - v1) / (v2 - v1)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j + He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 6) or (bin_code == 9):
+            a = (v - v0) / (v1 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i, y0j + a * He)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif (bin_code == 7) or (bin_code == 8):
+            a = (v - v0) / (v3 - v0)
+            b = (v - v3) / (v2 - v3)
+            pt1 = (x0i + a * We, y0j)
+            pt2 = (x0i + We, y0j + b * He)
+            return [(pt1, pt2)]
+        elif bin_code == 5:
+            a1 = (v - v0) / (v1 - v0)
+            b1 = (v - v1) / (v2 - v1)
+            pt11 = (x0i, y0j + a1 * He)
+            pt12 = (x0i + b1 * We, y0j + He)
+            a2 = (v - v0) / (v3 - v0)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        elif bin_code == 10:
+            a1 = (v - v0) / (v3 - v0)
+            b1 = (v - v0) / (v1 - v0)
+            pt11 = (x0i + a1 * We, y0j)
+            pt12 = (x0i, y0j + b1 * He)
+            a2 = (v - v1) / (v2 - v1)
+            b2 = (v - v3) / (v2 - v3)
+            pt21 = (x0i + a2 * We, y0j + He)
+            pt22 = (x0i + We, y0j + b2 * He)
+            return [(pt11, pt12), (pt21, pt22)]
+        return []
+
+
+try:
+    import matplotlib
+
+    matplotlib.use("Agg")
+    DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer
+except ModuleNotFoundError:
+    logger = logging.getLogger(__name__)
+    logger.warning("Could not import matplotlib, using custom contour visualizer")
+    DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer
+
+
+class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsFineSegmentationVisualizer, self).__init__(
+            _extract_i_from_iuvarr,
+            _extract_i_from_iuvarr,
+            inplace,
+            cmap,
+            alpha,
+            val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+        )
+
+
+class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsUVisualizer, self).__init__(
+            _extract_u_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
+        )
+
+
+class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        super(DensePoseResultsVVisualizer, self).__init__(
+            _extract_v_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
+        )
+
+
+class DensePoseOutputsFineSegmentationVisualizer(object):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace,
+            cmap=cmap,
+            val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+            alpha=alpha,
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        assert N == I.size(0), (
+            "densepose outputs S {} and I {}"
+            " should have equal first dim size".format(S.size(), I.size())
+        )
+        assert N == U.size(0), (
+            "densepose outputs S {} and U {}"
+            " should have equal first dim size".format(S.size(), U.size())
+        )
+        assert N == V.size(0), (
+            "densepose outputs S {} and V {}"
+            " should have equal first dim size".format(S.size(), V.size())
+        )
+        assert N == len(bboxes_xywh), (
+            "number of bounding boxes {}"
+            " should be equal to first dim size of outputs {}".format(len(bboxes_xywh), N)
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()
+            matrix = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(matrix.shape, dtype=np.uint8)
+            mask[matrix > 0] = 1
+            bbox_xywh = bboxes_xywh[n]
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseOutputsUVisualizer(object):
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        assert isinstance(
+            densepose_output, DensePoseOutput
+        ), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        assert N == I.size(0), (
+            "densepose outputs S {} and I {}"
+            " should have equal first dim size".format(S.size(), I.size())
+        )
+        assert N == U.size(0), (
+            "densepose outputs S {} and U {}"
+            " should have equal first dim size".format(S.size(), U.size())
+        )
+        assert N == V.size(0), (
+            "densepose outputs S {} and V {}"
+            " should have equal first dim size".format(S.size(), V.size())
+        )
+        assert N == len(bboxes_xywh), (
+            "number of bounding boxes {}"
+            " should be equal to first dim size of outputs {}".format(len(bboxes_xywh), N)
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()  # zero the part index where S argmax is 0
+            segmentation = In.cpu().numpy().astype(np.uint8)
+            mask = np.zeros(segmentation.shape, dtype=np.uint8)
+            mask[segmentation > 0] = 1
+            Un = U[n].cpu().numpy().astype(np.float32)
+            Uvis = np.zeros(segmentation.shape, dtype=np.float32)
+            for partId in range(Un.shape[0]):  # take each pixel's U from its own part channel
+                Uvis[segmentation == partId] = Un[partId][segmentation == partId].clip(0, 1) * 255
+            bbox_xywh = bboxes_xywh[n]  # looked up once per instance, outside the part loop
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Uvis, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseOutputsVVisualizer(object):
+    """Draws the V values of DensePose outputs through a MatrixVisualizer."""
+
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+        )
+
+    def visualize(
+        self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
+    ) -> Image:
+        """
+        Draw the V values of every detection onto `image_bgr` via `self.mask_visualizer`.
+
+        Returns `image_bgr` untouched when `dp_output_with_bboxes` is None; raises
+        AssertionError when the S / I / U / V / box counts disagree.
+        """
+        if dp_output_with_bboxes is None:
+            return image_bgr
+        densepose_output, bboxes_xywh = dp_output_with_bboxes
+        assert isinstance(
+            densepose_output, DensePoseOutput
+        ), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
+        S = densepose_output.S
+        I = densepose_output.I  # noqa
+        U = densepose_output.U
+        V = densepose_output.V
+        N = S.size(0)
+        for name, tensor in (("I", I), ("U", U), ("V", V)):
+            assert N == tensor.size(0), (
+                "densepose outputs S {} and {} {}"
+                " should have equal first dim size".format(S.size(), name, tensor.size())
+            )
+        assert N == len(bboxes_xywh), (
+            "number of bounding boxes {}"
+            " should be equal to first dim size of outputs {}".format(len(bboxes_xywh), N)
+        )
+        for n in range(N):
+            Sn = S[n].argmax(dim=0)
+            In = I[n].argmax(dim=0) * (Sn > 0).long()  # label 0 wherever the S argmax is 0
+            segmentation = In.cpu().numpy().astype(np.uint8)
+            mask = (segmentation > 0).astype(np.uint8)
+            Vn = V[n].cpu().numpy().astype(np.float32)
+            Vvis = np.zeros(segmentation.shape, dtype=np.float32)
+            for partId in range(Vn.shape[0]):  # numpy array: `.size` is an int, not callable
+                Vvis[segmentation == partId] = Vn[partId][segmentation == partId].clip(0, 1) * 255
+            bbox_xywh = bboxes_xywh[n]
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Vvis, bbox_xywh)
+        return image_bgr
+
+
+class DensePoseDataCoarseSegmentationVisualizer(object):
+    """
+    Visualizer for ground truth segmentation (the `segm` field of each DensePose data entry)
+    """
+
+    def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
+        self.mask_visualizer = MatrixVisualizer(
+            inplace=inplace,
+            cmap=cmap,
+            val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS,
+            alpha=alpha,
+        )
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:  # nothing to draw: hand the image back as is
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):  # pair box with its data
+            matrix = densepose_data.segm.numpy()
+            mask = np.zeros(matrix.shape, dtype=np.uint8)
+            mask[matrix > 0] = 1  # 1 wherever the segmentation value is positive
+            image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy())
+        return image_bgr
+
+
+class DensePoseDataPointsVisualizer(object):  # draws DensePose data points onto an image
+    def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA):
+        self.points_visualizer = PointsVisualizer()
+        self.densepose_data_to_value_fn = densepose_data_to_value_fn  # None => no color list
+        self.cmap = cmap
+
+    def visualize(
+        self,
+        image_bgr: Image,
+        bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+    ) -> Image:
+        if bbox_densepose_datas is None:  # nothing to draw: hand the image back as is
+            return image_bgr
+        for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+            x0, y0, w, h = bbox_xywh.numpy()
+            x = densepose_data.x.numpy() * w / 255.0 + x0  # scale by w / 255, shift by x0
+            y = densepose_data.y.numpy() * h / 255.0 + y0  # scale by h / 255, shift by y0
+            pts_xy = zip(x, y)  # one-shot iterator; exactly one branch below consumes it
+            if self.densepose_data_to_value_fn is None:
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
+            else:
+                v = self.densepose_data_to_value_fn(densepose_data)
+                img_colors_bgr = cv2.applyColorMap(v, self.cmap)  # values -> colormap colors
+                colors_bgr = [
+                    [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+                ]
+                image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
+        return image_bgr
+
+
+def _densepose_data_u_for_cmap(densepose_data):  # `u` as uint8 input for a colormap
+    u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0  # clamp to [0, 1], stretch to [0, 255]
+    return u.astype(np.uint8)
+
+
+def _densepose_data_v_for_cmap(densepose_data):  # `v` as uint8 input for a colormap
+    v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0  # clamp to [0, 1], stretch to [0, 255]
+    return v.astype(np.uint8)
+
+
+def _densepose_data_i_for_cmap(densepose_data):  # `i` as uint8 input for a colormap
+    i = (
+        np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS)
+        * 255.0  # together with the division below: [0, N_PART_LABELS] -> [0, 255]
+        / DensePoseDataRelative.N_PART_LABELS
+    )
+    return i.astype(np.uint8)
+
+
+class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        # Point colors come from the U values; the colormap stays at the base-class default.
+        value_fn = _densepose_data_u_for_cmap
+        super().__init__(densepose_data_to_value_fn=value_fn)
+
+
+class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        # Point colors come from the V values; the colormap stays at the base-class default.
+        value_fn = _densepose_data_v_for_cmap
+        super().__init__(densepose_data_to_value_fn=value_fn)
+
+
+class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer):
+    def __init__(self):
+        # Point colors come from the I values; the colormap stays at the base-class default.
+        value_fn = _densepose_data_i_for_cmap
+        super().__init__(densepose_data_to_value_fn=value_fn)
diff --git a/projects/DensePose/densepose/vis/extractor.py b/projects/DensePose/densepose/vis/extractor.py
new file mode 100644
index 0000000..b715a44
--- /dev/null
+++ b/projects/DensePose/densepose/vis/extractor.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+from typing import Sequence
+import torch
+
+from detectron2.layers.nms import batched_nms
+from detectron2.structures.instances import Instances
+
+from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer
+from densepose.vis.densepose import DensePoseResultsVisualizer
+
+from .base import CompoundVisualizer
+
+Scores = Sequence[float]
+
+
+def extract_scores_from_instances(instances: Instances, select=None):
+    if not instances.has("scores"):  # no "scores" field: nothing to extract
+        return None
+    return instances.scores if select is None else instances.scores[select]
+
+
+def extract_boxes_xywh_from_instances(instances: Instances, select=None):
+    if not instances.has("pred_boxes"):  # no "pred_boxes" field: nothing to extract
+        return None
+    xywh = instances.pred_boxes.tensor.clone()  # clone so pred_boxes itself is not modified
+    xywh[:, 2] -= xywh[:, 0]  # column 2 minus column 0 -> width
+    xywh[:, 3] -= xywh[:, 1]  # column 3 minus column 1 -> height
+    return xywh if select is None else xywh[select]
+
+
+def create_extractor(visualizer: object):
+    """
+    Create an extractor for the provided visualizer.
+    Logs an error and returns None when the visualizer type is not handled.
+    """
+    if isinstance(visualizer, CompoundVisualizer):
+        return CompoundExtractor([create_extractor(v) for v in visualizer.visualizers])
+    if isinstance(visualizer, DensePoseResultsVisualizer):
+        return DensePoseResultExtractor()
+    if isinstance(visualizer, ScoredBoundingBoxVisualizer):
+        return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances])
+    if isinstance(visualizer, BoundingBoxVisualizer):
+        return extract_boxes_xywh_from_instances
+    # None of the known visualizer types matched.
+    logger = logging.getLogger(__name__)
+    logger.error(f"Could not create extractor for {visualizer}")
+    return None
+
+
+class BoundingBoxExtractor(object):
+    """
+    Extracts bounding boxes (XYWH) from instances, optionally restricted by `select`
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        # `select` mirrors the other extractors, so CompoundExtractor can call this one too.
+        return extract_boxes_xywh_from_instances(instances, select)
+
+
+class ScoredBoundingBoxExtractor(object):
+    """
+    Extracts bounding boxes and scores from instances as a (boxes_xywh, scores) tuple
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if (scores is None) or (boxes_xywh is None):  # a part is missing: skip the selection
+            return (boxes_xywh, scores)
+        if select is not None:
+            scores = scores[select]
+            boxes_xywh = boxes_xywh[select]
+        return (boxes_xywh, scores)
+
+
+class DensePoseResultExtractor(object):
+    """
+    Extracts DensePose result from instances (None without boxes or `pred_densepose`)
+    """
+
+    def __call__(self, instances: Instances, select=None):
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if not instances.has("pred_densepose") or (boxes_xywh is None):
+            return None
+        # The same selection is applied to the outputs and to their boxes before conversion.
+        densepose = instances.pred_densepose
+        if select is not None:
+            densepose = densepose[select]
+            boxes_xywh = boxes_xywh[select]
+        return densepose.to_result(boxes_xywh)
+
+
+class CompoundExtractor(object):
+    """
+    Extracts data for CompoundVisualizer
+
+    Calls every wrapped extractor with the same (instances, select) pair and
+    returns the results as a list, in extractor order.
+    """
+
+    def __init__(self, extractors):
+        self.extractors = extractors
+
+    def __call__(self, instances: Instances, select=None):
+        # `select` is passed positionally, exactly as the wrapped callables are invoked today.
+        return [extractor(instances, select) for extractor in self.extractors]
+
+
+class NmsFilteredExtractor(object):
+    """
+    Extracts data for NmsFilteredVisualizer: applies NMS, then calls the wrapped extractor
+    """
+
+    def __init__(self, extractor, iou_threshold):
+        self.extractor = extractor
+        self.iou_threshold = iou_threshold
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        boxes_xywh = extract_boxes_xywh_from_instances(instances)
+        if (scores is None) or (boxes_xywh is None):  # NMS needs both boxes and scores
+            return None
+        select_local_idx = batched_nms(
+            instances.pred_boxes.tensor,  # corner-format boxes; XYWH would distort the IoU
+            scores,
+            torch.zeros(len(scores), dtype=torch.int32),
+            iou_threshold=self.iou_threshold,
+        ).squeeze()
+        select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device)
+        select_local[select_local_idx] = True
+        select = select_local if select is None else (select & select_local)
+        return self.extractor(instances, select=select)
+
+
+class ScoreThresholdedExtractor(object):
+    """
+    Extracts data in the format accepted by ScoreThresholdedVisualizer
+    """
+
+    def __init__(self, extractor, min_score):
+        self.extractor = extractor
+        self.min_score = min_score
+
+    def __call__(self, instances: Instances, select=None):
+        scores = extract_scores_from_instances(instances)
+        if scores is None:
+            return None
+        # Keep entries scoring strictly above min_score, combined with any incoming selection.
+        above_min = scores > self.min_score
+        combined = above_min if select is None else (select & above_min)
+        return self.extractor(instances, select=combined)
diff --git a/projects/DensePose/dev/README.md b/projects/DensePose/dev/README.md
new file mode 100644
index 0000000..e3a94b6
--- /dev/null
+++ b/projects/DensePose/dev/README.md
@@ -0,0 +1,7 @@
+
+## Some scripts for developers to use, including:
+
+- `run_instant_tests.sh`: run training for a few iterations.
+- `run_inference_tests.sh`: run inference on a small dataset.
+- `../../dev/linter.sh`: lint the codebase before commit
+- `../../dev/parse_results.sh`: parse results from log file.
diff --git a/projects/DensePose/dev/run_inference_tests.sh b/projects/DensePose/dev/run_inference_tests.sh
new file mode 100644
index 0000000..670de7f
--- /dev/null
+++ b/projects/DensePose/dev/run_inference_tests.sh
@@ -0,0 +1,30 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Runs `train_net.py --eval-only` for each config, removing the output dir after each run.
+BIN="python train_net.py"
+OUTPUT="inference_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "${@:1}" )
+# No configs on the command line: fall back to the quick-schedule inference tests.
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN \
+      --eval-only \
+      --num-gpus "$NUM_GPUS" \
+      --config-file "$cfg" \
+      OUTPUT_DIR "$OUTPUT"
+    rm -rf "$OUTPUT"
+done
+
diff --git a/projects/DensePose/dev/run_instant_tests.sh b/projects/DensePose/dev/run_instant_tests.sh
new file mode 100644
index 0000000..d44f4df
--- /dev/null
+++ b/projects/DensePose/dev/run_instant_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -e
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Runs `train_net.py` once per config and removes the output directory after each run.
+BIN="python train_net.py"
+OUTPUT="instant_test_output"
+NUM_GPUS=2
+
+CFG_LIST=( "${@:1}" )  # configs passed on the command line, if any
+if [ ${#CFG_LIST[@]} -eq 0 ]; then
+  CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml )
+fi
+
+echo "========================================================================"
+echo "Configs to run:"
+echo "${CFG_LIST[@]}"
+echo "========================================================================"
+
+for cfg in "${CFG_LIST[@]}"; do
+    echo "========================================================================"
+    echo "Running $cfg ..."
+    echo "========================================================================"
+    $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \
+      SOLVER.IMS_PER_BATCH $(($NUM_GPUS * 2)) \
+      OUTPUT_DIR "$OUTPUT"
+    rm -rf "$OUTPUT"
+done
+
diff --git a/projects/DensePose/doc/GETTING_STARTED.md b/projects/DensePose/doc/GETTING_STARTED.md
new file mode 100644
index 0000000..2a6c23c
--- /dev/null
+++ b/projects/DensePose/doc/GETTING_STARTED.md
@@ -0,0 +1,48 @@
+# Getting Started with DensePose
+
+## Inference with Pre-trained Models
+
+1. Pick a model and its config file from [Model Zoo](MODEL_ZOO.md), for example [densepose_rcnn_R_50_FPN_s1x.yaml](../configs/densepose_rcnn_R_50_FPN_s1x.yaml)
+2. Run the [Apply Net](TOOL_APPLY_NET.md) tool to visualize the results or save them to disk. For example, to use contour visualization for DensePose, one can run:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml densepose_rcnn_R_50_FPN_s1x.pkl image.jpg dp_contour,bbox --output image_densepose_contour.png
+```
+Please see [Apply Net](TOOL_APPLY_NET.md) for more details on the tool.
+
+## Training
+
+To train a model one can use the [train_net.py](../train_net.py) script.
+This script was used to train all DensePose models in [Model Zoo](MODEL_ZOO.md).
+For example, to launch end-to-end DensePose-RCNN training with ResNet-50 FPN backbone
+on 8 GPUs following the s1x schedule, one can run
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8
+```
+The configs are made for 8-GPU training. To train on 1 GPU, one can apply the
+[linear learning rate scaling rule](https://arxiv.org/abs/1706.02677):
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
+    SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
+```
+
+## Evaluation
+
+Model testing can be done in the same way as training, except for an additional flag `--eval-only` and
+model location specification through `MODEL.WEIGHTS model.pth` in the command line
+```bash
+python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
+    --eval-only MODEL.WEIGHTS model.pth
+```
+
+## Tools
+
+We provide tools which allow one to:
+ - easily view DensePose annotated data in a dataset;
+ - perform DensePose inference on a set of images;
+ - visualize DensePose model results;
+
+`query_db` is a tool to print or visualize DensePose data in a dataset.
+Please refer to [Query DB](TOOL_QUERY_DB.md) for more details on this tool.
+
+`apply_net` is a tool to print or visualize DensePose results.
+Please refer to [Apply Net](TOOL_APPLY_NET.md) for more details on this tool.
diff --git a/projects/DensePose/doc/MODEL_ZOO.md b/projects/DensePose/doc/MODEL_ZOO.md
new file mode 100644
index 0000000..723ed34
--- /dev/null
+++ b/projects/DensePose/doc/MODEL_ZOO.md
@@ -0,0 +1,66 @@
+# Model Zoo and Baselines
+
+## Introduction
+
+We provide baselines trained with Detectron2 DensePose. The corresponding
+configuration files can be found in the [configs](../configs) directory.
+All models were trained on COCO `train2014` + `valminusminival2014` and
+evaluated on COCO `minival2014`. For the details on common settings in which
+baselines were trained, please check [Detectron 2 Model Zoo](../../../MODEL_ZOO.md).
+
+## License
+
+All models available for download through this document are licensed under the
+[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/)
+
+## COCO DensePose Baselines with DensePose-RCNN
+
+<table><tbody>
+<!-- START TABLE -->
+<!-- TABLE HEADER -->
+<th valign="bottom">Name</th>
+<th valign="bottom">lr<br/>sched</th>
+<th valign="bottom">train<br/>time<br/>(s/iter)</th>
+<th valign="bottom">inference<br/>time<br/>(s/im)</th>
+<th valign="bottom">train<br/>mem<br/>(GB)</th>
+<th valign="bottom">box<br/>AP</th>
+<th valign="bottom">dp.<br/>AP</th>
+<th valign="bottom">model id</th>
+<th valign="bottom">download</th>
+<!-- TABLE BODY --> 
+<!-- ROW: densepose_rcnn_R_50_FPN_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_s1x.yaml">R_50_FPN_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.281</td>
+<td align="center">0.064</td>
+<td align="center">3.2</td>
+<td align="center">57.8</td>
+<td align="center">49.8</td>
+<td align="center">143908701</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/143908701/model_final_dd99d2.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/143908701/metrics.json">metrics</a></td>
+</tr>
+<!-- ROW: densepose_rcnn_R_101_FPN_s1x --> 
+ <tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_s1x.yaml">R_101_FPN_s1x</a></td>
+<td align="center">s1x</td>
+<td align="center">0.365</td>
+<td align="center">0.076</td>
+<td align="center">4.3</td>
+<td align="center">59.5</td>
+<td align="center">51.1</td>
+<td align="center">143908726</td>
+<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/143908726/model_final_ad63b5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/143908726/metrics.json">metrics</a></td>
+</tr>
+</tbody></table>
+
+## Old Baselines
+
+It is still possible to use some baselines from [DensePose 1](https://github.com/facebookresearch/DensePose).
+Below are evaluation metrics for the baselines recomputed in the current framework:
+
+| Model | bbox AP | AP  |  AP50 | AP75  | APm  |APl |
+|-----|-----|-----|---    |---    |---   |--- |
+| [`ResNet50_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet50_FPN_s1x-e2e.pkl) | 54.673 | 48.894 | 84.963 | 50.717 | 43.132 | 50.433 |
+| [`ResNet101_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet101_FPN_s1x-e2e.pkl) | 56.032 | 51.088 | 86.250 | 55.057 | 46.542 | 52.563 |
+
+Note: these scores are close, but not strictly equal to the ones reported in the [DensePose 1 Model Zoo](https://github.com/facebookresearch/DensePose/blob/master/MODEL_ZOO.md),
+which is due to small incompatibilities between the frameworks.
diff --git a/projects/DensePose/doc/TOOL_APPLY_NET.md b/projects/DensePose/doc/TOOL_APPLY_NET.md
new file mode 100644
index 0000000..48dd02c
--- /dev/null
+++ b/projects/DensePose/doc/TOOL_APPLY_NET.md
@@ -0,0 +1,92 @@
+# Apply Net
+
+`apply_net` is a tool to print or visualize DensePose results on a set of images.
+It has two modes: `dump` to save DensePose model results to a pickle file
+and `show` to visualize them on images.
+
+## Dump Mode
+
+The general command form is:
+```bash
+python apply_net.py dump [-h] [-v] [--output <dump_file>] <config> <model> <input>
+```
+
+There are three mandatory arguments:
+ - `<config>`, configuration file for a given model;
+ - `<model>`, model file with trained parameters
+ - `<input>`, input image file name, pattern or folder
+
+One can additionally provide `--output` argument to define the output file name,
+which defaults to `output.pkl`.
+
+
+Examples:
+
+1. Dump results of a DensePose model with ResNet-50 FPN backbone for images
+   in a folder `images` to file `dump.pkl`:
+```bash
+python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl images --output dump.pkl -v
+```
+
+2. Dump results of a DensePose model with ResNet-50 FPN backbone for images
+   with file name matching a pattern `image*.jpg` to file `results.pkl`:
+```bash
+python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl "image*.jpg" --output results.pkl -v
+```
+
+
+## Visualization Mode
+
+The general command form is:
+```bash
+python apply_net.py show [-h] [-v] [--min_score <score>] [--nms_thresh <threshold>] [--output <image_file>] <config> <model> <input> <visualizations>
+```
+
+There are four mandatory arguments:
+ - `<config>`, configuration file for a given model;
+ - `<model>`, model file with trained parameters
+ - `<input>`, input image file name, pattern or folder
+ - `<visualizations>`, visualizations specifier; currently available visualizations are:
+   * `bbox` - bounding boxes of detected persons;
+   * `dp_segm` - segmentation masks for detected persons;
+   * `dp_u` - each body part is colored according to the estimated values of the
+     U coordinate in part parameterization;
+   * `dp_v` - each body part is colored according to the estimated values of the
+     V coordinate in part parameterization;
+   * `dp_contour` - plots contours with color-coded U and V coordinates
+
+
+One can additionally provide the following optional arguments:
+ - `--min_score` to only show detections with sufficient scores that are not lower than provided value
+ - `--nms_thresh` to additionally apply non-maximum suppression to detections at a given threshold
+ - `--output` to define visualization file name template, which defaults to `output.png`.
+   To distinguish output file names for different images, the tool appends 1-based entry index,
+   e.g. output.0001.png, output.0002.png, etc...
+
+
+The following examples show how to output results of a DensePose model
+with ResNet-50 FPN backbone using different visualizations for image `image.jpg`:
+
+1. Show bounding box and segmentation:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_segm -v
+```
+![Bounding Box + Segmentation Visualization](images/res_bbox_dp_segm.jpg)
+
+2. Show bounding box and estimated U coordinates for body parts:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_u -v
+```
+![Bounding Box + U Coordinate Visualization](images/res_bbox_dp_u.jpg)
+
+3. Show bounding box and estimated V coordinates for body parts:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_v -v
+```
+![Bounding Box + V Coordinate Visualization](images/res_bbox_dp_v.jpg)
+
+4. Show bounding box and estimated U and V coordinates via contour plots:
+```bash
+python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg dp_contour,bbox -v
+```
+![Bounding Box + Contour Visualization](images/res_bbox_dp_contour.jpg)
diff --git a/projects/DensePose/doc/TOOL_QUERY_DB.md b/projects/DensePose/doc/TOOL_QUERY_DB.md
new file mode 100644
index 0000000..05c6829
--- /dev/null
+++ b/projects/DensePose/doc/TOOL_QUERY_DB.md
@@ -0,0 +1,105 @@
+
+# Query Dataset
+
+`query_db` is a tool to print or visualize DensePose data from a dataset.
+It has two modes: `print` and `show` to output dataset entries to standard
+output or to visualize them on images.
+
+## Print Mode
+
+The general command form is:
+```bash
+python query_db.py print [-h] [-v] [--max-entries N] <dataset> <selector>
+```
+
+There are two mandatory arguments:
+ - `<dataset>`, DensePose dataset specification, from which to select
+   the entries (e.g. `densepose_coco_2014_train`).
+ - `<selector>`, dataset entry selector which can be a single specification,
+   or a comma-separated list of specifications of the form
+   `field[:type]=value` for exact match with the value
+   or `field[:type]=min-max` for a range of values
+
+One can additionally limit the maximum number of entries to output
+by providing `--max-entries` argument.
+
+Examples:
+
+1. Output at most 10 first entries from the `densepose_coco_2014_train` dataset:
+```bash
+python query_db.py print densepose_coco_2014_train \* --max-entries 10 -v
+```
+
+2. Output all entries with `file_name` equal to `COCO_train2014_000000000036.jpg`: 
+```bash
+python query_db.py print densepose_coco_2014_train file_name=COCO_train2014_000000000036.jpg -v
+```
+
+3. Output all entries with `image_id` between 36 and 156:
+```bash
+python query_db.py print densepose_coco_2014_train image_id:int=36-156 -v
+```
+
+## Visualization Mode
+
+The general command form is:
+```bash
+python query_db.py show [-h] [-v] [--max-entries N] [--output <image_file>] <dataset> <selector> <visualizations>
+```
+
+There are three mandatory arguments:
+ - `<dataset>`, DensePose dataset specification, from which to select
+   the entries (e.g. `densepose_coco_2014_train`).
+ - `<selector>`, dataset entry selector which can be a single specification,
+   or a comma-separated list of specifications of the form
+   `field[:type]=value` for exact match with the value
+   or `field[:type]=min-max` for a range of values
+ - `<visualizations>`, visualizations specifier; currently available visualizations are:
+   * `bbox` - bounding boxes of annotated persons;
+   * `dp_i` - annotated points colored according to the containing part;
+   * `dp_pts` - annotated points in green color;
+   * `dp_segm` - segmentation masks for annotated persons;
+   * `dp_u` - annotated points colored according to their U coordinate in part parameterization;
+   * `dp_v` - annotated points colored according to their V coordinate in part parameterization;
+
+One can additionally provide one of the two optional arguments:
+ - `--max-entries` to limit the maximum number of entries to visualize
+ - `--output` to provide visualization file name template, which defaults
+   to `output.png`. To distinguish file names for different dataset
+   entries, the tool appends 1-based entry index to the output file name,
+   e.g. output.0001.png, output.0002.png, etc.
+
+The following examples show how to output different visualizations for image with `id = 322`
+from `densepose_coco_2014_train` dataset:
+
+1. Show bounding box and segmentation:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v
+```
+![Bounding Box + Segmentation Visualization](images/vis_bbox_dp_segm.jpg)
+
+2. Show bounding box and points colored according to the containing part:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_i -v
+```
+![Bounding Box + Point Label Visualization](images/vis_bbox_dp_i.jpg)
+
+3. Show bounding box and annotated points in green color:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_pts -v
+```
+![Bounding Box + Point Visualization](images/vis_bbox_dp_pts.jpg)
+
+4. Show bounding box and annotated points colored according to their U coordinate in part parameterization:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_u -v
+```
+![Bounding Box + Point U Visualization](images/vis_bbox_dp_u.jpg)
+
+5. Show bounding box and annotated points colored according to their V coordinate in part parameterization:
+```bash
+python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_v -v
+```
+![Bounding Box + Point V Visualization](images/vis_bbox_dp_v.jpg)
+
+
diff --git a/projects/DensePose/doc/images/res_bbox_dp_contour.jpg b/projects/DensePose/doc/images/res_bbox_dp_contour.jpg
new file mode 100644
index 0000000..8f0c195
Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_contour.jpg differ
diff --git a/projects/DensePose/doc/images/res_bbox_dp_segm.jpg b/projects/DensePose/doc/images/res_bbox_dp_segm.jpg
new file mode 100644
index 0000000..855fb7f
Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_segm.jpg differ
diff --git a/projects/DensePose/doc/images/res_bbox_dp_u.jpg b/projects/DensePose/doc/images/res_bbox_dp_u.jpg
new file mode 100644
index 0000000..fd4e77b
Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_u.jpg differ
diff --git a/projects/DensePose/doc/images/res_bbox_dp_v.jpg b/projects/DensePose/doc/images/res_bbox_dp_v.jpg
new file mode 100644
index 0000000..09a8197
Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_v.jpg differ
diff --git a/projects/DensePose/doc/images/vis_bbox_dp_i.jpg b/projects/DensePose/doc/images/vis_bbox_dp_i.jpg
new file mode 100644
index 0000000..113dd84
Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_i.jpg differ
diff --git a/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg b/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg
new file mode 100644
index 0000000..1a81dae
Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg differ
diff --git a/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg b/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg
new file mode 100644
index 0000000..b17f831
Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg differ
diff --git a/projects/DensePose/doc/images/vis_bbox_dp_u.jpg b/projects/DensePose/doc/images/vis_bbox_dp_u.jpg
new file mode 100644
index 0000000..e21be74
Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_u.jpg differ
diff --git a/projects/DensePose/doc/images/vis_bbox_dp_v.jpg b/projects/DensePose/doc/images/vis_bbox_dp_v.jpg
new file mode 100644
index 0000000..7bcab2c
Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_v.jpg differ
diff --git a/projects/DensePose/query_db.py b/projects/DensePose/query_db.py
new file mode 100644
index 0000000..d604e07
--- /dev/null
+++ b/projects/DensePose/query_db.py
@@ -0,0 +1,247 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import logging
+import os
+import sys
+from timeit import default_timer as timer
+from typing import Any, ClassVar, Dict, List
+import torch
+
+from detectron2.data.catalog import DatasetCatalog
+from detectron2.utils.logger import setup_logger
+
+from densepose.structures import DensePoseDataRelative
+from densepose.utils.dbhelper import EntrySelector
+from densepose.utils.logger import verbosity_to_level
+from densepose.vis.base import CompoundVisualizer
+from densepose.vis.bounding_box import BoundingBoxVisualizer
+from densepose.vis.densepose import (
+    DensePoseDataCoarseSegmentationVisualizer,
+    DensePoseDataPointsIVisualizer,
+    DensePoseDataPointsUVisualizer,
+    DensePoseDataPointsVisualizer,
+    DensePoseDataPointsVVisualizer,
+)
+
+DOC = """Query DB - a tool to print / visualize data from a database
+"""
+
+LOGGER_NAME = "query_db"
+
+logger = logging.getLogger(LOGGER_NAME)
+
+_ACTION_REGISTRY: Dict[str, "Action"] = {}
+
+
+class Action(object):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        parser.add_argument(
+            "-v",
+            "--verbosity",
+            action="count",
+            help="Verbose mode. Multiple -v options increase the verbosity.",
+        )
+
+
+def register_action(cls: type):
+    """
+    Decorator for action classes to automate action registration
+    """
+    global _ACTION_REGISTRY
+    _ACTION_REGISTRY[cls.COMMAND] = cls
+    return cls
+
+
+class EntrywiseAction(Action):
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(EntrywiseAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "dataset", metavar="<dataset>", help="Dataset name (e.g. densepose_coco_2014_train)"
+        )
+        parser.add_argument(
+            "selector",
+            metavar="<selector>",
+            help="Dataset entry selector in the form field1[:type]=value1[,"
+            "field2[:type]=value_min-value_max...] which selects all "
+            "entries from the dataset that satisfy the constraints",
+        )
+        parser.add_argument(
+            "--max-entries", metavar="N", help="Maximum number of entries to process", type=int
+        )
+
+    @classmethod
+    def execute(cls: type, args: argparse.Namespace):
+        dataset = setup_dataset(args.dataset)
+        entry_selector = EntrySelector.from_string(args.selector)
+        context = cls.create_context(args)
+        if args.max_entries is not None:
+            for _, entry in zip(range(args.max_entries), dataset):
+                if entry_selector(entry):
+                    cls.execute_on_entry(entry, context)
+        else:
+            for entry in dataset:
+                if entry_selector(entry):
+                    cls.execute_on_entry(entry, context)
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        context = {}
+        return context
+
+
+@register_action
+class PrintAction(EntrywiseAction):
+    """
+    Print action that outputs selected entries to stdout
+    """
+
+    COMMAND: ClassVar[str] = "print"
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(PrintAction, cls).add_arguments(parser)
+
+    @classmethod
+    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
+        import pprint
+
+        printer = pprint.PrettyPrinter(indent=2, width=200, compact=True)
+        printer.pprint(entry)
+
+
+@register_action
+class ShowAction(EntrywiseAction):
+    """
+    Show action that visualizes selected entries on an image
+    """
+
+    COMMAND: ClassVar[str] = "show"
+    VISUALIZERS: ClassVar[Dict[str, object]] = {
+        "dp_segm": DensePoseDataCoarseSegmentationVisualizer(),
+        "dp_i": DensePoseDataPointsIVisualizer(),
+        "dp_u": DensePoseDataPointsUVisualizer(),
+        "dp_v": DensePoseDataPointsVVisualizer(),
+        "dp_pts": DensePoseDataPointsVisualizer(),
+        "bbox": BoundingBoxVisualizer(),
+    }
+
+    @classmethod
+    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
+        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
+        cls.add_arguments(parser)
+        parser.set_defaults(func=cls.execute)
+
+    @classmethod
+    def add_arguments(cls: type, parser: argparse.ArgumentParser):
+        super(ShowAction, cls).add_arguments(parser)
+        parser.add_argument(
+            "visualizations",
+            metavar="<visualizations>",
+            help="Comma separated list of visualizations, possible values: "
+            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
+        )
+        parser.add_argument(
+            "--output",
+            metavar="<image_file>",
+            default="output.png",
+            help="File name to save output to",
+        )
+
+    @classmethod
+    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
+        import cv2
+        import numpy as np
+
+        image_fpath = entry["file_name"]
+        image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE)
+        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
+        datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry)
+        visualizer = context["visualizer"]
+        image_vis = visualizer.visualize(image, datas)
+        entry_idx = context["entry_idx"] + 1
+        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
+        cv2.imwrite(out_fname, image_vis)
+        logger.info(f"Output saved to {out_fname}")
+        context["entry_idx"] += 1
+
+    @classmethod
+    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
+        base, ext = os.path.splitext(fname_base)
+        return base + ".{0:04d}".format(entry_idx) + ext
+
+    @classmethod
+    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
+        vis_specs = args.visualizations.split(",")
+        visualizers = []
+        for vis_spec in vis_specs:
+            vis = cls.VISUALIZERS[vis_spec]
+            visualizers.append(vis)
+        context = {
+            "vis_specs": vis_specs,
+            "visualizer": CompoundVisualizer(visualizers),
+            "out_fname": args.output,
+            "entry_idx": 0,
+        }
+        return context
+
+    @classmethod
+    def _extract_data_for_visualizers_from_entry(
+        cls: type, vis_specs: List[str], entry: Dict[str, Any]
+    ):
+        dp_list = []
+        bbox_list = []
+        for annotation in entry["annotations"]:
+            is_valid, _ = DensePoseDataRelative.validate_annotation(annotation)
+            if not is_valid:
+                continue
+            bbox = torch.as_tensor(annotation["bbox"])
+            bbox_list.append(bbox)
+            dp_data = DensePoseDataRelative(annotation)
+            dp_list.append(dp_data)
+        datas = []
+        for vis_spec in vis_specs:
+            datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list))
+        return datas
+
+
+def setup_dataset(dataset_name):
+    logger.info("Loading dataset {}".format(dataset_name))
+    start = timer()
+    dataset = DatasetCatalog.get(dataset_name)
+    stop = timer()
+    logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start))
+    return dataset
+
+
+def create_argument_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description=DOC,
+        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
+    )
+    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
+    subparsers = parser.add_subparsers(title="Actions")
+    for _, action in _ACTION_REGISTRY.items():
+        action.add_parser(subparsers)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    args = parser.parse_args()
+    verbosity = args.verbosity if hasattr(args, "verbosity") else None
+    global logger
+    logger = setup_logger(name=LOGGER_NAME)
+    logger.setLevel(verbosity_to_level(verbosity))
+    args.func(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/DensePose/train_net.py b/projects/DensePose/train_net.py
new file mode 100644
index 0000000..8dc9deb
--- /dev/null
+++ b/projects/DensePose/train_net.py
@@ -0,0 +1,81 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+DensePose Training Script.
+
+This script is similar to the training script in detectron2/tools.
+
+It is an example of how a user might use detectron2 for a new project.
+"""
+
+import os
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import build_detection_test_loader, build_detection_train_loader
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results
+from detectron2.utils.logger import setup_logger
+
+from densepose import DatasetMapper, DensePoseCOCOEvaluator, add_densepose_config
+
+
+class Trainer(DefaultTrainer):  # DefaultTrainer with DensePose-specific evaluators and mappers
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name):
+        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)]
+        if cfg.MODEL.DENSEPOSE_ON:  # the DensePose evaluator is added only when this flag is set
+            evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder))
+        return DatasetEvaluators(evaluators)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
+
+
+def setup(args):  # Build the frozen config: defaults, DensePose keys, config file, then CLI opts
+    cfg = get_cfg()
+    add_densepose_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    # Set up a logger for the "densepose" module
+    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose")
+    return cfg
+
+
+def main(args):  # Evaluate only (when args.eval_only is set) or train; returns the results
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():  # results are verified on the main process only
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
diff --git a/projects/README.md b/projects/README.md
new file mode 100644
index 0000000..efe7a22
--- /dev/null
+++ b/projects/README.md
@@ -0,0 +1,9 @@
+
+Here are a few research projects that are built on detectron2.
+They are examples of how to use detectron2 as a library, to make your projects more
+maintainable.
+
++ [DensePose: Dense Human Pose Estimation In The Wild](DensePose)
++ [Scale-Aware Trident Networks for Object Detection](TridentNet)
++ [TensorMask: A Foundation for Dense Object Segmentation](TensorMask)
++ Mesh R-CNN.  (Coming Soon)
diff --git a/projects/TensorMask/README.md b/projects/TensorMask/README.md
new file mode 100644
index 0000000..11fa938
--- /dev/null
+++ b/projects/TensorMask/README.md
@@ -0,0 +1,60 @@
+
+# TensorMask in Detectron2
+**A Foundation for Dense Object Segmentation**
+
+Xinlei Chen, Ross Girshick, Kaiming He, Piotr Dollár
+
+[[`arXiv`](https://arxiv.org/abs/1903.12174)] [[`BibTeX`](#CitingTensorMask)]
+
+<div align="center">
+  <img src="http://xinleic.xyz/images/tmask.png" width="700px" />
+</div>
+
+In this repository, we release code for TensorMask in Detectron2.
+TensorMask is a dense sliding-window instance segmentation framework that, for the first time, achieves results close to the well-developed Mask R-CNN framework -- both qualitatively and quantitatively. It establishes a conceptually complementary direction for object instance segmentation research. 
+
+## Installation
+To install, first set up Detectron2 following [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). Then compile the TensorMask-specific op (`swap_align2nat`):
+```bash
+cd /path/to/detectron2/projects/TensorMask
+python setup.py build develop
+```
+
+## Training
+
+To train a model, run:
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file <config.yaml>
+```
+
+For example, to launch TensorMask BiPyramid training (1x schedule) with ResNet-50 backbone on 8 GPUs,
+one should execute:
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_1x.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly (6x schedule with scale augmentation):
+```bash
+python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_6x.yaml --eval-only MODEL.WEIGHTS model.pth
+```
+
+# Model Zoo and Baselines
+
+(coming soon)
+
+
+## <a name="CitingTensorMask"></a>Citing TensorMask
+
+If you use TensorMask, please use the following BibTeX entry.
+
+```
+@InProceedings{chen2019tensormask,
+  title={Tensormask: A Foundation for Dense Object Segmentation},
+  author={Chen, Xinlei and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
+  journal={The International Conference on Computer Vision (ICCV)},
+  year={2019}
+}
+```
+
diff --git a/projects/TensorMask/configs/Base-TensorMask.yaml b/projects/TensorMask/configs/Base-TensorMask.yaml
new file mode 100644
index 0000000..4aa4107
--- /dev/null
+++ b/projects/TensorMask/configs/Base-TensorMask.yaml
@@ -0,0 +1,24 @@
+MODEL:
+  META_ARCHITECTURE: "TensorMask"
+  MASK_ON: True
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  ANCHOR_GENERATOR:
+    SIZES: [[44, 60], [88, 120], [176, 240], [352, 480], [704, 960], [1408, 1920]]
+    ASPECT_RATIOS: [[1.0]]
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    FUSE_TYPE: "avg"
+  TENSOR_MASK:
+    ALIGNED_ON: True
+    BIPYRAMID_ON: True
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
diff --git a/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml b/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml
new file mode 100644
index 0000000..5d5eee1
--- /dev/null
+++ b/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml
@@ -0,0 +1,5 @@
+_BASE_: "Base-TensorMask.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
diff --git a/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml b/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml
new file mode 100644
index 0000000..366a965
--- /dev/null
+++ b/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml
@@ -0,0 +1,11 @@
+_BASE_: "Base-TensorMask.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (480000, 520000)
+  MAX_ITER: 540000
+INPUT:
+  MIN_SIZE_TRAIN_SAMPLING: "range"
+  MIN_SIZE_TRAIN: (640, 800)
diff --git a/projects/TensorMask/setup.py b/projects/TensorMask/setup.py
new file mode 100644
index 0000000..49e11e0
--- /dev/null
+++ b/projects/TensorMask/setup.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import glob
+import os
+from setuptools import find_packages, setup
+import torch
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
+
+torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
+assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"
+
+
+def get_extensions():  # Describe the tensormask._C extension: C++ only, or CUDA when available
+    this_dir = os.path.dirname(os.path.abspath(__file__))
+    extensions_dir = os.path.join(this_dir, "tensormask", "layers", "csrc")
+    # Collect the C++/CUDA sources. Without recursive=True, "**" matches a single directory level.
+    main_source = os.path.join(extensions_dir, "vision.cpp")
+    sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"))
+    source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob(
+        os.path.join(extensions_dir, "*.cu")
+    )
+
+    sources = [main_source] + sources
+    # Default to a C++-only build; switched to a CUDA build below when CUDA is usable or forced.
+    extension = CppExtension
+
+    extra_compile_args = {"cxx": []}
+    define_macros = []
+
+    if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
+        extension = CUDAExtension
+        sources += source_cuda
+        define_macros += [("WITH_CUDA", None)]
+        extra_compile_args["nvcc"] = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+
+        # Pass a user-selected host compiler ($CC) on to nvcc via -ccbin.
+        CC = os.environ.get("CC", None)
+        if CC is not None:
+            extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
+    # The collected paths are already absolute, so this join leaves them unchanged.
+    sources = [os.path.join(extensions_dir, s) for s in sources]
+
+    include_dirs = [extensions_dir]
+
+    ext_modules = [
+        extension(
+            "tensormask._C",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+
+    return ext_modules
+
+
+setup(
+    name="tensormask",
+    version="0.1",
+    author="FAIR",
+    packages=find_packages(exclude=("configs", "tests")),
+    python_requires=">=3.6",
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/projects/TensorMask/tensormask/__init__.py b/projects/TensorMask/tensormask/__init__.py
new file mode 100644
index 0000000..e3b642a
--- /dev/null
+++ b/projects/TensorMask/tensormask/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import add_tensormask_config
+from .arch import TensorMask
diff --git a/projects/TensorMask/tensormask/arch.py b/projects/TensorMask/tensormask/arch.py
new file mode 100644
index 0000000..25e4c4c
--- /dev/null
+++ b/projects/TensorMask/tensormask/arch.py
@@ -0,0 +1,903 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import copy
+import logging
+import math
+from typing import List
+import torch
+import torch.nn.functional as F
+from fvcore.nn import sigmoid_focal_loss_star_jit, smooth_l1_loss
+from torch import nn
+
+from detectron2.layers import ShapeSpec, batched_nms, cat, paste_masks_in_image
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+from detectron2.modeling.meta_arch.retinanet import (
+    permute_all_cls_and_box_to_N_HWA_K_and_concat,
+    permute_to_N_HWA_K,
+)
+from detectron2.structures import Boxes, ImageList, Instances
+from detectron2.utils.logger import log_first_n
+
+from tensormask.layers import SwapAlign2Nat
+
+__all__ = ["TensorMask"]
+
+
+def _assignment_rule(
+    gt_boxes,
+    anchor_boxes,
+    unit_lengths,
+    min_anchor_size,
+    scale_thresh=2.0,
+    spatial_thresh=1.0,
+    uniqueness_on=True,
+):
+    """
+    Given two lists of boxes of N ground truth boxes and M anchor boxes,
+    compute the assignment between the two, following the assignment rules in
+    https://arxiv.org/abs/1903.12174.
+    The box order must be (xmin, ymin, xmax, ymax), so please make sure to convert
+    to BoxMode.XYXY_ABS before calling this function.
+
+    Args:
+        gt_boxes, anchor_boxes (Boxes): two Boxes. Contains N & M boxes/anchors, respectively.
+        unit_lengths (Tensor): Contains the unit lengths of M anchor boxes.
+        min_anchor_size (float): Minimum size of the anchor, in pixels
+        scale_thresh (float): The `scale` threshold: the maximum size of the anchor
+                              should not be greater than scale_thresh x max(h, w) of
+                              the ground truth box.
+        spatial_thresh (float): The `spatial` threshold: the l2 distance between the
+                              center of the anchor and the ground truth box should not
+                              be greater than spatial_thresh x u where u is the unit length.
+
+    Returns:
+        matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
+                ground-truth index in [0, N)
+        match_labels (Tensor[int8]): a vector of length M, where pred_labels[i] indicates
+            whether a prediction is a true or false positive or ignored
+    """
+    gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor
+    N = gt_boxes.shape[0]
+    M = anchor_boxes.shape[0]
+    if N == 0 or M == 0:
+        return (
+            gt_boxes.new_full((N,), 0, dtype=torch.int64),
+            gt_boxes.new_full((N,), -1, dtype=torch.int8),
+        )
+
+    # Containment rule
+    lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2])  # [N,M,2]
+    rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:])  # [N,M,2]
+    union = cat([lt, rb], dim=2)  # [N,M,4]
+
+    dummy_gt_boxes = torch.zeros_like(gt_boxes)
+    anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :]  # [N,M,4]
+
+    contain_matrix = torch.all(union == anchor, dim=2)  # [N,M]
+
+    # Centrality rule, scale
+    gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2], dim=1)[0]  # [N]
+    gt_size_upper = gt_size_lower * scale_thresh  # [N]
+    # Fall back for small objects
+    gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size
+    # Due to sampling of locations, the anchor sizes are deducted with sampling strides
+    anchor_size = (
+        torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] - unit_lengths
+    )  # [M]
+
+    size_diff_upper = gt_size_upper[:, None] - anchor_size  # [N,M]
+    scale_matrix = size_diff_upper >= 0  # [N,M]
+
+    # Centrality rule, spatial
+    gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2  # [N,2]
+    anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2  # [M,2]
+    offset_center = gt_center[:, None, :] - anchor_center[:, :]  # [N,M,2]
+    offset_center /= unit_lengths[:, None]  # [N,M,2]
+    spatial_square = spatial_thresh * spatial_thresh
+    spatial_matrix = torch.sum(offset_center * offset_center, dim=2) <= spatial_square
+
+    assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int()
+
+    # assign_matrix is N (gt) x M (predicted)
+    # Max over gt elements (dim 0) to find best gt candidate for each prediction
+    matched_vals, matches = assign_matrix.max(dim=0)
+    match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+    match_labels[matched_vals == 0] = 0
+    match_labels[matched_vals == 1] = 1
+
+    # find all the elements that match to ground truths multiple times
+    not_unique_idxs = assign_matrix.sum(dim=0) > 1
+    if uniqueness_on:
+        match_labels[not_unique_idxs] = 0
+    else:
+        match_labels[not_unique_idxs] = -1
+
+    return matches, match_labels
+
+
+def _paste_mask_lists_in_image(masks, boxes, image_shape, threshold=0.5):
+    """
+    Paste a list of masks that are of various resolutions (e.g., 28 x 28) into an image.
+    The location, height, and width for pasting each mask is determined by their
+    corresponding bounding boxes in boxes.
+
+    Args:
+        masks (list(Tensor)): A list of Tensor of shape (1, Hmask_i, Wmask_i).
+                            Values are in [0, 1]. The list length, Bimg, is the
+                            number of detected object instances in the image.
+        boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond
+                            to the same object instance.
+        image_shape (tuple): height, width
+        threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+            binary masks.
+
+    Returns:
+        img_masks (Tensor): A binary mask tensor of shape (Bimg, Himage, Wimage); img_masks[i]
+        belongs to object instance i and Himage, Wimage are the image height and width.
+        NOTE(review): for an empty `masks` the result is 4-D, (0, 1, Himage, Wimage) - confirm.
+    """
+    if len(masks) == 0:
+        return torch.empty((0, 1) + image_shape, dtype=torch.uint8)
+
+    # Loop over mask groups. Each group has the same mask prediction size.
+    img_masks = []
+    ind_masks = []
+    mask_sizes = torch.tensor([m.shape[-1] for m in masks])
+    unique_sizes = torch.unique(mask_sizes)
+    for msize in unique_sizes.tolist():
+        cur_ind = torch.where(mask_sizes == msize)[0]
+        ind_masks.append(cur_ind)
+
+        cur_masks = cat([masks[i] for i in cur_ind])
+        cur_boxes = boxes[cur_ind]
+        img_masks.append(paste_masks_in_image(cur_masks, cur_boxes, image_shape, threshold))
+
+    img_masks = cat(img_masks)
+    ind_masks = cat(ind_masks)
+
+    img_masks_out = torch.empty_like(img_masks)
+    img_masks_out[ind_masks, :, :] = img_masks  # undo the grouping: restore the input order
+
+    return img_masks_out
+
+
+def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5):
+    """
+    Post-process the output boxes for TensorMask.
+    The input images are often resized when entering an object detector.
+    As a result, we often need the outputs of the detector in a different
+    resolution from its inputs. This function rescales the raw outputs of
+    TensorMask to the desired output resolution.
+
+    Args:
+        results (Instances): the raw outputs from the detector.
+            `results.image_size` contains the input image resolution the detector sees.
+            This object might be modified in-place. Note that it does not contain the field
+            `pred_masks`, which is derived from the other input `result_mask_info`.
+        result_mask_info (list[Tensor], Boxes): a pair of two items for mask related results.
+                The first item is a list of #detection tensors, each is the predicted masks.
+                The second item is the anchors corresponding to the predicted masks;
+                they are rescaled in-place when the mask list is not empty.
+        output_height, output_width: the desired output resolution.
+        mask_threshold (float): threshold passed on for pasting the masks into the image.
+
+    Returns:
+        Instances: the postprocessed output from the model, based on the output resolution
+    """
+    scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+    results = Instances((output_height, output_width), **results.get_fields())
+
+    output_boxes = results.pred_boxes
+    output_boxes.tensor[:, 0::2] *= scale_x
+    output_boxes.tensor[:, 1::2] *= scale_y
+    output_boxes.clip(results.image_size)
+
+    inds_nonempty = output_boxes.nonempty()
+    results = results[inds_nonempty]  # keep only detections whose clipped box is non-empty
+    result_masks, result_anchors = result_mask_info
+    if result_masks:
+        result_anchors.tensor[:, 0::2] *= scale_x
+        result_anchors.tensor[:, 1::2] *= scale_y
+        result_masks = [x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i]
+        results.pred_masks = _paste_mask_lists_in_image(
+            result_masks,
+            result_anchors[inds_nonempty],
+            results.image_size,
+            threshold=mask_threshold,
+        )
+    return results
+
+
+class TensorMaskAnchorGenerator(DefaultAnchorGenerator):
+    """
+    For a set of image sizes and feature maps, computes a set of anchors for TensorMask.
+    It also computes the unit lengths and indexes for each anchor box.
+    """
+
+    def grid_anchors_with_unit_lengths_and_indexes(self, grid_sizes):
+        anchors = []
+        unit_lengths = []
+        indexes = []
+        for lvl, (size, stride, base_anchors) in enumerate(
+            zip(grid_sizes, self.strides, self.cell_anchors)
+        ):
+            grid_height, grid_width = size
+            device = base_anchors.device
+            shifts_x = torch.arange(
+                0, grid_width * stride, step=stride, dtype=torch.float32, device=device
+            )
+            shifts_y = torch.arange(
+                0, grid_height * stride, step=stride, dtype=torch.float32, device=device
+            )
+            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=2)
+            # Stack anchors in shapes of (HWA, 4)
+            cur_anchor = (shifts[:, :, None, :] + base_anchors.view(1, 1, -1, 4)).view(-1, 4)
+            anchors.append(cur_anchor)
+            unit_lengths.append(
+                torch.full((cur_anchor.shape[0],), stride, dtype=torch.float32, device=device)
+            )
+            # create (level, image, h, w, anchor) index rows with a mesh grid; image is set to 0
+            shifts_l = torch.full((1,), lvl, dtype=torch.int64, device=device)
+            shifts_i = torch.zeros((1,), dtype=torch.int64, device=device)
+            shifts_h = torch.arange(0, grid_height, dtype=torch.int64, device=device)
+            shifts_w = torch.arange(0, grid_width, dtype=torch.int64, device=device)
+            shifts_a = torch.arange(0, base_anchors.shape[0], dtype=torch.int64, device=device)
+            grids = torch.meshgrid(shifts_l, shifts_i, shifts_h, shifts_w, shifts_a)
+            # one row of 5 indexes per anchor, in the same (H, W, A) order as cur_anchor
+            indexes.append(torch.stack(grids, dim=5).view(-1, 5))
+
+        return anchors, unit_lengths, indexes
+
+    def forward(self, features):
+        """
+        Returns:
+            list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes.
+                The Boxes contains anchors of this image on the specific feature level.
+            list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
+                The tensor contains strides, or unit lengths for the anchors.
+            list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
+                The Tensor contains indexes for the anchors, with the last dimension meaning
+                (L, I, H, W, A), where L is level, I is image (not set yet), H is height,
+                W is width, and A is anchor.
+        """
+        num_images = len(features[0])  # assumes features[0] is batched along dim 0 - TODO confirm
+        grid_sizes = [feature_map.shape[-2:] for feature_map in features]
+        anchors_list, lengths_list, indexes_list = self.grid_anchors_with_unit_lengths_and_indexes(
+            grid_sizes
+        )
+
+        # Convert anchors from Tensor to Boxes
+        anchors_per_im = [Boxes(x) for x in anchors_list]
+        # every image gets its own deep copy of the per-level anchors, unit lengths and indexes
+        anchors = [copy.deepcopy(anchors_per_im) for _ in range(num_images)]
+        unit_lengths = [copy.deepcopy(lengths_list) for _ in range(num_images)]
+        indexes = [copy.deepcopy(indexes_list) for _ in range(num_images)]
+
+        return anchors, unit_lengths, indexes
+
+
+@META_ARCH_REGISTRY.register()
+class TensorMask(nn.Module):
+    """
+    TensorMask model. Creates FPN backbone, anchors and a head for classification
+    and box regression. Calculates and applies proper losses to class, box, and
+    masks.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+
+        # get the deice of the model
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        # fmt: off
+        self.num_classes              = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
+        self.in_features              = cfg.MODEL.TENSOR_MASK.IN_FEATURES
+        self.anchor_sizes             = cfg.MODEL.ANCHOR_GENERATOR.SIZES
+        self.num_levels               = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES)
+        # Loss parameters:
+        self.focal_loss_alpha         = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA
+        self.focal_loss_gamma         = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA
+        # Inference parameters:
+        self.score_threshold          = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST
+        self.topk_candidates          = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST
+        self.nms_threshold            = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST
+        self.detections_im            = cfg.TEST.DETECTIONS_PER_IMAGE
+        # Mask parameters:
+        self.mask_on                  = cfg.MODEL.MASK_ON
+        self.mask_loss_weight         = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT
+        self.mask_pos_weight          = torch.tensor(cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT,
+                                                     dtype=torch.float32,
+                                                     device=self.device)
+        self.bipyramid_on             = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
+        # fmt: on
+
+        # build the backbone
+        self.backbone = build_backbone(cfg)
+
+        backbone_shape = self.backbone.output_shape()
+        feature_shapes = [backbone_shape[f] for f in self.in_features]
+        feature_strides = [x.stride for x in feature_shapes]
+        # build anchors
+        self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes)
+        self.num_anchors = self.anchor_generator.num_cell_anchors[0]
+        anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]
+        self.mask_sizes = [size // feature_strides[0] for size in anchors_min_level]
+        self.min_anchor_size = min(anchors_min_level) - feature_strides[0]
+
+        # head of the TensorMask
+        self.head = TensorMaskHead(
+            cfg, self.num_levels, self.num_anchors, self.mask_sizes, feature_shapes
+        )
+        # box transform
+        self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS)
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DetectionTransform` .
+                Each item in the list contains the inputs for one image.
+            For now, each item in the list is a dict that contains:
+                image: Tensor, image in (C, H, W) format.
+                instances: Instances
+                Other information that's included in the original dicts, such as:
+                    "height", "width" (int): the output resolution of the model, used in inference.
+                        See :meth:`postprocess` for details.
+         Returns:
+            losses (dict[str: Tensor]): mapping from a named loss to a tensor
+                storing the loss. Used during training only.
+        """
+        images = self.preprocess_image(batched_inputs)
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        elif "targets" in batched_inputs[0]:
+            log_first_n(
+                logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
+            )
+            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        features = self.backbone(images.tensor)
+        features = [features[f] for f in self.in_features]
+        # apply the TensorMask head
+        pred_logits, pred_deltas, pred_masks = self.head(features)
+        # generate anchors based on features, is it image specific?
+        anchors, unit_lengths, indexes = self.anchor_generator(features)
+
+        if self.training:
+            # get ground truths for class labels and box targets, it will label each anchor
+            gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth(
+                anchors, unit_lengths, indexes, gt_instances
+            )
+            # compute the loss
+            return self.losses(
+                gt_class_info,
+                gt_delta_info,
+                gt_mask_info,
+                num_fg,
+                pred_logits,
+                pred_deltas,
+                pred_masks,
+            )
+        else:
+            # do inference to get the output
+            results = self.inference(pred_logits, pred_deltas, pred_masks, anchors, indexes, images)
+            processed_results = []
+            for results_im, input_im, image_size in zip(
+                results, batched_inputs, images.image_sizes
+            ):
+                height = input_im.get("height", image_size[0])
+                width = input_im.get("width", image_size[1])
+                # this is to do post-processing with the image size
+                result_box, result_mask = results_im
+                r = _postprocess(result_box, result_mask, height, width)
+                processed_results.append({"instances": r})
+            return processed_results
+
+    def losses(
+        self,
+        gt_class_info,
+        gt_delta_info,
+        gt_mask_info,
+        num_fg,
+        pred_logits,
+        pred_deltas,
+        pred_masks,
+    ):
+        """
+        Args:
+            For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see
+                :meth:`TensorMask.get_ground_truth`.
+            For `pred_logits`, `pred_deltas` and `pred_masks`, see
+                :meth:`TensorMaskHead.forward`.
+
+        Returns:
+            losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor
+                storing the loss. Used during training only. The potential dict keys are:
+                "loss_cls", "loss_box_reg" and "loss_mask".
+        """
+        gt_classes_target, gt_valid_inds = gt_class_info
+        gt_deltas, gt_fg_inds = gt_delta_info
+        gt_masks, gt_mask_inds = gt_mask_info
+        loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32, device=self.device)
+
+        # classification and regression
+        pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
+            pred_logits, pred_deltas, self.num_classes
+        )
+        loss_cls = (
+            sigmoid_focal_loss_star_jit(
+                pred_logits[gt_valid_inds],
+                gt_classes_target[gt_valid_inds],
+                alpha=self.focal_loss_alpha,
+                gamma=self.focal_loss_gamma,
+                reduction="sum",
+            )
+            / loss_normalizer
+        )
+
+        if num_fg == 0:
+            loss_box_reg = pred_deltas.sum() * 0
+        else:
+            loss_box_reg = (
+                smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum")
+                / loss_normalizer
+            )
+        losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
+
+        # mask prediction
+        if self.mask_on:
+            loss_mask = 0
+            for lvl in range(self.num_levels):
+                cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
+                for anc in range(self.num_anchors):
+                    cur_gt_mask_inds = gt_mask_inds[lvl][anc]
+                    if cur_gt_mask_inds is None:
+                        loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0
+                    else:
+                        cur_mask_size = self.mask_sizes[anc] * cur_level_factor
+                        # TODO maybe there are numerical issues when mask sizes are large
+                        cur_size_divider = torch.tensor(
+                            self.mask_loss_weight / (cur_mask_size ** 2),
+                            dtype=torch.float32,
+                            device=self.device,
+                        )
+
+                        cur_pred_masks = pred_masks[lvl][anc][
+                            cur_gt_mask_inds[:, 0],  # N
+                            :,  # V x U
+                            cur_gt_mask_inds[:, 1],  # H
+                            cur_gt_mask_inds[:, 2],  # W
+                        ]
+
+                        loss_mask += F.binary_cross_entropy_with_logits(
+                            cur_pred_masks.view(-1, cur_mask_size, cur_mask_size),  # V, U
+                            gt_masks[lvl][anc].to(dtype=torch.float32),
+                            reduction="sum",
+                            weight=cur_size_divider,
+                            pos_weight=self.mask_pos_weight,
+                        )
+            losses["loss_mask"] = loss_mask / loss_normalizer
+        return losses
+
+    @torch.no_grad()
+    def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
+        """
+        Label every anchor in the batch and collect the box and mask targets of the
+        foreground anchors. Runs without gradient tracking.
+
+        Args:
+            anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
+                list of #feature level Boxes. The Boxes contains anchors of
+                this image on the specific feature level.
+            unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
+                list of #feature level Tensor. The tensor contains unit lengths for anchors of
+                this image on the specific feature level.
+            indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
+                list of #feature level Tensor. The tensor contains the 5D index of
+                each anchor, the second dimension means (L, N, H, W, A), where L
+                is level, N is image, H is height, W is width, and A is anchor.
+            targets (list[Instances]): a list of N `Instances`s. The i-th
+                `Instances` contains the ground-truth per-instance annotations
+                for the i-th input image.  Specify `targets` during training only.
+
+        Returns:
+            gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
+                The first one is a float32 tensor of shape (R, #classes) storing one-hot
+                ground-truth labels for each anchor (all zeros for background and ignored
+                anchors). R is the total number of anchors in the batch.
+                The second one is a mask tensor of shape (R,), to indicate which
+                anchors are valid for loss computation (label >= 0), which anchors are not.
+            gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
+                The first one, of shape (F, 4). F=#foreground anchors.
+                The last dimension represents ground-truth box2box transform
+                targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
+                Only foreground anchors have values in this tensor. It is `None` when no
+                image in the batch has any ground-truth instance.
+                The second one, of shape (R,), is a mask tensor indicating which anchors
+                are foreground ones used for box regression. It is always a tensor.
+            gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
+                The first one is a list of P=#feature level elements. Each is a
+                list of A=#anchor tensors. Each tensor contains the ground truth
+                masks of the same size and for the same feature level. Could be `None`.
+                The second one is a list of P=#feature level elements. Each is a
+                list of A=#anchor tensors. Each tensor contains the location of the ground truth
+                masks of the same size and for the same feature level. The second dimension means
+                (N, H, W), where N is image, H is height, and W is width. Could be `None`.
+                Both lists stay filled with `None` when `self.mask_on` is False.
+            num_fg (int): F=#foreground anchors, used later for loss normalization.
+        """
+        gt_classes = []
+        gt_deltas = []
+        # One accumulator list per (level, anchor) pair; filled per image, concatenated at the end
+        gt_masks = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
+        gt_mask_inds = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
+
+        # Merge the per-level entries of every image into a single object per image
+        anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
+        unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
+        indexes = [cat(indexes_i) for indexes_i in indexes]
+
+        num_fg = 0
+        for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate(
+            zip(anchors, unit_lengths, indexes, targets)
+        ):
+            # Initialize all
+            # (every anchor starts with the label `self.num_classes`, i.e. background)
+            gt_classes_i = torch.full_like(
+                unit_lengths_im, self.num_classes, dtype=torch.int64, device=self.device
+            )
+            # Ground truth classes
+            has_gt = len(targets_im) > 0
+            if has_gt:
+                # Compute the pairwise matrix
+                gt_matched_inds, anchor_labels = _assignment_rule(
+                    targets_im.gt_boxes, anchors_im, unit_lengths_im, self.min_anchor_size
+                )
+                # Find the foreground instances
+                fg_inds = anchor_labels == 1
+                fg_anchors = anchors_im[fg_inds]
+                num_fg += len(fg_anchors)
+                # Find the ground truths for foreground instances
+                gt_fg_matched_inds = gt_matched_inds[fg_inds]
+                # Assign labels for foreground instances
+                gt_classes_i[fg_inds] = targets_im.gt_classes[gt_fg_matched_inds]
+                # Anchors with label -1 are ignored, others are left as negative
+                gt_classes_i[anchor_labels == -1] = -1
+
+                # Boxes
+                # Ground truth box regression, only for foregrounds
+                matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
+                # Compute box regression offsets for foregrounds only
+                gt_deltas_i = self.box2box_transform.get_deltas(
+                    fg_anchors.tensor, matched_gt_boxes.tensor
+                )
+                gt_deltas.append(gt_deltas_i)
+
+                # Masks
+                if self.mask_on:
+                    # Compute masks for each level and each anchor
+                    matched_indexes = indexes_im[fg_inds, :]
+                    for lvl in range(self.num_levels):
+                        # column 0 of the 5D index is the level
+                        ids_lvl = matched_indexes[:, 0] == lvl
+                        if torch.any(ids_lvl):
+                            # with the bipyramid, the mask side doubles at every level
+                            cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
+                            for anc in range(self.num_anchors):
+                                # column 4 of the 5D index is the anchor
+                                ids_lvl_anchor = ids_lvl & (matched_indexes[:, 4] == anc)
+                                if torch.any(ids_lvl_anchor):
+                                    # crop each matched ground-truth mask to its anchor box and
+                                    # resize it to the mask side of this (level, anchor) pair
+                                    gt_masks[lvl][anc].append(
+                                        targets_im[
+                                            gt_fg_matched_inds[ids_lvl_anchor]
+                                        ].gt_masks.crop_and_resize(
+                                            fg_anchors[ids_lvl_anchor].tensor,
+                                            self.mask_sizes[anc] * cur_level_factor,
+                                        )
+                                    )
+                                    # Select (N, H, W) dimensions
+                                    gt_mask_inds_lvl_anc = matched_indexes[ids_lvl_anchor, 1:4]
+                                    # Set the image index to the current image
+                                    gt_mask_inds_lvl_anc[:, 0] = i
+                                    gt_mask_inds[lvl][anc].append(gt_mask_inds_lvl_anc)
+            gt_classes.append(gt_classes_i)
+
+        # Classes and boxes
+        gt_classes = cat(gt_classes)
+        # label -1 means ignored; label `self.num_classes` means background
+        gt_valid_inds = gt_classes >= 0
+        gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
+        # one-hot targets; rows of background and ignored anchors stay all zero
+        gt_classes_target = torch.zeros(
+            (gt_classes.shape[0], self.num_classes), dtype=torch.float32, device=self.device
+        )
+        gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
+        gt_deltas = cat(gt_deltas) if gt_deltas else None
+
+        # Masks
+        # (level, anchor) pairs that never received a match become None
+        gt_masks = [[cat(mla) if mla else None for mla in ml] for ml in gt_masks]
+        gt_mask_inds = [[cat(ila) if ila else None for ila in il] for il in gt_mask_inds]
+        return (
+            (gt_classes_target, gt_valid_inds),
+            (gt_deltas, gt_fg_inds),
+            (gt_masks, gt_mask_inds),
+            num_fg,
+        )
+
+    def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
+        """
+        Arguments:
+            pred_logits, pred_deltas, pred_masks: Same as the output of:
+                meth:`TensorMaskHead.forward`
+            anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth`
+            images (ImageList): the input images
+
+        Returns:
+            results (List[Instances]): a list of #images elements.
+        """
+        assert len(anchors) == len(images)
+        results = []
+
+        pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
+        pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]
+
+        pred_logits = cat(pred_logits, dim=1)
+        pred_deltas = cat(pred_deltas, dim=1)
+
+        for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
+            # Get the size of the current image
+            image_size = images.image_sizes[img_idx]
+
+            logits_im = pred_logits[img_idx]
+            deltas_im = pred_deltas[img_idx]
+
+            if self.mask_on:
+                masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
+            else:
+                masks_im = [None] * self.num_levels
+            results_im = self.inference_single_image(
+                logits_im,
+                deltas_im,
+                masks_im,
+                Boxes.cat(anchors_im),
+                cat(indexes_im),
+                tuple(image_size),
+            )
+            results.append(results_im)
+        return results
+
+    def inference_single_image(
+        self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size
+    ):
+        """
+        Single-image inference. Return bounding-box detection results by thresholding
+        on scores and applying non-maximum suppression (NMS).
+
+        Arguments:
+            pred_logits (list[Tensor]): list of #feature levels. Each entry contains
+                tensor of size (AxHxW, K)
+            pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4.
+            pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors.
+                Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False.
+            anchors (list[Boxes]): list of #feature levels. Each entry contains
+                a Boxes object, which contains all the anchors for that
+                image in that feature level.
+            image_size (tuple(H, W)): a tuple of the image height and width.
+
+        Returns:
+            Same as `inference`, but for only one image.
+        """
+        pred_logits = pred_logits.flatten().sigmoid_()
+        # We get top locations across all levels to accelerate the inference speed,
+        # which does not seem to affect the accuracy.
+        # First select values above the threshold
+        logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
+        # Then get the top values
+        num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
+        pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
+        # Keep top k scoring values
+        pred_prob = pred_prob[:num_topk]
+        # Keep top k values
+        top_idxs = logits_top_idxs[topk_idxs[:num_topk]]
+
+        # class index
+        cls_idxs = top_idxs % self.num_classes
+        # HWA index
+        top_idxs //= self.num_classes
+        # predict boxes
+        pred_boxes = self.box2box_transform.apply_deltas(
+            pred_deltas[top_idxs], anchors[top_idxs].tensor
+        )
+        # apply nms
+        keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold)
+        # pick the top ones
+        keep = keep[: self.detections_im]
+
+        results = Instances(image_size)
+        results.pred_boxes = Boxes(pred_boxes[keep])
+        results.scores = pred_prob[keep]
+        results.pred_classes = cls_idxs[keep]
+
+        # deal with masks
+        result_masks, result_anchors = [], None
+        if self.mask_on:
+            # index and anchors, useful for masks
+            top_indexes = indexes[top_idxs]
+            top_anchors = anchors[top_idxs]
+            result_indexes = top_indexes[keep]
+            result_anchors = top_anchors[keep]
+            # Get masks and do sigmoid
+            for lvl, _, h, w, anc in result_indexes.tolist():
+                cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1)
+                result_masks.append(
+                    torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size))
+                )
+
+        return results, (result_masks, result_anchors)
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+
+class TensorMaskHead(nn.Module):
+    def __init__(self, cfg, num_levels, num_anchors, mask_sizes, input_shape: List[ShapeSpec]):
+        """
+        TensorMask head.
+        """
+        super().__init__()
+        # fmt: off
+        self.in_features        = cfg.MODEL.TENSOR_MASK.IN_FEATURES
+        in_channels             = input_shape[0].channels
+        num_classes             = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
+        cls_channels            = cfg.MODEL.TENSOR_MASK.CLS_CHANNELS
+        num_convs               = cfg.MODEL.TENSOR_MASK.NUM_CONVS
+        # box parameters
+        bbox_channels           = cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS
+        # mask parameters
+        self.mask_on            = cfg.MODEL.MASK_ON
+        self.mask_sizes         = mask_sizes
+        mask_channels           = cfg.MODEL.TENSOR_MASK.MASK_CHANNELS
+        self.align_on           = cfg.MODEL.TENSOR_MASK.ALIGNED_ON
+        self.bipyramid_on       = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
+        # fmt: on
+
+        # class subnet
+        cls_subnet = []
+        cur_channels = in_channels
+        for _ in range(num_convs):
+            cls_subnet.append(
+                nn.Conv2d(cur_channels, cls_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cur_channels = cls_channels
+            cls_subnet.append(nn.ReLU())
+
+        self.cls_subnet = nn.Sequential(*cls_subnet)
+        self.cls_score = nn.Conv2d(
+            cur_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
+        )
+        modules_list = [self.cls_subnet, self.cls_score]
+
+        # box subnet
+        bbox_subnet = []
+        cur_channels = in_channels
+        for _ in range(num_convs):
+            bbox_subnet.append(
+                nn.Conv2d(cur_channels, bbox_channels, kernel_size=3, stride=1, padding=1)
+            )
+            cur_channels = bbox_channels
+            bbox_subnet.append(nn.ReLU())
+
+        self.bbox_subnet = nn.Sequential(*bbox_subnet)
+        self.bbox_pred = nn.Conv2d(
+            cur_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1
+        )
+        modules_list.extend([self.bbox_subnet, self.bbox_pred])
+
+        # mask subnet
+        if self.mask_on:
+            mask_subnet = []
+            cur_channels = in_channels
+            for _ in range(num_convs):
+                mask_subnet.append(
+                    nn.Conv2d(cur_channels, mask_channels, kernel_size=3, stride=1, padding=1)
+                )
+                cur_channels = mask_channels
+                mask_subnet.append(nn.ReLU())
+
+            self.mask_subnet = nn.Sequential(*mask_subnet)
+            modules_list.append(self.mask_subnet)
+            for mask_size in self.mask_sizes:
+                cur_mask_module = "mask_pred_%02d" % mask_size
+                self.add_module(
+                    cur_mask_module,
+                    nn.Conv2d(
+                        cur_channels, mask_size * mask_size, kernel_size=1, stride=1, padding=0
+                    ),
+                )
+                modules_list.append(getattr(self, cur_mask_module))
+            if self.align_on:
+                if self.bipyramid_on:
+                    for lvl in range(num_levels):
+                        cur_mask_module = "align2nat_%02d" % lvl
+                        lambda_val = 2 ** lvl
+                        setattr(self, cur_mask_module, SwapAlign2Nat(lambda_val))
+                    # Also the fusing layer, stay at the same channel size
+                    mask_fuse = [
+                        nn.Conv2d(cur_channels, cur_channels, kernel_size=3, stride=1, padding=1),
+                        nn.ReLU(),
+                    ]
+                    self.mask_fuse = nn.Sequential(*mask_fuse)
+                    modules_list.append(self.mask_fuse)
+                else:
+                    self.align2nat = SwapAlign2Nat(1)
+
+        # Initialization
+        for modules in modules_list:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        # Use prior in model initialization to improve stability
+        bias_value = -math.log((1 - 0.01) / 0.01)
+        torch.nn.init.constant_(self.cls_score.bias, bias_value)
+
+    def forward(self, features):
+        """
+        Arguments:
+            features (list[Tensor]): FPN feature map tensors in high to low resolution.
+                Each tensor in the list correspond to different feature levels.
+
+        Returns:
+            pred_logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
+                The tensor predicts the classification probability
+                at each spatial position for each of the A anchors and K object
+                classes.
+            pred_deltas (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
+                The tensor predicts 4-vector (dx,dy,dw,dh) box
+                regression values for every anchor. These values are the
+                relative offset between the anchor and the ground truth box.
+            pred_masks (list(list[Tensor])): #lvl list of tensors, each is a list of
+                A tensors of shape (N, M_{i,a}, Hi, Wi).
+                The tensor predicts a dense set of M_ixM_i masks at every location.
+        """
+        pred_logits = [self.cls_score(self.cls_subnet(x)) for x in features]
+        pred_deltas = [self.bbox_pred(self.bbox_subnet(x)) for x in features]
+
+        pred_masks = None
+        if self.mask_on:
+            mask_feats = [self.mask_subnet(x) for x in features]
+
+            if self.bipyramid_on:
+                mask_feat_high_res = mask_feats[0]
+                H, W = mask_feat_high_res.shape[-2:]
+                mask_feats_up = []
+                for lvl, mask_feat in enumerate(mask_feats):
+                    lambda_val = 2.0 ** lvl
+                    mask_feat_up = mask_feat
+                    if lvl > 0:
+                        mask_feat_up = F.interpolate(
+                            mask_feat, scale_factor=lambda_val, mode="bilinear", align_corners=False
+                        )
+                    mask_feats_up.append(
+                        self.mask_fuse(mask_feat_up[:, :, :H, :W] + mask_feat_high_res)
+                    )
+                mask_feats = mask_feats_up
+
+            pred_masks = []
+            for lvl, mask_feat in enumerate(mask_feats):
+                cur_masks = []
+                for mask_size in self.mask_sizes:
+                    cur_mask_module = getattr(self, "mask_pred_%02d" % mask_size)
+                    cur_mask = cur_mask_module(mask_feat)
+                    if self.align_on:
+                        if self.bipyramid_on:
+                            cur_mask_module = getattr(self, "align2nat_%02d" % lvl)
+                            cur_mask = cur_mask_module(cur_mask)
+                        else:
+                            cur_mask = self.align2nat(cur_mask)
+                    cur_masks.append(cur_mask)
+                pred_masks.append(cur_masks)
+        return pred_logits, pred_deltas, pred_masks
diff --git a/projects/TensorMask/tensormask/config.py b/projects/TensorMask/tensormask/config.py
new file mode 100644
index 0000000..44479f2
--- /dev/null
+++ b/projects/TensorMask/tensormask/config.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_tensormask_config(cfg):
+    """
+    Add config for TensorMask.
+    """
+    cfg.MODEL.TENSOR_MASK = CN()
+
+    # Anchor parameters
+    cfg.MODEL.TENSOR_MASK.IN_FEATURES = ["p2", "p3", "p4", "p5", "p6", "p7"]
+
+    # Convolutions to use in the towers
+    cfg.MODEL.TENSOR_MASK.NUM_CONVS = 4
+
+    # Number of foreground classes.
+    cfg.MODEL.TENSOR_MASK.NUM_CLASSES = 80
+    # Channel size for the classification tower
+    cfg.MODEL.TENSOR_MASK.CLS_CHANNELS = 256
+
+    cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST = 0.05
+    # Only the top (1000 * #levels) candidate boxes across all levels are
+    # considered jointly during test (to improve speed)
+    cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST = 6000
+    cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST = 0.5
+
+    # Box parameters
+    # Channel size for the box tower
+    cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS = 128
+    # Weights on (dx, dy, dw, dh)
+    cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS = (1.5, 1.5, 0.75, 0.75)
+
+    # Loss parameters
+    cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA = 3.0
+    cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA = 0.3
+
+    # Mask parameters
+    # Channel size for the mask tower
+    cfg.MODEL.TENSOR_MASK.MASK_CHANNELS = 128
+    # Mask loss weight
+    cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT = 2.0
+    # weight on positive pixels within the mask
+    cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT = 1.5
+    # Whether to predict in the aligned representation
+    cfg.MODEL.TENSOR_MASK.ALIGNED_ON = False
+    # Whether to use the bipyramid architecture
+    cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON = False
diff --git a/projects/TensorMask/tensormask/layers/__init__.py b/projects/TensorMask/tensormask/layers/__init__.py
new file mode 100644
index 0000000..cbbac42
--- /dev/null
+++ b/projects/TensorMask/tensormask/layers/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .swap_align2nat import SwapAlign2Nat, swap_align2nat
+
+# Export every module-level name that does not start with an underscore
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h
new file mode 100644
index 0000000..2ec0373
--- /dev/null
+++ b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h
@@ -0,0 +1,54 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+#include <torch/types.h>
+
+namespace tensormask {
+
+#ifdef WITH_CUDA
+at::Tensor SwapAlign2Nat_forward_cuda( // CUDA forward, defined in SwapAlign2Nat_cuda.cu
+    const at::Tensor& X,
+    const int lambda_val,
+    const float pad_val);
+
+at::Tensor SwapAlign2Nat_backward_cuda( // CUDA backward, defined in SwapAlign2Nat_cuda.cu
+    const at::Tensor& gY,
+    const int lambda_val,
+    const int batch_size,
+    const int channel,
+    const int height,
+    const int width);
+#endif
+
+inline at::Tensor SwapAlign2Nat_forward( // device dispatcher for the forward op
+    const at::Tensor& X, // input tensor; only CUDA tensors are handled
+    const int lambda_val, // forwarded unchanged to the CUDA implementation
+    const float pad_val) { // forwarded unchanged to the CUDA implementation
+  if (X.is_cuda()) { // query the tensor directly; Tensor::type() is deprecated
+#ifdef WITH_CUDA
+    return SwapAlign2Nat_forward_cuda(X, lambda_val, pad_val);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU"); // raises: there is no CPU kernel
+}
+
+inline at::Tensor SwapAlign2Nat_backward( // device dispatcher for the backward op
+    const at::Tensor& gY, // gradient w.r.t. the forward output; CUDA only
+    const int lambda_val,
+    const int batch_size, // batch_size/channel/height/width: shape of the
+    const int channel, //   gradient tensor the CUDA implementation returns
+    const int height,
+    const int width) {
+  if (gY.is_cuda()) { // query the tensor directly; Tensor::type() is deprecated
+#ifdef WITH_CUDA
+    return SwapAlign2Nat_backward_cuda(
+        gY, lambda_val, batch_size, channel, height, width);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  AT_ERROR("Not implemented on the CPU"); // raises: there is no CPU kernel
+}
+
+} // namespace tensormask
diff --git a/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu
new file mode 100644
index 0000000..7405385
--- /dev/null
+++ b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu
@@ -0,0 +1,524 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+
+// Grid-stride loop: i starts at the thread's global id and advances by the grid's total thread count while i < n. TODO: move to a common file.
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__device__ inline T get_pixel_val( // bounds-checked read from a 5-D buffer
+    const T* tensor,
+    const int idx,
+    const int H,
+    const int W,
+    const int y,
+    const int x,
+    const int V,
+    const int U,
+    const int v,
+    const int u,
+    const T pad_val) {
+  // A coordinate outside any of the four axes reads as pad_val.
+  const bool inside = (0 <= y) && (y < H) && (0 <= x) && (x < W) &&
+      (0 <= v) && (v < V) && (0 <= u) && (u < U);
+  // Row-major (idx, V, U, H, W) offset; evaluated only when in bounds.
+  return inside ? tensor[(((idx * V + v) * U + u) * H + y) * W + x]
+                : pad_val;
+}
+
+template <typename T>
+__device__ inline void add_pixel_val( // atomically accumulate val into a 5-D buffer
+    T* tensor,
+    const T val,
+    const int idx,
+    const int H,
+    const int W,
+    const int y,
+    const int x,
+    const int V,
+    const int U,
+    const int v,
+    const int u) {
+  // Nothing is written for a zero value or a target outside any axis.
+  const bool inside = (0 <= y) && (y < H) && (0 <= x) && (x < W) &&
+      (0 <= v) && (v < V) && (0 <= u) && (u < U);
+  if (inside && (val != 0.)) {
+    atomicAdd(tensor + ((((idx * V + v) * U + u) * H + y) * W + x), val);
+  }
+}
+
+template <typename T>
+__global__ void SwapAlign2NatForwardFeat( // computes one output element per loop iteration
+    const int nthreads, // number of output elements to produce
+    const T* bottom_data, // input, read as (idx, Vin, Uin, Hin, Win) through get_pixel_val
+    const int Vout,
+    const int Uout,
+    const float hVout, // subtracted from v when locating the sampled input row
+    const float hUout, // subtracted from u when locating the sampled input column
+    const int Vin,
+    const int Uin,
+    const float lambda,
+    const int Hin,
+    const int Win,
+    const int Hout,
+    const int Wout,
+    const T pad_val, // value used for reads that fall outside the input
+    T* top_data) { // output, written at the flat index
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int idx = index; // peel x, y, u, v off the flat index; the remaining quotient is the leading index
+    const int x = idx % Wout;
+    idx /= Wout;
+    const int y = idx % Hout;
+    idx /= Hout;
+    const int u = idx % Uout;
+    idx /= Uout;
+    const int v = idx % Vout;
+    idx /= Vout;
+
+    const float ox = x * lambda + u - hUout + 0.5; // fractional input x coordinate to sample
+    const int xf = static_cast<int>(floor(ox)); // lower integer neighbor
+    const int xc = static_cast<int>(ceil(ox)); // upper integer neighbor (equals xf when ox is integral)
+    const float xwc = ox - xf; // interpolation weight of xc
+    const float xwf = 1. - xwc; // interpolation weight of xf
+
+    const float oy = y * lambda + v - hVout + 0.5; // fractional input y coordinate, same scheme as ox
+    const int yf = static_cast<int>(floor(oy));
+    const int yc = static_cast<int>(ceil(oy));
+    const float ywc = oy - yf;
+    const float ywf = 1. - ywc;
+
+    const float ou = (u + 0.5) / lambda - 0.5; // fractional input u coordinate (output u scaled down by lambda)
+    const int uf = static_cast<int>(floor(ou));
+    const int uc = static_cast<int>(ceil(ou));
+    const float uwc = ou - uf;
+    const float uwf = 1. - uwc;
+
+    const float ov = (v + 0.5) / lambda - 0.5; // fractional input v coordinate (output v scaled down by lambda)
+    const int vf = static_cast<int>(floor(ov));
+    const int vc = static_cast<int>(ceil(ov));
+    const float vwc = ov - vf;
+    const float vwf = 1. - vwc;
+
+    T val = ywf * xwf * vwf * uwf * // weighted sum over all 16 floor/ceil combinations of (y, x, v, u)
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uf, pad_val) +
+        ywf * xwf * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uc, pad_val) +
+        ywf * xwf * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uf, pad_val) +
+        ywf * xwf * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uc, pad_val) +
+        ywf * xwc * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uf, pad_val) +
+        ywf * xwc * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uc, pad_val) +
+        ywf * xwc * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uf, pad_val) +
+        ywf * xwc * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uc, pad_val) +
+        ywc * xwf * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uf, pad_val) +
+        ywc * xwf * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uc, pad_val) +
+        ywc * xwf * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uf, pad_val) +
+        ywc * xwf * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uc, pad_val) +
+        ywc * xwc * vwf * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uf, pad_val) +
+        ywc * xwc * vwf * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uc, pad_val) +
+        ywc * xwc * vwc * uwf *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uf, pad_val) +
+        ywc * xwc * vwc * uwc *
+            get_pixel_val(
+                bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uc, pad_val);
+
+    top_data[index] = val;
+  }
+}
+
+template <typename T>
+__global__ void SwapAlign2NatBackwardFeat( // scatters one output-gradient element per loop iteration
+    const int nthreads, // number of output-gradient elements to process
+    const T* top_diff, // gradient w.r.t. the forward output, read at the flat index
+    const int Vout,
+    const int Uout,
+    const float hVout, // subtracted from v when locating the sampled input row
+    const float hUout, // subtracted from u when locating the sampled input column
+    const int Vin,
+    const int Uin,
+    const float lambda,
+    const int Hin,
+    const int Win,
+    const int Hout,
+    const int Wout,
+    T* bottom_diff) { // gradient w.r.t. the forward input; accumulated into through add_pixel_val
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int idx = index; // peel x, y, u, v off the flat index; the remaining quotient is the leading index
+    const int x = idx % Wout;
+    idx /= Wout;
+    const int y = idx % Hout;
+    idx /= Hout;
+    const int u = idx % Uout;
+    idx /= Uout;
+    const int v = idx % Vout;
+    idx /= Vout;
+
+    const float ox = x * lambda + u - hUout + 0.5; // fractional input x coordinate this output element sampled
+    const int xf = static_cast<int>(floor(ox)); // lower integer neighbor
+    const int xc = static_cast<int>(ceil(ox)); // upper integer neighbor (equals xf when ox is integral)
+    const float xwc = ox - xf; // interpolation weight of xc
+    const float xwf = 1. - xwc; // interpolation weight of xf
+
+    const float oy = y * lambda + v - hVout + 0.5; // fractional input y coordinate, same scheme as ox
+    const int yf = static_cast<int>(floor(oy));
+    const int yc = static_cast<int>(ceil(oy));
+    const float ywc = oy - yf;
+    const float ywf = 1. - ywc;
+
+    const float ou = (u + 0.5) / lambda - 0.5; // fractional input u coordinate (output u scaled down by lambda)
+    const int uf = static_cast<int>(floor(ou));
+    const int uc = static_cast<int>(ceil(ou));
+    const float uwc = ou - uf;
+    const float uwf = 1. - uwc;
+
+    const float ov = (v + 0.5) / lambda - 0.5; // fractional input v coordinate (output v scaled down by lambda)
+    const int vf = static_cast<int>(floor(ov));
+    const int vc = static_cast<int>(ceil(ov));
+    const float vwc = ov - vf;
+    const float vwf = 1. - vwc;
+
+    const T grad = top_diff[index]; // incoming gradient for this output element
+
+    add_pixel_val( // scatter grad to the 16 floor/ceil combinations of (y, x, v, u), each scaled by its weight
+        bottom_diff,
+        ywf * xwf * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwf * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywf * xwc * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yf,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwf * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xf,
+        Vin,
+        Uin,
+        vc,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwf * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwf * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vf,
+        uc);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwc * uwf * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uf);
+    add_pixel_val(
+        bottom_diff,
+        ywc * xwc * vwc * uwc * grad,
+        idx,
+        Hin,
+        Win,
+        yc,
+        xc,
+        Vin,
+        Uin,
+        vc,
+        uc);
+  }
+}
+
+namespace tensormask {
+
+at::Tensor SwapAlign2Nat_forward_cuda( // host launcher: validates X, allocates Y, runs the forward kernel
+    const at::Tensor& X, // 4-D CUDA tensor whose channel count must be a square number
+    const int lambda_val, // integer scale, must be >= 1
+    const float pad_val) { // value the kernel uses for reads outside X
+  AT_ASSERTM(X.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(X.ndimension() == 4, "input must be a 4D tensor");
+  AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
+  const int N = X.size(0);
+  const int C = X.size(1);
+  const int Vin = static_cast<int>(sqrt(static_cast<float>(C)));
+  const int Uin = Vin > 0 ? C / Vin : 0; // guard: C == 0 gives Vin == 0, which must not be a divisor
+  AT_ASSERTM(
+      C == Vin * Uin && Vin == Uin, "#channels should be a square number");
+  const int Vout = lambda_val * Vin;
+  const int Uout = lambda_val * Uin;
+  const int Hin = X.size(2);
+  const int Win = X.size(3);
+  const float lambda = static_cast<float>(lambda_val);
+  const int Hout = static_cast<int>(ceil(Hin / lambda));
+  const int Wout = static_cast<int>(ceil(Win / lambda));
+  const float hVout = Vout / 2.;
+  const float hUout = Uout / 2.;
+
+  at::cuda::CUDAGuard device_guard(X.device());
+
+  at::Tensor Y = at::empty({N, Vout * Uout, Hout, Wout}, X.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int64_t kThreads = 512, kMaxBlocks = 4096; // int64_t matches numel(); `long` is 32-bit on LLP64
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(Y.numel(), kThreads), kMaxBlocks));
+  dim3 block(512);
+
+  if (Y.numel() == 0) { // empty output (including zero channels): nothing to launch
+    AT_CUDA_CHECK(cudaGetLastError());
+    return Y;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "SwapAlign2Nat_forward", [&] {
+    SwapAlign2NatForwardFeat<scalar_t><<<grid, block, 0, stream>>>(
+        Y.numel(),
+        X.contiguous().data_ptr<scalar_t>(),
+        Vout,
+        Uout,
+        hVout,
+        hUout,
+        Vin,
+        Uin,
+        lambda,
+        Hin,
+        Win,
+        Hout,
+        Wout,
+        pad_val,
+        Y.data_ptr<scalar_t>());
+  });
+  cudaDeviceSynchronize(); // NOTE(review): device-wide sync; the backward launcher has none - confirm it is needed
+  AT_CUDA_CHECK(cudaGetLastError());
+  return Y;
+}
+
+at::Tensor SwapAlign2Nat_backward_cuda( // host launcher: allocates a zeroed gX and runs the backward kernel
+    const at::Tensor& gY, // 4-D CUDA gradient w.r.t. the forward output
+    const int lambda_val, // integer scale, must be >= 1
+    const int batch_size, // batch_size/channel/height/width give the shape of the returned gX
+    const int channel,
+    const int height,
+    const int width) {
+  AT_ASSERTM(gY.device().is_cuda(), "input gradient must be a CUDA tensor");
+  AT_ASSERTM(gY.ndimension() == 4, "input gradient must be a 4D tensor");
+  AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
+  const int Vin = static_cast<int>(sqrt(static_cast<float>(channel)));
+  const int Uin = Vin > 0 ? channel / Vin : 0; // guard: channel == 0 gives Vin == 0, which must not be a divisor
+  const int Vout = lambda_val * Vin;
+  const int Uout = lambda_val * Uin;
+  const float hVout = Vout / 2.;
+  const float hUout = Uout / 2.;
+  const int Hout = gY.size(2);
+  const int Wout = gY.size(3);
+
+  at::cuda::CUDAGuard device_guard(gY.device());
+
+  at::Tensor gX = at::zeros({batch_size, channel, height, width}, gY.options()); // zeroed: the kernel accumulates into it
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const int64_t kThreads = 512, kMaxBlocks = 4096; // int64_t matches numel(); `long` is 32-bit on LLP64
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(gY.numel(), kThreads), kMaxBlocks));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (gY.numel() == 0) {
+    AT_CUDA_CHECK(cudaGetLastError());
+    return gX;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(gY.scalar_type(), "SwapAlign2Nat_backward", [&] {
+    SwapAlign2NatBackwardFeat<scalar_t><<<grid, block, 0, stream>>>(
+        gY.numel(),
+        gY.contiguous().data_ptr<scalar_t>(),
+        Vout,
+        Uout,
+        hVout,
+        hUout,
+        Vin,
+        Uin,
+        static_cast<float>(lambda_val),
+        height,
+        width,
+        Hout,
+        Wout,
+        gX.data_ptr<scalar_t>());
+  });
+  AT_CUDA_CHECK(cudaGetLastError());
+  return gX;
+}
+
+} // namespace tensormask
diff --git a/projects/TensorMask/tensormask/layers/csrc/vision.cpp b/projects/TensorMask/tensormask/layers/csrc/vision.cpp
new file mode 100644
index 0000000..ad8e472
--- /dev/null
+++ b/projects/TensorMask/tensormask/layers/csrc/vision.cpp
@@ -0,0 +1,19 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+#include <torch/extension.h>
+#include "SwapAlign2Nat/SwapAlign2Nat.h"
+
+namespace tensormask {
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { // Python bindings exposed by the compiled extension module
+  m.def(
+      "swap_align2nat_forward", // name visible from Python
+      &SwapAlign2Nat_forward,
+      "SwapAlign2Nat_forward"); // docstring attached to the binding
+  m.def(
+      "swap_align2nat_backward", // name visible from Python
+      &SwapAlign2Nat_backward,
+      "SwapAlign2Nat_backward"); // docstring attached to the binding
+}
+
+} // namespace tensormask
diff --git a/projects/TensorMask/tensormask/layers/swap_align2nat.py b/projects/TensorMask/tensormask/layers/swap_align2nat.py
new file mode 100644
index 0000000..79a6809
--- /dev/null
+++ b/projects/TensorMask/tensormask/layers/swap_align2nat.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from torch import nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from tensormask import _C
+
+
+class _SwapAlign2Nat(Function):
+    @staticmethod
+    def forward(ctx, X, lambda_val, pad_val):
+        ctx.lambda_val = lambda_val
+        ctx.input_shape = X.size()
+
+        Y = _C.swap_align2nat_forward(X, lambda_val, pad_val)
+        return Y
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, gY):
+        lambda_val = ctx.lambda_val
+        bs, ch, h, w = ctx.input_shape
+
+        gX = _C.swap_align2nat_backward(gY, lambda_val, bs, ch, h, w)
+
+        return gX, None, None
+
+
+swap_align2nat = _SwapAlign2Nat.apply  # functional form: swap_align2nat(X, lambda_val, pad_val)
+
+
+class SwapAlign2Nat(nn.Module):
+    """
+        The op `SwapAlign2Nat` described in https://arxiv.org/abs/1903.12174.
+        Given an input tensor that predicts masks of shape (N, C=VxU, H, W),
+        apply the op, it will return masks of shape (N, V'xU', H', W') where
+        the unit lengths of (V, U) and (H, W) are swapped, and the mask representation
+        is transformed from aligned to natural.
+        Args:
+            lambda_val (int): the relative unit length ratio between (V, U) and (H, W),
+                                as we always have larger unit lengths for (V, U) than (H, W),
+                                lambda_val is always >= 1.
+            pad_val (float):    padding value for the values falling outside of the input
+                                tensor, default set to -6 as sigmoid(-6) is ~0, indicating
+                                that is no masks outside of the tensor.
+    """
+
+    def __init__(self, lambda_val, pad_val=-6.):
+        super(SwapAlign2Nat, self).__init__()
+        self.lambda_val = lambda_val
+        self.pad_val = pad_val
+
+    def forward(self, X):
+        return swap_align2nat(X, self.lambda_val, self.pad_val)
+
+    def __repr__(self):
+        tmpstr = self.__class__.__name__ + "("
+        tmpstr += "lambda_val=" + str(self.lambda_val)
+        tmpstr += ", pad_val=" + str(self.pad_val)
+        tmpstr += ")"
+        return tmpstr
diff --git a/projects/TensorMask/tests/__init__.py b/projects/TensorMask/tests/__init__.py
new file mode 100644
index 0000000..168f997
--- /dev/null
+++ b/projects/TensorMask/tests/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/projects/TensorMask/tests/test_swap_align2nat.py b/projects/TensorMask/tests/test_swap_align2nat.py
new file mode 100644
index 0000000..456fc87
--- /dev/null
+++ b/projects/TensorMask/tests/test_swap_align2nat.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import unittest
+import torch
+from torch.autograd import gradcheck
+
+from tensormask.layers.swap_align2nat import SwapAlign2Nat
+
+
+class SwapAlign2NatTest(unittest.TestCase):
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_swap_align2nat_gradcheck_cuda(self):
+        dtype = torch.float64
+        device = torch.device('cuda')
+        m = SwapAlign2Nat(2).to(dtype=dtype, device=device)
+        x = torch.rand(2, 4, 10, 10, dtype=dtype, device=device, requires_grad=True)
+
+        assert gradcheck(m, x), 'gradcheck failed for SwapAlign2Nat CUDA'
+
+    def _swap_align2nat(self, tensor, lambda_val):
+        """
+        The basic setup for testing Swap_Align
+        """
+        op = SwapAlign2Nat(lambda_val, pad_val=0.)
+        input = torch.from_numpy(tensor[None, :, :, :].astype("float32"))
+        output = op.forward(input.cuda()).cpu().numpy()
+        return output[0]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/projects/TensorMask/train_net.py b/projects/TensorMask/train_net.py
new file mode 100644
index 0000000..a09219e
--- /dev/null
+++ b/projects/TensorMask/train_net.py
@@ -0,0 +1,68 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+TensorMask Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, verify_results
+
+from tensormask import add_tensormask_config
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        return COCOEvaluator(dataset_name, cfg, True, output_folder)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_tensormask_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()  # detectron2's default command-line options
+    print("Command Line Args:", args)
+    launch(  # hand main to detectron2's launcher together with the GPU/machine settings from the CLI
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),  # one-element tuple: main takes the parsed args as its only parameter
+    )
diff --git a/projects/TridentNet/README.md b/projects/TridentNet/README.md
new file mode 100644
index 0000000..4b7a901
--- /dev/null
+++ b/projects/TridentNet/README.md
@@ -0,0 +1,60 @@
+
+# TridentNet in Detectron2
+**Scale-Aware Trident Networks for Object Detection**
+
+Yanghao Li\*, Yuntao Chen\*, Naiyan Wang, Zhaoxiang Zhang
+
+[[`TridentNet`](https://github.com/TuSimple/simpledet/tree/master/models/tridentnet)] [[`arXiv`](https://arxiv.org/abs/1901.01892)] [[`BibTeX`](#CitingTridentNet)]
+
+<div align="center">
+  <img src="https://drive.google.com/uc?export=view&id=10THEPdIPmf3ooMyNzrfZbpWihEBvixwt" width="700px" />
+</div>
+
+In this repository, we implement TridentNet-Fast in Detectron2.
+Trident Network (TridentNet) aims to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which the branches share the same transformation parameters but have different receptive fields. TridentNet-Fast is a fast approximation of TridentNet that achieves significant improvements without any additional parameters or computational cost.
+
+## Training
+
+To train a model, run
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file <config.yaml>
+```
+
+For example, to launch end-to-end TridentNet training with ResNet-50 backbone on 8 GPUs,
+one should execute:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --eval-only MODEL.WEIGHTS model.pth
+```
+
+## Results on MS-COCO in Detectron2
+
+|Model|Backbone|Head|lr sched|AP|AP50|AP75|APs|APm|APl|download|
+|-----|--------|----|--------|--|----|----|---|---|---|--------|
+|Faster|R50-C4|C5-512ROI|1X|35.7|56.1|38.0|19.2|40.9|48.7|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a>|
+|TridentFast|R50-C4|C5-128ROI|1X|38.0|58.1|40.8|19.5|42.2|54.6|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/model_final_756cda.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/metrics.json">metrics</a>|
+|Faster|R50-C4|C5-512ROI|3X|38.4|58.7|41.3|20.7|42.7|53.1|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a>|
+|TridentFast|R50-C4|C5-128ROI|3X|40.6|60.8|43.6|23.4|44.7|57.1|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/model_final_e1027c.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/metrics.json">metrics</a>|
+|Faster|R101-C4|C5-512ROI|3X|41.1|61.4|44.0|22.2|45.5|55.9|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a>|
+|TridentFast|R101-C4|C5-128ROI|3X|43.6|63.4|47.0|24.3|47.8|60.0|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/model_final_164568.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/metrics.json">metrics</a>|
+
+
+## <a name="CitingTridentNet"></a>Citing TridentNet
+
+If you use TridentNet, please use the following BibTeX entry.
+
+```
+@InProceedings{li2019scale,
+  title={Scale-Aware Trident Networks for Object Detection},
+  author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang},
+  booktitle={The International Conference on Computer Vision (ICCV)},
+  year={2019}
+}
+```
+
diff --git a/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml b/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml
new file mode 100644
index 0000000..1dd2a15
--- /dev/null
+++ b/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml
@@ -0,0 +1,28 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedRCNN"
+  BACKBONE:
+    NAME: "build_trident_resnet_backbone"
+  ROI_HEADS:
+    NAME: "TridentRes5ROIHeads"
+    POSITIVE_FRACTION: 0.5
+    BATCH_SIZE_PER_IMAGE: 128
+    PROPOSAL_APPEND_GT: False
+  PROPOSAL_GENERATOR:
+    NAME: "TridentRPN"
+  RPN:
+    POST_NMS_TOPK_TRAIN: 500
+  TRIDENT:
+    NUM_BRANCH: 3
+    BRANCH_DILATIONS: [1, 2, 3]
+    TEST_BRANCH_IDX: 1
+    TRIDENT_STAGE: "res4"
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.02
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
diff --git a/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml
new file mode 100644
index 0000000..bc83c2f
--- /dev/null
+++ b/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 101
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml
new file mode 100644
index 0000000..fda2cb6
--- /dev/null
+++ b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml
@@ -0,0 +1,6 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
diff --git a/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml
new file mode 100644
index 0000000..ebf89d0
--- /dev/null
+++ b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml
@@ -0,0 +1,9 @@
+_BASE_: "Base-TridentNet-Fast-C4.yaml"
+MODEL:
+  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
+  MASK_ON: False
+  RESNETS:
+    DEPTH: 50
+SOLVER:
+  STEPS: (210000, 250000)
+  MAX_ITER: 270000
diff --git a/projects/TridentNet/train_net.py b/projects/TridentNet/train_net.py
new file mode 100644
index 0000000..34cde48
--- /dev/null
+++ b/projects/TridentNet/train_net.py
@@ -0,0 +1,68 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+TridentNet Training Script.
+
+This script is a simplified version of the training script in detectron2/tools.
+"""
+
+import os
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
+from detectron2.evaluation import COCOEvaluator, verify_results
+
+from tridentnet import add_tridentnet_config
+
+
+class Trainer(DefaultTrainer):
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        return COCOEvaluator(dataset_name, cfg, True, output_folder)
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+    """
+    cfg = get_cfg()
+    add_tridentnet_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        return res
+
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()  # detectron2's default command-line options
+    print("Command Line Args:", args)
+    launch(  # hand main to detectron2's launcher together with the GPU/machine settings from the CLI
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),  # one-element tuple: main takes the parsed args as its only parameter
+    )
diff --git a/projects/TridentNet/tridentnet/__init__.py b/projects/TridentNet/tridentnet/__init__.py
new file mode 100644
index 0000000..2fcdeb4
--- /dev/null
+++ b/projects/TridentNet/tridentnet/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from .config import add_tridentnet_config
+from .trident_backbone import (
+    TridentBottleneckBlock,
+    build_trident_resnet_backbone,
+    make_trident_stage,
+)
+from .trident_rpn import TridentRPN
+from .trident_rcnn import TridentRes5ROIHeads, TridentStandardROIHeads
diff --git a/projects/TridentNet/tridentnet/config.py b/projects/TridentNet/tridentnet/config.py
new file mode 100644
index 0000000..f33f473
--- /dev/null
+++ b/projects/TridentNet/tridentnet/config.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from detectron2.config import CfgNode as CN
+
+
+def add_tridentnet_config(cfg):
+    """
+    Extend ``cfg`` in place with the ``MODEL.TRIDENT`` options and their default values.
+    """
+    cfg.MODEL.TRIDENT = CN()
+    trident = cfg.MODEL.TRIDENT
+
+    # How many branches TridentNet has.
+    trident.NUM_BRANCH = 3
+    # Dilation used by each branch.
+    trident.BRANCH_DILATIONS = [1, 2, 3]
+    # Stage in which the trident blocks are applied. Res4 is the default according to the
+    # TridentNet paper.
+    trident.TRIDENT_STAGE = "res4"
+    # Branch index used at test time (TridentNet Fast inference):
+    #   - -1 aggregates the results of all branches during inference;
+    #   - any other value uses only that branch for fast inference. The recommended
+    #     setting is the middle branch.
+    trident.TEST_BRANCH_IDX = 1
diff --git a/projects/TridentNet/tridentnet/trident_backbone.py b/projects/TridentNet/tridentnet/trident_backbone.py
new file mode 100644
index 0000000..232dfaf
--- /dev/null
+++ b/projects/TridentNet/tridentnet/trident_backbone.py
@@ -0,0 +1,223 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn.functional as F
+
+from detectron2.layers import Conv2d, FrozenBatchNorm2d, get_norm
+from detectron2.modeling import BACKBONE_REGISTRY, ResNet, ResNetBlockBase, make_stage
+from detectron2.modeling.backbone.resnet import BasicStem, BottleneckBlock, DeformBottleneckBlock
+
+from .trident_conv import TridentConv
+
+__all__ = ["TridentBottleneckBlock", "make_trident_stage", "build_trident_resnet_backbone"]
+
+
+class TridentBottleneckBlock(ResNetBlockBase):
+    """
+    Bottleneck residual block (1x1 conv -> 3x3 conv -> 1x1 conv, plus a shortcut) whose 3x3
+    conv is a :class:`TridentConv`. The block works on a list of feature maps, one per
+    active branch.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        *,
+        bottleneck_channels,
+        stride=1,
+        num_groups=1,
+        norm="BN",
+        stride_in_1x1=False,
+        num_branch=3,
+        dilations=(1, 2, 3),
+        concat_output=False,
+        test_branch_idx=-1,
+    ):
+        """
+        Args:
+            in_channels (int): number of channels of the block input.
+            out_channels (int): number of channels of the block output.
+            bottleneck_channels (int): number of channels of the 3x3 conv.
+            stride (int): stride of the block. It is used by the shortcut conv (if any) and
+                by either the first 1x1 conv or the 3x3 conv (see `stride_in_1x1`).
+            num_groups (int): number of groups of the 3x3 conv.
+            norm: normalization spec passed to `get_norm` for every conv.
+            stride_in_1x1 (bool): if True the stride is placed on the first 1x1 conv,
+                otherwise on the 3x3 conv.
+            num_branch (int): the number of branches in TridentNet.
+            dilations (tuple): the dilations of multiple branches in TridentNet. The same
+                values are used as the paddings of the 3x3 conv.
+            concat_output (bool): whether to concatenate the outputs of the branches.
+                Use 'True' for the last trident block.
+            test_branch_idx (int): in eval mode, -1 runs all branches and any other value
+                runs a single branch.
+        """
+        super().__init__(in_channels, out_channels, stride)
+
+        assert num_branch == len(dilations)
+
+        self.num_branch = num_branch
+        self.concat_output = concat_output
+        self.test_branch_idx = test_branch_idx
+
+        # A 1x1 conv shortcut is only needed when the channel count changes.
+        if in_channels != out_channels:
+            self.shortcut = Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=stride,
+                bias=False,
+                norm=get_norm(norm, out_channels),
+            )
+        else:
+            self.shortcut = None
+
+        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
+
+        self.conv1 = Conv2d(
+            in_channels,
+            bottleneck_channels,
+            kernel_size=1,
+            stride=stride_1x1,
+            bias=False,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        # Padding equals dilation for every branch.
+        self.conv2 = TridentConv(
+            bottleneck_channels,
+            bottleneck_channels,
+            kernel_size=3,
+            stride=stride_3x3,
+            paddings=dilations,
+            bias=False,
+            groups=num_groups,
+            dilations=dilations,
+            num_branch=num_branch,
+            test_branch_idx=test_branch_idx,
+            norm=get_norm(norm, bottleneck_channels),
+        )
+
+        self.conv3 = Conv2d(
+            bottleneck_channels,
+            out_channels,
+            kernel_size=1,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
+            if layer is not None:  # shortcut can be None
+                weight_init.c2_msra_fill(layer)
+
+    def forward(self, x):
+        """
+        Args:
+            x: either a list with one input per active branch, or a single input, which is
+                then repeated for every active branch.
+
+        Returns:
+            A list with one output per active branch or, if `concat_output` is set, the
+            result of `torch.cat` over that list.
+        """
+        # All branches are active in training mode or when test_branch_idx == -1.
+        num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1
+        if not isinstance(x, list):
+            x = [x] * num_branch
+        out = [self.conv1(b) for b in x]
+        out = [F.relu_(b) for b in out]
+
+        # conv2 takes and returns the whole list of branch features.
+        out = self.conv2(out)
+        out = [F.relu_(b) for b in out]
+
+        out = [self.conv3(b) for b in out]
+
+        if self.shortcut is not None:
+            shortcut = [self.shortcut(b) for b in x]
+        else:
+            shortcut = x
+
+        out = [out_b + shortcut_b for out_b, shortcut_b in zip(out, shortcut)]
+        out = [F.relu_(b) for b in out]
+        if self.concat_output:
+            out = torch.cat(out)
+        return out
+
+
+def make_trident_stage(block_class, num_blocks, first_stride, **kwargs):
+    """
+    Create a resnet stage by creating many blocks for TridentNet.
+
+    Args:
+        block_class (callable): called with keyword arguments only to build each block.
+        num_blocks (int): number of blocks in the stage.
+        first_stride (int): stride of the first block; every later block uses stride 1.
+        kwargs: other arguments passed to every block. When the stage has more than one
+            block it must contain "out_channels", which becomes the "in_channels" of every
+            block after the first.
+
+    Returns:
+        list: the blocks of the stage. Only the last one is built with `concat_output=True`.
+    """
+    blocks = []
+    for i in range(num_blocks - 1):
+        blocks.append(block_class(stride=first_stride if i == 0 else 1, **kwargs))
+        kwargs["in_channels"] = kwargs["out_channels"]
+    # In a single-block stage the last block is also the first one, so it has to carry
+    # `first_stride`; otherwise the stride of the stage would be silently dropped.
+    last_stride = first_stride if num_blocks <= 1 else 1
+    blocks.append(block_class(stride=last_stride, concat_output=True, **kwargs))
+    return blocks
+
+
+@BACKBONE_REGISTRY.register()
+def build_trident_resnet_backbone(cfg, input_shape):
+    """
+    Create a ResNet instance from config for TridentNet.
+
+    The stage named by `cfg.MODEL.TRIDENT.TRIDENT_STAGE` is built from
+    :class:`TridentBottleneckBlock` with `make_trident_stage`; every other stage is built
+    with `make_stage` from `BottleneckBlock` or `DeformBottleneckBlock`.
+
+    Args:
+        cfg: config node; `MODEL.RESNETS`, `MODEL.BACKBONE` and `MODEL.TRIDENT` are read.
+        input_shape: only its `channels` attribute is read here.
+
+    Returns:
+        ResNet: a :class:`ResNet` instance.
+    """
+    # need registration of new blocks/stems?
+    norm = cfg.MODEL.RESNETS.NORM
+    stem = BasicStem(
+        in_channels=input_shape.channels,
+        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
+        norm=norm,
+    )
+    freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
+
+    # Freeze the stem: disable gradients and convert it with convert_frozen_batchnorm.
+    if freeze_at >= 1:
+        for p in stem.parameters():
+            p.requires_grad = False
+        stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem)
+
+    # fmt: off
+    out_features         = cfg.MODEL.RESNETS.OUT_FEATURES
+    depth                = cfg.MODEL.RESNETS.DEPTH
+    num_groups           = cfg.MODEL.RESNETS.NUM_GROUPS
+    width_per_group      = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
+    bottleneck_channels  = num_groups * width_per_group
+    in_channels          = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
+    out_channels         = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
+    stride_in_1x1        = cfg.MODEL.RESNETS.STRIDE_IN_1X1
+    res5_dilation        = cfg.MODEL.RESNETS.RES5_DILATION
+    deform_on_per_stage  = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
+    deform_modulated     = cfg.MODEL.RESNETS.DEFORM_MODULATED
+    deform_num_groups    = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
+    num_branch           = cfg.MODEL.TRIDENT.NUM_BRANCH
+    branch_dilations     = cfg.MODEL.TRIDENT.BRANCH_DILATIONS
+    trident_stage        = cfg.MODEL.TRIDENT.TRIDENT_STAGE
+    test_branch_idx      = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX
+    # fmt: on
+    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
+
+    # Number of blocks in res2..res5 for each supported depth; any other depth raises KeyError.
+    num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth]
+
+    stages = []
+
+    res_stage_idx = {"res2": 2, "res3": 3, "res4": 4, "res5": 5}
+    out_stage_idx = [res_stage_idx[f] for f in out_features]
+    trident_stage_idx = res_stage_idx[trident_stage]
+    # Stages are only built up to the deepest requested output feature.
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        dilation = res5_dilation if stage_idx == 5 else 1
+        # res2, and res5 when it is dilated, start with stride 1; other stages with stride 2.
+        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
+        stage_kargs = {
+            "num_blocks": num_blocks_per_stage[idx],
+            "first_stride": first_stride,
+            "in_channels": in_channels,
+            "bottleneck_channels": bottleneck_channels,
+            "out_channels": out_channels,
+            "num_groups": num_groups,
+            "norm": norm,
+            "stride_in_1x1": stride_in_1x1,
+            "dilation": dilation,
+        }
+        if stage_idx == trident_stage_idx:
+            assert not deform_on_per_stage[
+                idx
+            ], "Not support deformable conv in Trident blocks yet."
+            stage_kargs["block_class"] = TridentBottleneckBlock
+            stage_kargs["num_branch"] = num_branch
+            stage_kargs["dilations"] = branch_dilations
+            stage_kargs["test_branch_idx"] = test_branch_idx
+            # TridentBottleneckBlock takes per-branch `dilations` and has no `dilation` argument.
+            stage_kargs.pop("dilation")
+        elif deform_on_per_stage[idx]:
+            stage_kargs["block_class"] = DeformBottleneckBlock
+            stage_kargs["deform_modulated"] = deform_modulated
+            stage_kargs["deform_num_groups"] = deform_num_groups
+        else:
+            stage_kargs["block_class"] = BottleneckBlock
+        blocks = (
+            make_trident_stage(**stage_kargs)
+            if stage_idx == trident_stage_idx
+            else make_stage(**stage_kargs)
+        )
+        # Channel widths double from one stage to the next.
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+
+        # Freeze every block of the stages up to and including stage `freeze_at`.
+        if freeze_at >= stage_idx:
+            for block in blocks:
+                block.freeze()
+        stages.append(blocks)
+    return ResNet(stem, stages, out_features=out_features)
diff --git a/projects/TridentNet/tridentnet/trident_conv.py b/projects/TridentNet/tridentnet/trident_conv.py
new file mode 100644
index 0000000..7e2d525
--- /dev/null
+++ b/projects/TridentNet/tridentnet/trident_conv.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
+
+from detectron2.layers.wrappers import _NewEmptyTensorOp
+
+
+class TridentConv(nn.Module):
+    """
+    2D convolution whose single weight (and optional bias) is shared by several branches;
+    each branch applies it with its own padding and dilation.
+
+    In training mode, or when `test_branch_idx == -1`, `forward` expects one input per
+    branch. Otherwise it expects a single input and only uses the padding and dilation of
+    branch `test_branch_idx`.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        paddings=0,
+        dilations=1,
+        groups=1,
+        num_branch=1,
+        test_branch_idx=-1,
+        bias=False,
+        norm=None,
+        activation=None,
+    ):
+        """
+        Args:
+            in_channels (int): number of input channels.
+            out_channels (int): number of output channels.
+            kernel_size (int or tuple): kernel size, expanded to a pair.
+            stride (int or tuple): stride shared by all branches, expanded to a pair.
+            paddings (int or sequence): padding of each branch; an int is used for all
+                branches.
+            dilations (int or sequence): dilation of each branch; an int is used for all
+                branches.
+            groups (int): number of convolution groups.
+            num_branch (int): number of branches.
+            test_branch_idx (int): branch used in eval mode, or -1 to run all branches.
+            bias (bool): whether to add a learnable bias.
+            norm (callable or None): applied to every branch output, if given.
+            activation (callable or None): applied after `norm`, if given.
+        """
+        super(TridentConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.num_branch = num_branch
+        self.stride = _pair(stride)
+        self.groups = groups
+        self.with_bias = bias
+        if isinstance(paddings, int):
+            paddings = [paddings] * self.num_branch
+        if isinstance(dilations, int):
+            dilations = [dilations] * self.num_branch
+        self.paddings = [_pair(padding) for padding in paddings]
+        self.dilations = [_pair(dilation) for dilation in dilations]
+        self.test_branch_idx = test_branch_idx
+        self.norm = norm
+        self.activation = activation
+
+        assert len({self.num_branch, len(self.paddings), len(self.dilations)}) == 1
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+        )
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.bias = None
+
+        nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (list): one tensor per active branch.
+
+        Returns:
+            list: one output tensor per active branch.
+        """
+        all_branches = self.training or self.test_branch_idx == -1
+        num_branch = self.num_branch if all_branches else 1
+        assert len(inputs) == num_branch
+
+        # Padding and dilation of the active branches, aligned with `inputs`.
+        if all_branches:
+            paddings, dilations = self.paddings, self.dilations
+        else:
+            paddings = [self.paddings[self.test_branch_idx]]
+            dilations = [self.dilations[self.test_branch_idx]]
+
+        if inputs[0].numel() == 0:
+            # Empty input: only compute the output shape of every branch with the usual
+            # convolution arithmetic. As before, norm and activation are skipped here.
+            outputs = []
+            for x, padding, dilation in zip(inputs, paddings, dilations):
+                spatial_shape = [
+                    (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                    for i, p, di, k, s in zip(
+                        x.shape[-2:], padding, dilation, self.kernel_size, self.stride
+                    )
+                ]
+                output_shape = [x.shape[0], self.weight.shape[0]] + spatial_shape
+                outputs.append(_NewEmptyTensorOp.apply(x, output_shape))
+            return outputs
+
+        outputs = [
+            F.conv2d(x, self.weight, self.bias, self.stride, padding, dilation, self.groups)
+            for x, padding, dilation in zip(inputs, paddings, dilations)
+        ]
+
+        if self.norm is not None:
+            outputs = [self.norm(x) for x in outputs]
+        if self.activation is not None:
+            outputs = [self.activation(x) for x in outputs]
+        return outputs
+
+    def extra_repr(self):
+        """Return the constructor settings as a comma-separated `name=value` string."""
+        fields = [
+            ("in_channels", self.in_channels),
+            ("out_channels", self.out_channels),
+            ("kernel_size", self.kernel_size),
+            ("num_branch", self.num_branch),
+            ("test_branch_idx", self.test_branch_idx),
+            ("stride", self.stride),
+            ("paddings", self.paddings),
+            ("dilations", self.dilations),
+            ("groups", self.groups),
+            ("bias", self.with_bias),
+        ]
+        return ", ".join(name + "=" + str(value) for name, value in fields)
diff --git a/projects/TridentNet/tridentnet/trident_rcnn.py b/projects/TridentNet/tridentnet/trident_rcnn.py
new file mode 100644
index 0000000..f458b25
--- /dev/null
+++ b/projects/TridentNet/tridentnet/trident_rcnn.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from detectron2.layers import batched_nms
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.roi_heads.roi_heads import Res5ROIHeads
+from detectron2.structures import Instances
+
+
+def merge_branch_instances(instances, num_branch, nms_thrsh, topk_per_image):
+    """
+    Merge detection results from different branches of TridentNet.
+    Return detection results by applying non-maximum suppression (NMS) on bounding boxes
+    and keep the unsuppressed boxes and other instances (e.g mask) if any.
+
+    Args:
+        instances (list[Instances]): A list of N * num_branch instances that store detection
+            results. Contain N images and each image has num_branch instances.
+        num_branch (int): Number of branches used for merging detection results for each image.
+            With a single branch, `instances` is returned unchanged.
+        nms_thrsh (float):  The threshold to use for box non-maximum suppression. Value in [0, 1].
+        topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
+            all detections.
+
+    Returns:
+        results: (list[Instances]): A list of N instances, one for each image in the batch,
+            that stores the topk most confidence detections after merging results from multiple
+            branches.
+    """
+    if num_branch == 1:
+        return instances
+
+    batch_size = len(instances) // num_branch
+    results = []
+    for i in range(batch_size):
+        # The result of image i from branch j is stored at index i + batch_size * j.
+        instance = Instances.cat([instances[i + batch_size * j] for j in range(num_branch)])
+
+        # Apply per-class NMS
+        keep = batched_nms(
+            instance.pred_boxes.tensor, instance.scores, instance.pred_classes, nms_thrsh
+        )
+        # A negative top-k means "keep everything". Slicing with a negative bound would
+        # instead drop detections from the end of `keep`.
+        if topk_per_image >= 0:
+            keep = keep[:topk_per_image]
+        results.append(instance[keep])
+
+    return results
+
+
+@ROI_HEADS_REGISTRY.register()
+class TridentRes5ROIHeads(Res5ROIHeads):
+    """
+    :class:`Res5ROIHeads` (the ROI heads of a typical "C4" R-CNN model) for TridentNet.
+    Targets are repeated once per active branch, and at inference the per-branch
+    detections are merged.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+
+        trident_cfg = cfg.MODEL.TRIDENT
+        self.num_branch = trident_cfg.NUM_BRANCH
+        # Fast inference is enabled by any TEST_BRANCH_IDX other than -1.
+        self.trident_fast = trident_cfg.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`Res5ROIHeads.forward`.
+        """
+        # All branches are active, except during fast inference where only one is.
+        if self.training or not self.trident_fast:
+            num_branch = self.num_branch
+        else:
+            num_branch = 1
+        # One copy of the targets per active branch.
+        all_targets = None if targets is None else targets * num_branch
+        pred_instances, losses = super().forward(images, features, proposals, all_targets)
+        del images, all_targets, targets
+
+        if self.training:
+            return pred_instances, losses
+
+        merged_instances = merge_branch_instances(
+            pred_instances, num_branch, self.test_nms_thresh, self.test_detections_per_img
+        )
+        return merged_instances, {}
+
+
+@ROI_HEADS_REGISTRY.register()
+class TridentStandardROIHeads(StandardROIHeads):
+    """
+    :class:`StandardROIHeads` for TridentNet. Targets are repeated once per active branch,
+    and at inference the per-branch detections are merged.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+
+        trident_cfg = cfg.MODEL.TRIDENT
+        self.num_branch = trident_cfg.NUM_BRANCH
+        # Fast inference is enabled by any TEST_BRANCH_IDX other than -1.
+        self.trident_fast = trident_cfg.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, proposals, targets=None):
+        """
+        See :class:`StandardROIHeads.forward`.
+        """
+        # Fast inference runs a single branch; every other case runs all of them.
+        if self.training or not self.trident_fast:
+            num_branch = self.num_branch
+        else:
+            num_branch = 1
+        # Every active branch gets its own copy of the targets.
+        all_targets = None if targets is None else targets * num_branch
+        pred_instances, losses = super().forward(images, features, proposals, all_targets)
+        del images, all_targets, targets
+
+        if self.training:
+            return pred_instances, losses
+
+        merged_instances = merge_branch_instances(
+            pred_instances, num_branch, self.test_nms_thresh, self.test_detections_per_img
+        )
+        return merged_instances, {}
diff --git a/projects/TridentNet/tridentnet/trident_rpn.py b/projects/TridentNet/tridentnet/trident_rpn.py
new file mode 100644
index 0000000..c30137f
--- /dev/null
+++ b/projects/TridentNet/tridentnet/trident_rpn.py
@@ -0,0 +1,32 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import torch
+
+from detectron2.modeling import PROPOSAL_GENERATOR_REGISTRY
+from detectron2.modeling.proposal_generator.rpn import RPN
+from detectron2.structures import ImageList
+
+
+@PROPOSAL_GENERATOR_REGISTRY.register()
+class TridentRPN(RPN):
+    """
+    RPN subnetwork for TridentNet: images and ground truth are repeated once per active
+    branch before the regular :class:`RPN` forward pass.
+    """
+
+    def __init__(self, cfg, input_shape):
+        super().__init__(cfg, input_shape)
+
+        trident_cfg = cfg.MODEL.TRIDENT
+        self.num_branch = trident_cfg.NUM_BRANCH
+        # Fast inference is enabled by any TEST_BRANCH_IDX other than -1.
+        self.trident_fast = trident_cfg.TEST_BRANCH_IDX != -1
+
+    def forward(self, images, features, gt_instances=None):
+        """
+        See :class:`RPN.forward`.
+        """
+        if self.training or not self.trident_fast:
+            num_branch = self.num_branch
+        else:
+            num_branch = 1
+        # Repeat the image tensor, the image sizes and the ground truth for each branch.
+        repeated_tensor = torch.cat([images.tensor] * num_branch)
+        repeated_sizes = images.image_sizes * num_branch
+        all_images = ImageList(repeated_tensor, repeated_sizes)
+        all_gt_instances = None if gt_instances is None else gt_instances * num_branch
+
+        return super().forward(all_images, features, all_gt_instances)
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..2a2ce44
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,23 @@
+[isort]
+line_length=100
+multi_line_output=4
+known_standard_library=numpy,setuptools
+known_myself=detectron2
+known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil
+no_lines_before=STDLIB,THIRDPARTY
+sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER
+default_section=FIRSTPARTY
+
+[mypy]
+python_version=3.6
+ignore_missing_imports = True
+warn_unused_configs = True
+disallow_untyped_defs = True
+check_untyped_defs = True
+warn_unused_ignores = True
+warn_redundant_casts = True
+show_column_numbers = True
+follow_imports = silent
+allow_redefinition = True
+; Disallow functions that are only partially annotated
+disallow_incomplete_defs = True
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..bb2fe98
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import glob
+import os
+import shutil
+from setuptools import find_packages, setup
+from typing import List
+import torch
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
+
+# Only the "major.minor" part of the installed PyTorch version is compared.
+torch_ver = list(map(int, torch.__version__.split(".")[:2]))
+assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"
+
+
+def get_extensions():
+    """
+    Describe the ``detectron2._C`` extension built from ``detectron2/layers/csrc``.
+
+    A CUDA extension is configured when PyTorch reports CUDA as available and ``CUDA_HOME``
+    is set, or when the environment variable ``FORCE_CUDA`` equals "1". Otherwise a
+    C++-only extension is configured.
+
+    Returns:
+        list: a one-element list holding the extension object.
+    """
+    root = os.path.dirname(os.path.abspath(__file__))
+    csrc_dir = os.path.join(root, "detectron2", "layers", "csrc")
+
+    # vision.cpp comes first, followed by the .cpp files matched below csrc.
+    sources = [os.path.join(csrc_dir, "vision.cpp")]
+    sources += glob.glob(os.path.join(csrc_dir, "**", "*.cpp"))
+    cuda_sources = glob.glob(os.path.join(csrc_dir, "**", "*.cu"))
+    cuda_sources += glob.glob(os.path.join(csrc_dir, "*.cu"))
+
+    build_cuda = (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv(
+        "FORCE_CUDA", "0"
+    ) == "1"
+
+    extension_cls = CppExtension
+    macros = []
+    compile_args = {"cxx": []}
+    if build_cuda:
+        extension_cls = CUDAExtension
+        sources += cuda_sources
+        macros.append(("WITH_CUDA", None))
+        nvcc_flags = [
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ]
+        # It's better if pytorch can do this by default ..
+        host_compiler = os.environ.get("CC", None)
+        if host_compiler is not None:
+            nvcc_flags.append("-ccbin={}".format(host_compiler))
+        compile_args["nvcc"] = nvcc_flags
+
+    return [
+        extension_cls(
+            "detectron2._C",
+            sources,
+            include_dirs=[csrc_dir],
+            define_macros=macros,
+            extra_compile_args=compile_args,
+        )
+    ]
+
+
+def get_model_zoo_configs() -> List[str]:
+    """
+    Return a list of configs to include in package for model zoo. Copy over these configs inside
+    detectron2/model_zoo.
+
+    The ``configs`` directory next to this file is symlinked (or, if that fails, copied) to
+    ``detectron2/model_zoo/configs``, replacing whatever is already there.
+
+    Returns:
+        The ``configs/**/*.yaml`` paths found relative to the current working directory.
+    """
+
+    # Use absolute paths while symlinking.
+    root = os.path.dirname(os.path.realpath(__file__))
+    source_configs_dir = os.path.join(root, "configs")
+    destination = os.path.join(root, "detectron2", "model_zoo", "configs")
+    # Remove stale symlink/directory from a previous build. The link test comes first and
+    # stands on its own: os.path.exists() follows symlinks and is False for a dangling
+    # link, which would otherwise be left in place and break both steps below.
+    if os.path.islink(destination):
+        os.unlink(destination)
+    elif os.path.exists(destination):
+        shutil.rmtree(destination)
+
+    # Symlink the config directory inside package to have a cleaner pip install.
+    try:
+        os.symlink(source_configs_dir, destination)
+    except OSError:
+        # Fall back to copying if symlink fails: ex. on Windows.
+        shutil.copytree(source_configs_dir, destination)
+
+    config_paths = glob.glob("configs/**/*.yaml", recursive=True)
+    return config_paths
+
+
+# Package metadata and build configuration. get_model_zoo_configs() and get_extensions()
+# are evaluated as arguments below, i.e. every time this script is executed.
+setup(
+    name="detectron2",
+    version="0.1",
+    author="FAIR",
+    url="https://github.com/facebookresearch/detectron2",
+    description="Detectron2 is FAIR's next-generation research "
+    "platform for object detection and segmentation.",
+    packages=find_packages(exclude=("configs", "tests")),
+    # The config files returned by get_model_zoo_configs() ship with detectron2.model_zoo.
+    package_data={"detectron2.model_zoo": get_model_zoo_configs()},
+    python_requires=">=3.6",
+    install_requires=[
+        "termcolor>=1.1",
+        "Pillow>=6.0",
+        "yacs>=0.1.6",
+        "tabulate",
+        "cloudpickle",
+        "matplotlib",
+        "tqdm>4.29.0",
+        "tensorboard",
+        "imagesize",
+    ],
+    # Optional dependencies, installed with the "all" extra.
+    extras_require={"all": ["shapely", "psutil"]},
+    # The extension from get_extensions() is built with PyTorch's BuildExtension command.
+    ext_modules=get_extensions(),
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..168f997
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
diff --git a/tests/test_anchor_generator.py b/tests/test_anchor_generator.py
new file mode 100644
index 0000000..e122655
--- /dev/null
+++ b/tests/test_anchor_generator.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.anchor_generator import DefaultAnchorGenerator, RotatedAnchorGenerator
+
+logger = logging.getLogger(__name__)
+
+
+class TestAnchorGenerator(unittest.TestCase):
+    """Compare generated anchors with hard-coded expected values."""
+
+    def test_default_anchor_generator(self):
+        """Check DefaultAnchorGenerator on a 1x2 feature map with stride 4."""
+        cfg = get_cfg()
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
+
+        anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])
+
+        # only the last two dimensions of features matter here
+        num_images = 2
+        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
+        anchors = anchor_generator([features["stage3"]])
+        # 2 sizes x 3 aspect ratios = 6 anchors for each of the 2 feature map locations.
+        expected_anchor_tensor = torch.tensor(
+            [
+                [-32.0, -8.0, 32.0, 8.0],
+                [-16.0, -16.0, 16.0, 16.0],
+                [-8.0, -32.0, 8.0, 32.0],
+                [-64.0, -16.0, 64.0, 16.0],
+                [-32.0, -32.0, 32.0, 32.0],
+                [-16.0, -64.0, 16.0, 64.0],
+                [-28.0, -8.0, 36.0, 8.0],  # -28.0 == -32.0 + STRIDE (4)
+                [-12.0, -16.0, 20.0, 16.0],
+                [-4.0, -32.0, 12.0, 32.0],
+                [-60.0, -16.0, 68.0, 16.0],
+                [-28.0, -32.0, 36.0, 32.0],
+                [-12.0, -64.0, 20.0, 64.0],
+            ]
+        )
+
+        # The same anchors are expected for every image.
+        for i in range(num_images):
+            assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
+
+    def test_rrpn_anchor_generator(self):
+        """Check RotatedAnchorGenerator on a 1x2 feature map with stride 4 and two angles."""
+        cfg = get_cfg()
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
+        cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 45]]
+        anchor_generator = RotatedAnchorGenerator(cfg, [ShapeSpec(stride=4)])
+
+        # only the last two dimensions of features matter here
+        num_images = 2
+        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
+        anchors = anchor_generator([features["stage3"]])
+        # 2 sizes x 3 aspect ratios x 2 angles = 12 anchors for each of the 2 locations.
+        expected_anchor_tensor = torch.tensor(
+            [
+                [0.0, 0.0, 64.0, 16.0, 0.0],
+                [0.0, 0.0, 64.0, 16.0, 45.0],
+                [0.0, 0.0, 32.0, 32.0, 0.0],
+                [0.0, 0.0, 32.0, 32.0, 45.0],
+                [0.0, 0.0, 16.0, 64.0, 0.0],
+                [0.0, 0.0, 16.0, 64.0, 45.0],
+                [0.0, 0.0, 128.0, 32.0, 0.0],
+                [0.0, 0.0, 128.0, 32.0, 45.0],
+                [0.0, 0.0, 64.0, 64.0, 0.0],
+                [0.0, 0.0, 64.0, 64.0, 45.0],
+                [0.0, 0.0, 32.0, 128.0, 0.0],
+                [0.0, 0.0, 32.0, 128.0, 45.0],
+                [4.0, 0.0, 64.0, 16.0, 0.0],  # 4.0 == 0.0 + STRIDE (4)
+                [4.0, 0.0, 64.0, 16.0, 45.0],
+                [4.0, 0.0, 32.0, 32.0, 0.0],
+                [4.0, 0.0, 32.0, 32.0, 45.0],
+                [4.0, 0.0, 16.0, 64.0, 0.0],
+                [4.0, 0.0, 16.0, 64.0, 45.0],
+                [4.0, 0.0, 128.0, 32.0, 0.0],
+                [4.0, 0.0, 128.0, 32.0, 45.0],
+                [4.0, 0.0, 64.0, 64.0, 0.0],
+                [4.0, 0.0, 64.0, 64.0, 45.0],
+                [4.0, 0.0, 32.0, 128.0, 0.0],
+                [4.0, 0.0, 32.0, 128.0, 45.0],
+            ]
+        )
+
+        # The same anchors are expected for every image.
+        for i in range(num_images):
+            assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
+
+
+# Run the tests of this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_box2box_transform.py b/tests/test_box2box_transform.py
new file mode 100644
index 0000000..d9e7aaf
--- /dev/null
+++ b/tests/test_box2box_transform.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated
+
+logger = logging.getLogger(__name__)
+
+
+def random_boxes(mean_box, stdev, N):
+    """Return an (N, 4) float tensor: `mean_box` plus uniform noise drawn from [0, stdev)."""
+    offset = torch.tensor(mean_box, dtype=torch.float)
+    noise = torch.rand(N, 4)
+    return noise * stdev + offset
+
+
+class TestBox2BoxTransform(unittest.TestCase):
+    """Round-trip test for :class:`Box2BoxTransform`."""
+
+    def test_reconstruction(self):
+        """Applying the deltas between two box sets to the source boxes restores the targets."""
+        transform = Box2BoxTransform(weights=(5, 5, 10, 10))
+        src_boxes = random_boxes([10, 10, 20, 20], 1, 10)
+        dst_boxes = random_boxes([10, 10, 20, 20], 1, 10)
+
+        # Always run on CPU, and additionally on CUDA when it is available.
+        devices = [torch.device("cpu")]
+        if torch.cuda.is_available():
+            devices += [torch.device("cuda")]
+
+        for device in devices:
+            src_boxes = src_boxes.to(device=device)
+            dst_boxes = dst_boxes.to(device=device)
+            deltas = transform.get_deltas(src_boxes, dst_boxes)
+            reconstructed = transform.apply_deltas(deltas, src_boxes)
+            assert torch.allclose(dst_boxes, reconstructed)
+
+
+def random_rotated_boxes(mean_box, std_length, std_angle, N):
+    """
+    Return an (N, 5) float tensor: `mean_box` plus uniform noise, drawn from [0, std_length)
+    for the first four columns and from [0, std_angle) for the last one.
+    """
+    # The length noise is drawn before the angle noise.
+    length_noise = torch.rand(N, 4) * std_length
+    angle_noise = torch.rand(N, 1) * std_angle
+    noise = torch.cat([length_noise, angle_noise], dim=1)
+    return noise + torch.tensor(mean_box, dtype=torch.float)
+
+
+class TestBox2BoxTransformRotated(unittest.TestCase):
+    """Round-trip test for :class:`Box2BoxTransformRotated`."""
+
+    def test_reconstruction(self):
+        """Applying the deltas between two box sets to the source boxes restores the targets."""
+        transform = Box2BoxTransformRotated(weights=(5, 5, 10, 10, 1))
+        src_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10)
+        dst_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10)
+
+        # Always run on CPU, and additionally on CUDA when it is available.
+        devices = [torch.device("cpu")]
+        if torch.cuda.is_available():
+            devices += [torch.device("cuda")]
+
+        for device in devices:
+            src_boxes = src_boxes.to(device=device)
+            dst_boxes = dst_boxes.to(device=device)
+            deltas = transform.get_deltas(src_boxes, dst_boxes)
+            reconstructed = transform.apply_deltas(deltas, src_boxes)
+            assert torch.allclose(dst_boxes, reconstructed, atol=1e-5)
+
+
+# Run the tests of this module when the file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_boxes.py b/tests/test_boxes.py
new file mode 100644
index 0000000..7694fa1
--- /dev/null
+++ b/tests/test_boxes.py
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import unittest
+import torch
+
+from detectron2.structures import Boxes, BoxMode, pairwise_iou
+
+
+class TestBoxMode(unittest.TestCase):
+    """Checks ``BoxMode.convert`` from XYXY_ABS to XYWH_ABS for several input types."""
+
+    def _convert_xy_to_wh(self, x):
+        """Convert ``x`` from ``BoxMode.XYXY_ABS`` to ``BoxMode.XYWH_ABS``."""
+        return BoxMode.convert(x, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+
+    def test_box_convert_list(self):
+        """A single box given as a list or tuple converts to an equal list or tuple."""
+        for tp in [list, tuple]:
+            with self.subTest(container=tp.__name__):
+                box = tp([5, 5, 10, 10])
+                output = self._convert_xy_to_wh(box)
+                # assertEqual prints both operands on failure, unlike assertTrue(a == b).
+                self.assertEqual(output, tp([5, 5, 5, 5]))
+
+                # Wrapping the box in another list is expected to be rejected.
+                with self.assertRaises(Exception):
+                    self._convert_xy_to_wh([box])
+
+    def test_box_convert_array(self):
+        """A 2x4 numpy array is converted row by row."""
+        box = np.asarray([[5, 5, 10, 10], [1, 1, 2, 3]])
+        output = self._convert_xy_to_wh(box)
+        self.assertTrue((output[0] == [5, 5, 5, 5]).all())
+        self.assertTrue((output[1] == [1, 1, 1, 2]).all())
+
+    def test_box_convert_tensor(self):
+        """A 2x4 torch tensor is converted row by row."""
+        box = torch.tensor([[5, 5, 10, 10], [1, 1, 2, 3]])
+        output = self._convert_xy_to_wh(box).numpy()
+        self.assertTrue((output[0] == [5, 5, 5, 5]).all())
+        self.assertTrue((output[1] == [1, 1, 1, 2]).all())
+
+
+class TestBoxIOU(unittest.TestCase):
+    """Checks ``pairwise_iou`` against hand-computed values."""
+
+    def test_pairwise_iou(self):
+        """Two identical unit boxes are compared against six boxes with known overlap."""
+        boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
+
+        boxes2 = torch.tensor(
+            [
+                [0.0, 0.0, 1.0, 1.0],
+                [0.0, 0.0, 0.5, 1.0],
+                [0.0, 0.0, 1.0, 0.5],
+                [0.0, 0.0, 0.5, 0.5],
+                [0.5, 0.5, 1.0, 1.0],
+                [0.5, 0.5, 1.5, 1.5],
+            ]
+        )
+
+        # Last column: intersection 0.25, union 1 + 1 - 0.25.
+        expected_row = [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)]
+        # Both rows of boxes1 are the same box, so both expected rows are equal.
+        expected_ious = torch.tensor([expected_row, expected_row])
+
+        ious = pairwise_iou(Boxes(boxes1), Boxes(boxes2))
+
+        # A unittest assertion (not a bare ``assert``) so the check is still
+        # executed when Python runs with -O.
+        self.assertTrue(torch.allclose(ious, expected_ious))
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py
new file mode 100644
index 0000000..725b488
--- /dev/null
+++ b/tests/test_checkpoint.py
@@ -0,0 +1,48 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import unittest
+from collections import OrderedDict
+import torch
+from torch import nn
+
+from detectron2.checkpoint.c2_model_loading import align_and_update_state_dicts
+from detectron2.utils.logger import setup_logger
+
+
+class TestCheckpointer(unittest.TestCase):
+    """Checks ``align_and_update_state_dicts`` on a small nested model."""
+
+    def setUp(self):
+        setup_logger()
+
+    def create_complex_model(self):
+        """
+        Build a nested module together with a checkpoint-like state dict.
+
+        The state dict keys do not all match the module's parameter names:
+        the module holds ``block1.layer1`` while the state dict stores
+        ``layer1.*`` without the ``block1`` prefix.
+
+        Returns:
+            (nn.Module, OrderedDict): the model and the state dict to load.
+        """
+        m = nn.Module()
+        m.block1 = nn.Module()
+        m.block1.layer1 = nn.Linear(2, 3)
+        m.layer2 = nn.Linear(3, 2)
+        m.res = nn.Module()
+        m.res.layer2 = nn.Linear(3, 2)
+
+        state_dict = OrderedDict()
+        state_dict["layer1.weight"] = torch.rand(3, 2)
+        state_dict["layer1.bias"] = torch.rand(3)
+        state_dict["layer2.weight"] = torch.rand(2, 3)
+        state_dict["layer2.bias"] = torch.rand(2)
+        state_dict["res.layer2.weight"] = torch.rand(2, 3)
+        state_dict["res.layer2.bias"] = torch.rand(2)
+        return m, state_dict
+
+    def test_complex_model_loaded(self):
+        """After alignment every model tensor equals, but is not, the stored tensor.
+
+        Checked for the plain model and for the model wrapped in ``nn.DataParallel``.
+        """
+        for add_data_parallel in [False, True]:
+            with self.subTest(add_data_parallel=add_data_parallel):
+                model, state_dict = self.create_complex_model()
+                if add_data_parallel:
+                    model = nn.DataParallel(model)
+                model_sd = model.state_dict()
+
+                align_and_update_state_dicts(model_sd, state_dict)
+                # Both dicts are walked in insertion order, which pairs each model
+                # entry with the stored entry created for it above.
+                for loaded, stored in zip(model_sd.values(), state_dict.values()):
+                    # different tensor references
+                    self.assertIsNot(loaded, stored)
+                    # same content
+                    self.assertTrue(loaded.equal(stored))
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 0000000..d79f07c
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+
+import os
+import tempfile
+import unittest
+
+from detectron2.config import downgrade_config, get_cfg, upgrade_config
+
+# YAML snippet explicitly marked ``VERSION: 0``. It sets ``MODEL.RPN_HEAD.NAME``,
+# which ``test_auto_upgrade`` expects to read back as ``MODEL.RPN.HEAD_NAME``
+# after merging.
+_V0_CFG = """
+MODEL:
+  RPN_HEAD:
+    NAME: "TEST"
+VERSION: 0
+"""
+
+# YAML snippet without a ``VERSION`` key; merged by ``test_guess_v1``.
+_V1_CFG = """
+MODEL:
+  WEIGHT: "/path/to/weight"
+"""
+
+
+class TestConfigVersioning(unittest.TestCase):
+    """Checks upgrading/downgrading of configs between versions."""
+
+    def test_upgrade_downgrade_consistency(self):
+        """Downgrading to version 0 and upgrading again must give back the same config."""
+        cfg = get_cfg()
+        # check that custom is preserved
+        cfg.USER_CUSTOM = 1
+
+        down = downgrade_config(cfg, to_version=0)
+        up = upgrade_config(down)
+        self.assertEqual(up, cfg)
+
+    def _merge_cfg_str(self, cfg, merge_str):
+        """
+        Write ``merge_str`` to a temporary ``.yaml`` file and merge that file into ``cfg``.
+
+        Args:
+            cfg: config object providing ``merge_from_file``; modified in place.
+            merge_str (str): YAML text to merge.
+        Returns:
+            The same ``cfg`` object.
+        """
+        # delete=False: the file is read back by name after it has been closed,
+        # and is removed explicitly in the ``finally`` clause.
+        f = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
+        try:
+            # The ``with`` block closes the handle even if the write fails, so
+            # the file is never removed while it is still open.
+            with f:
+                f.write(merge_str)
+            cfg.merge_from_file(f.name)
+        finally:
+            os.remove(f.name)
+        return cfg
+
+    def test_auto_upgrade(self):
+        """Merging a ``VERSION: 0`` file renames the key and keeps the latest version."""
+        cfg = get_cfg()
+        latest_ver = cfg.VERSION
+        cfg.USER_CUSTOM = 1
+
+        self._merge_cfg_str(cfg, _V0_CFG)
+
+        self.assertEqual(cfg.MODEL.RPN.HEAD_NAME, "TEST")
+        self.assertEqual(cfg.VERSION, latest_ver)
+
+    def test_guess_v1(self):
+        """Merging a file without a ``VERSION`` key leaves ``cfg.VERSION`` unchanged."""
+        cfg = get_cfg()
+        latest_ver = cfg.VERSION
+        self._merge_cfg_str(cfg, _V1_CFG)
+        self.assertEqual(cfg.VERSION, latest_ver)
diff --git a/tests/test_data_transform.py b/tests/test_data_transform.py
new file mode 100644
index 0000000..1f910e3
--- /dev/null
+++ b/tests/test_data_transform.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import logging
+import numpy as np
+import unittest
+
+from detectron2.config import get_cfg
+from detectron2.data import detection_utils
+from detectron2.data import transforms as T
+from detectron2.utils.logger import setup_logger
+
+logger = logging.getLogger(__name__)
+
+
+class TestTransforms(unittest.TestCase):
+    """Tests for applying transforms to rotated boxes and for transform ``str()`` output."""
+
+    def setUp(self):
+        setup_logger()
+
+    def test_apply_rotated_boxes(self):
+        """Apply the default training transforms to one rotated box.
+
+        The numpy seed is fixed because the expected values depend on the
+        transforms that get sampled for this seed.
+        """
+        np.random.seed(125)
+        cfg = get_cfg()
+        is_train = True
+        transform_gen = detection_utils.build_transform_gen(cfg, is_train)
+        image = np.random.rand(200, 300)
+        image, transforms = T.apply_transform_gens(transform_gen, image)
+        image_shape = image.shape[:2]  # h, w
+        # unittest assertions (not bare ``assert``) so the checks are still
+        # executed when Python runs with -O.
+        self.assertEqual(image_shape, (800, 1200))
+        annotation = {"bbox": [179, 97, 62, 40, -56]}
+
+        boxes = np.array([annotation["bbox"]], dtype=np.float64)  # boxes.shape = (1, 5)
+        transformed_bbox = transforms.apply_rotated_box(boxes)[0]
+
+        expected_bbox = np.array([484, 388, 248, 160, 56], dtype=np.float64)
+        err_msg = "transformed_bbox = {}, expected {}".format(transformed_bbox, expected_bbox)
+        self.assertTrue(np.allclose(transformed_bbox, expected_bbox), err_msg)
+
+    def test_apply_rotated_boxes_unequal_scaling_factor(self):
+        """Resize 400x200 (h, w) to 800x800, so x and y are scaled differently."""
+        np.random.seed(125)
+        h, w = 400, 200
+        newh, neww = 800, 800
+        image = np.random.rand(h, w)
+        transform_gen = [T.Resize(shape=(newh, neww))]
+        image, transforms = T.apply_transform_gens(transform_gen, image)
+        image_shape = image.shape[:2]  # h, w
+        self.assertEqual(image_shape, (newh, neww))
+
+        # Same center and size, four different angles.
+        boxes = np.array(
+            [
+                [150, 100, 40, 20, 0],
+                [150, 100, 40, 20, 30],
+                [150, 100, 40, 20, 90],
+                [150, 100, 40, 20, -90],
+            ],
+            dtype=np.float64,
+        )
+        transformed_boxes = transforms.apply_rotated_box(boxes)
+
+        expected_bboxes = np.array(
+            [
+                [600, 200, 160, 40, 0],
+                [600, 200, 144.22205102, 52.91502622, 49.10660535],
+                [600, 200, 80, 80, 90],
+                [600, 200, 80, 80, -90],
+            ],
+            dtype=np.float64,
+        )
+        err_msg = "transformed_boxes = {}, expected {}".format(transformed_boxes, expected_bboxes)
+        self.assertTrue(np.allclose(transformed_boxes, expected_bboxes), err_msg)
+
+    def test_print_transform_gen(self):
+        """``str()`` of a transform generator lists only the arguments that were passed."""
+        # assertEqual prints both strings on failure, unlike assertTrue(a == b).
+        t = T.RandomCrop("relative", (100, 100))
+        self.assertEqual(str(t), "RandomCrop(crop_type='relative', crop_size=(100, 100))")
+
+        t = T.RandomFlip(prob=0.5)
+        self.assertEqual(str(t), "RandomFlip(prob=0.5)")
+
+        t = T.RandomFlip()
+        self.assertEqual(str(t), "RandomFlip()")
diff --git a/tests/test_fast_rcnn.py b/tests/test_fast_rcnn.py
new file mode 100644
index 0000000..d4f841a
--- /dev/null
+++ b/tests/test_fast_rcnn.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated
+from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs
+from detectron2.modeling.roi_heads.rotated_fast_rcnn import RotatedFastRCNNOutputs
+from detectron2.structures import Boxes, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)
+
+
+class FastRCNNTest(unittest.TestCase):
+    """Regression tests pinning Fast R-CNN losses for a fixed random seed."""
+
+    def _assert_losses_close(self, losses, expected_losses):
+        """Check every expected loss against ``losses`` with ``torch.allclose``."""
+        for name, expected in expected_losses.items():
+            # A unittest assertion (not a bare ``assert``) so the check is still
+            # executed when Python runs with -O; the message shows both values.
+            self.assertTrue(
+                torch.allclose(losses[name], expected),
+                "{}: got {}, expected {}".format(name, losses[name], expected),
+            )
+
+    def test_fast_rcnn(self):
+        """Losses of ``FastRCNNOutputs`` for two horizontal proposals.
+
+        The expected values depend on the seed and on the order of the
+        random-number-consuming calls below, so that order must not change.
+        """
+        torch.manual_seed(132)
+        cfg = get_cfg()
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5)
+        box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
+
+        box_head_output_size = 8
+        num_classes = 5
+        cls_agnostic_bbox_reg = False
+
+        box_predictor = FastRCNNOutputLayers(
+            box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=4
+        )
+        feature_pooled = torch.rand(2, box_head_output_size)
+        pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled)
+        image_shape = (10, 10)
+        proposal_boxes = torch.tensor([[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32)
+        gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        result = Instances(image_shape)
+        result.proposal_boxes = Boxes(proposal_boxes)
+        result.gt_boxes = Boxes(gt_boxes)
+        result.gt_classes = torch.tensor([1, 2])
+        proposals = [result]
+        smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
+
+        outputs = FastRCNNOutputs(
+            box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta
+        )
+        with EventStorage():  # capture events in a new storage to discard them
+            losses = outputs.losses()
+
+        expected_losses = {
+            "loss_cls": torch.tensor(1.7951188087),
+            "loss_box_reg": torch.tensor(4.0357131958),
+        }
+        self._assert_losses_close(losses, expected_losses)
+
+    def test_fast_rcnn_rotated(self):
+        """Losses of ``RotatedFastRCNNOutputs`` for two rotated proposals with angle 0.
+
+        The expected values depend on the seed and on the order of the
+        random-number-consuming calls below, so that order must not change.
+        """
+        torch.manual_seed(132)
+        cfg = get_cfg()
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1)
+        box2box_transform = Box2BoxTransformRotated(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
+
+        box_head_output_size = 8
+        num_classes = 5
+        cls_agnostic_bbox_reg = False
+
+        box_predictor = FastRCNNOutputLayers(
+            box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=5
+        )
+        feature_pooled = torch.rand(2, box_head_output_size)
+        pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled)
+        image_shape = (10, 10)
+        proposal_boxes = torch.tensor(
+            [[2, 1.95, 2.4, 1.7, 0], [4.65, 5.25, 4.7, 5.5, 0]], dtype=torch.float32
+        )
+        gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32)
+        result = Instances(image_shape)
+        result.proposal_boxes = RotatedBoxes(proposal_boxes)
+        result.gt_boxes = RotatedBoxes(gt_boxes)
+        result.gt_classes = torch.tensor([1, 2])
+        proposals = [result]
+        smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
+
+        outputs = RotatedFastRCNNOutputs(
+            box2box_transform, pred_class_logits, pred_proposal_deltas, proposals, smooth_l1_beta
+        )
+        with EventStorage():  # capture events in a new storage to discard them
+            losses = outputs.losses()
+
+        # Note: the expected losses are slightly different even if
+        # the boxes are essentially the same as in the FastRCNNOutput test, because
+        # bbox_pred in FastRCNNOutputLayers have different Linear layers/initialization
+        # between the two cases.
+        expected_losses = {
+            "loss_cls": torch.tensor(1.7920907736),
+            "loss_box_reg": torch.tensor(4.0410838127),
+        }
+        self._assert_losses_close(losses, expected_losses)
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_mask_ops.py b/tests/test_mask_ops.py
new file mode 100644
index 0000000..aca572c
--- /dev/null
+++ b/tests/test_mask_ops.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import contextlib
+import io
+import numpy as np
+import os
+import unittest
+from collections import defaultdict
+import torch
+import tqdm
+from fvcore.common.benchmark import benchmark
+from pycocotools.coco import COCO
+from tabulate import tabulate
+from torch.nn import functional as F
+
+from detectron2.data import MetadataCatalog
+from detectron2.layers.mask_ops import (
+    pad_masks,
+    paste_mask_in_image_old,
+    paste_masks_in_image,
+    scale_boxes,
+)
+from detectron2.structures import BitMasks, Boxes, BoxMode, PolygonMasks
+from detectron2.structures.masks import polygons_to_bitmask
+
+
+def iou_between_full_image_bit_masks(a, b):
+    """
+    Intersection-over-union of two masks combined element-wise.
+
+    Args:
+        a, b: masks supporting ``&``, ``|`` and ``.sum()``.
+    Returns:
+        Sum of ``a & b`` divided by sum of ``a | b``.
+    """
+    overlap_pixels = (a & b).sum()
+    covered_pixels = (a | b).sum()
+    return overlap_pixels / covered_pixels
+
+
+def rasterize_polygons_with_grid_sample(full_image_bit_mask, box, mask_size, threshold=0.5):
+    """
+    Crop ``box`` out of a full-image mask and resample it with ``F.grid_sample``.
+
+    Args:
+        full_image_bit_mask: 2D numpy array (img_h, img_w).
+        box: indexable with four entries ``x0, y0, x1, y1``.
+        mask_size (int): side length of the square output mask.
+        threshold (float): sampled values ``>= threshold`` become True.
+    Returns:
+        A boolean tensor of shape (mask_size, mask_size).
+    """
+    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
+    img_h, img_w = full_image_bit_mask.shape
+
+    def normalized_coords(start, end, image_extent):
+        # Sample positions at the bin centers 0.5, 1.5, ..., mask_size - 0.5 ...
+        coords = np.arange(0.0, mask_size) + 0.5
+        # ... stretched over [start, end] in image coordinates ...
+        coords = (coords) / (mask_size) * (end - start) + start
+        # ... then rescaled so that pixel centers 0 and image_extent - 1 land on
+        # -1 and +1, as grid_sample expects with align_corners=True.
+        return (coords - 0.5) / (image_extent - 1) * 2 + -1
+
+    rows = normalized_coords(y0, y1, img_h)
+    cols = normalized_coords(x0, x1, img_w)
+    gy, gx = torch.meshgrid(torch.from_numpy(rows), torch.from_numpy(cols))
+    # grid_sample wants (x, y) pairs in the last dimension.
+    grid = torch.stack([gx, gy], dim=-1).to(dtype=torch.float32)
+
+    image = torch.from_numpy(full_image_bit_mask)[None, None, :, :].to(dtype=torch.float32)
+    sampled = F.grid_sample(image, grid[None, :, :, :], align_corners=True)
+
+    return sampled[0, 0] >= threshold
+
+
+class TestMaskCropPaste(unittest.TestCase):
+    """Consistency checks between mask rasterization and mask pasting."""
+
+    def setUp(self):
+        """Load the ``coco_2017_val_100`` annotations, or skip if the file is missing."""
+        json_file = MetadataCatalog.get("coco_2017_val_100").json_file
+        if not os.path.isfile(json_file):
+            raise unittest.SkipTest("{} not found".format(json_file))
+        # COCO() prints progress messages; keep them out of the test output.
+        with contextlib.redirect_stdout(io.StringIO()):
+            self.coco = COCO(json_file)
+
+    def test_crop_paste_consistency(self):
+        """
+        rasterize_polygons_within_box (used in training)
+        and
+        paste_masks_in_image (used in inference)
+        should be inverse operations to each other.
+
+        This function runs several implementation of the above two operations and prints
+        the reconstruction error.
+        """
+
+        anns = self.coco.loadAnns(self.coco.getAnnIds(iscrowd=False))  # avoid crowd annotations
+
+        selected_anns = anns[:100]
+
+        per_ann_tables = [self.process_annotation(ann) for ann in tqdm.tqdm(selected_anns)]
+        # Fail with a clear message instead of a NameError/IndexError further down.
+        self.assertTrue(per_ann_tables, "no non-crowd annotations to test with")
+
+        ious = np.array([[row[2] for row in table] for table in per_ann_tables])
+        mean_ious = ious.mean(axis=0)
+
+        # process_annotation emits the (rasterize, paste) combinations in a fixed
+        # order, so the labels of any one table describe the columns of ``ious``.
+        labels = [(row[0], row[1]) for row in per_ann_tables[-1]]
+        table = []
+        res_dic = defaultdict(dict)
+        for (rasterize_method, paste_method), iou in zip(labels, mean_ious):
+            table.append((rasterize_method, paste_method, iou))
+            res_dic[rasterize_method][paste_method] = iou
+        print(tabulate(table, headers=["rasterize", "paste", "iou"], tablefmt="simple"))
+        # assert that the reconstruction is good:
+        self.assertGreater(res_dic["polygon"]["aligned"], 0.94)
+        self.assertGreater(res_dic["roialign"]["aligned"], 0.95)
+
+    def process_annotation(self, ann, mask_side_len=28):
+        """
+        Rasterize one annotation's mask inside its box, paste it back, and score the result.
+
+        Args:
+            ann (dict): COCO annotation with "image_id", "segmentation" and "bbox".
+            mask_side_len (int): side length of the in-box mask.
+        Returns:
+            list[tuple]: one ``(rasterize_method, paste_method, iou)`` entry per
+            combination of the three rasterizers and two paste functions, where
+            ``iou`` compares the pasted mask with the full-image ground truth mask.
+        """
+        # Parse annotation data
+        img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0]
+        height, width = img_info["height"], img_info["width"]
+        gt_polygons = [np.array(p, dtype=np.float64) for p in ann["segmentation"]]
+        gt_bbox = BoxMode.convert(np.array(ann["bbox"]), BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+        gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width)
+
+        # Run rasterize ..
+        torch_gt_bbox = torch.from_numpy(gt_bbox[None, :]).to(dtype=torch.float32)
+        box_bitmasks = {
+            "polygon": PolygonMasks([gt_polygons]).crop_and_resize(torch_gt_bbox, mask_side_len)[0],
+            "gridsample": rasterize_polygons_with_grid_sample(gt_bit_mask, gt_bbox, mask_side_len),
+            "roialign": BitMasks(torch.from_numpy(gt_bit_mask[None, :, :])).crop_and_resize(
+                torch_gt_bbox, mask_side_len
+            )[0],
+        }
+
+        # Run paste ..
+        results = defaultdict(dict)
+        for k, box_bitmask in box_bitmasks.items():
+            padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1)
+            scaled_boxes = scale_boxes(torch_gt_bbox, scale)
+
+            r = results[k]
+            r["old"] = paste_mask_in_image_old(
+                padded_bitmask[0], scaled_boxes[0], height, width, threshold=0.5
+            )
+            r["aligned"] = paste_masks_in_image(
+                box_bitmask[None, :, :], Boxes(gt_bbox[None, :]), (height, width)
+            )[0]
+
+        table = []
+        for rasterize_method, r in results.items():
+            for paste_method, mask in r.items():
+                mask = np.asarray(mask)
+                iou = iou_between_full_image_bit_masks(gt_bit_mask.astype("uint8"), mask)
+                table.append((rasterize_method, paste_method, iou))
+        return table
+
+    def test_polygon_area(self):
+        """``PolygonMasks.area`` of axis-aligned squares and right triangles."""
+        # Draw polygon boxes
+        for d in [5.0, 10.0, 1000.0]:
+            polygon = PolygonMasks([[[0, 0, 0, d, d, d, d, 0]]])
+            area = polygon.area()[0]
+            target = d ** 2
+            self.assertEqual(area, target)
+
+        # Draw polygon triangles
+        for d in [5.0, 10.0, 1000.0]:
+            polygon = PolygonMasks([[[0, 0, 0, d, d, d]]])
+            area = polygon.area()[0]
+            target = d ** 2 / 2
+            self.assertEqual(area, target)
+
+
+def benchmark_paste():
+    """
+    Benchmark ``paste_masks_in_image`` on 64 random 28x28 masks in an 800x800 image.
+
+    Runs on CPU and, when ``torch.cuda.is_available()``, also on CUDA.
+    """
+    S = 800
+    image_shape = (S, S)
+    N = 64
+    torch.manual_seed(42)
+    masks = torch.rand(N, 28, 28)
+
+    # Random boxes: centers in [100, 700), sizes around 200 (at least 50),
+    # with corners clamped to the image.
+    center = torch.rand(N, 2) * 600 + 100
+    wh = torch.clamp(torch.randn(N, 2) * 40 + 200, min=50)
+    x0y0 = torch.clamp(center - wh * 0.5, min=0.0)
+    x1y1 = torch.clamp(center + wh * 0.5, max=S)
+    # ``dim`` is the documented keyword of torch.cat.
+    boxes = Boxes(torch.cat([x0y0, x1y1], dim=1))
+
+    def func(device, n=3):
+        # Move the inputs once, outside the timed closure.
+        m = masks.to(device=device)
+        b = boxes.to(device=device)
+
+        def bench():
+            for _ in range(n):
+                paste_masks_in_image(m, b, image_shape)
+            if device.type == "cuda":
+                # Wait for queued CUDA work so that it is included in the timing.
+                torch.cuda.synchronize()
+
+        return bench
+
+    specs = [{"device": torch.device("cpu"), "n": 3}]
+    if torch.cuda.is_available():
+        specs.append({"device": torch.device("cuda"), "n": 3})
+
+    benchmark(func, "paste_masks", specs, num_iters=10, warmup_iters=2)
+
+
+# When executed directly, run the paste benchmark first and then the unit tests.
+if __name__ == "__main__":
+    benchmark_paste()
+    unittest.main()
diff --git a/tests/test_model_zoo.py b/tests/test_model_zoo.py
new file mode 100644
index 0000000..e550e08
--- /dev/null
+++ b/tests/test_model_zoo.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+
+from detectron2 import model_zoo
+from detectron2.modeling import FPN, GeneralizedRCNN
+
+logger = logging.getLogger(__name__)
+
+
+class TestModelZoo(unittest.TestCase):
+    """Tests for ``model_zoo.get``."""
+
+    def test_get_returns_model(self):
+        """An untrained model built from a known config has the expected classes."""
+        model = model_zoo.get("Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml", trained=False)
+        # unittest assertions (not bare ``assert``) so the checks are still
+        # executed when Python runs with -O.
+        self.assertIsInstance(model, GeneralizedRCNN)
+        self.assertIsInstance(model.backbone, FPN)
+
+    def test_get_invalid_model(self):
+        """An unknown config path is expected to raise ``RuntimeError``."""
+        with self.assertRaises(RuntimeError):
+            model_zoo.get("Invalid/config.yaml")
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_nms_rotated.py b/tests/test_nms_rotated.py
new file mode 100644
index 0000000..7e3e944
--- /dev/null
+++ b/tests/test_nms_rotated.py
@@ -0,0 +1,159 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+import unittest
+import torch
+from torchvision import ops
+
+from detectron2.layers import batched_nms, batched_nms_rotated, nms_rotated
+
+
+class TestNMSRotated(unittest.TestCase):
+    """Compares rotated NMS with horizontal NMS on boxes rotated by multiples of 90 degrees."""
+
+    def reference_horizontal_nms(self, boxes, scores, iou_threshold):
+        """
+        Greedy NMS reference implementation built on ``torchvision.ops.box_iou``.
+
+        Args:
+            boxes (N, 4): horizontal boxes in corner form.
+            scores (N): one score per box.
+            iou_threshold: a box is dropped when its IoU with an already picked
+                box is above this value.
+        Returns:
+             picked: a tensor with the indexes of the kept boxes, highest score first
+        """
+        picked = []
+        _, indexes = scores.sort(descending=True)
+        while len(indexes) > 0:
+            current = indexes[0]
+            picked.append(current.item())
+            if len(indexes) == 1:
+                break
+            current_box = boxes[current, :]
+            indexes = indexes[1:]
+            rest_boxes = boxes[indexes, :]
+            iou = ops.box_iou(rest_boxes, current_box.unsqueeze(0)).squeeze(1)
+            indexes = indexes[iou <= iou_threshold]
+
+        return torch.as_tensor(picked)
+
+    def _create_tensors(self, N):
+        """Return ``N`` random corner-form boxes with non-negative sizes, and ``N`` scores."""
+        boxes = torch.rand(N, 4) * 100
+        # Note: the implementation of this function in torchvision is:
+        # boxes[:, 2:] += torch.rand(N, 2) * 100
+        # but it does not guarantee non-negative widths/heights constraints:
+        # boxes[:, 2] >= boxes[:, 0] and boxes[:, 3] >= boxes[:, 1]:
+        boxes[:, 2:] += boxes[:, :2]
+        scores = torch.rand(N)
+        return boxes, scores
+
+    def _to_rotated_boxes(self, boxes, angle=0.0, swap_wh=False):
+        """
+        Convert corner-form boxes (N, 4) to (center_x, center_y, width, height, angle) rows.
+
+        Args:
+            boxes (N, 4): boxes as ``x0, y0, x1, y1``.
+            angle: value written to the fifth column of every row.
+            swap_wh: store the box height in the width column and vice versa.
+        Returns:
+            A tensor of shape (N, 5).
+        """
+        widths = boxes[:, 2] - boxes[:, 0]
+        heights = boxes[:, 3] - boxes[:, 1]
+        if swap_wh:
+            widths, heights = heights, widths
+        rotated_boxes = torch.zeros(boxes.shape[0], 5)
+        rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0
+        rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0
+        rotated_boxes[:, 2] = widths
+        rotated_boxes[:, 3] = heights
+        rotated_boxes[:, 4] = angle
+        return rotated_boxes
+
+    def _check_batched_nms_rotated_0_degree(self, device):
+        """Batched rotated NMS at angle 0 must keep the same boxes as ``batched_nms``.
+
+        Boxes and scores are moved to ``device``; ``idxs`` is passed as created.
+        Also checks that neither function modifies the original box tensors.
+        """
+        N = 2000
+        num_classes = 50
+        boxes, scores = self._create_tensors(N)
+        idxs = torch.randint(0, num_classes, (N,))
+        rotated_boxes = self._to_rotated_boxes(boxes)
+        err_msg = "Rotated NMS with 0 degree is incompatible with horizontal NMS for IoU={}"
+        for iou in [0.2, 0.5, 0.8]:
+            backup = boxes.clone()
+            keep_ref = batched_nms(boxes.to(device), scores.to(device), idxs, iou)
+            self.assertTrue(torch.allclose(boxes, backup), "boxes modified by batched_nms")
+            backup = rotated_boxes.clone()
+            keep = batched_nms_rotated(rotated_boxes.to(device), scores.to(device), idxs, iou)
+            self.assertTrue(
+                torch.allclose(rotated_boxes, backup),
+                "rotated_boxes modified by batched_nms_rotated",
+            )
+            self.assertTrue(torch.equal(keep, keep_ref), err_msg.format(iou))
+
+    def _check_nms_rotated_against_reference(self, angle, swap_wh, iou_thresholds):
+        """CPU ``nms_rotated`` must keep the same boxes as ``reference_horizontal_nms``."""
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = self._to_rotated_boxes(boxes, angle=angle, swap_wh=swap_wh)
+        err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}"
+        for iou in iou_thresholds:
+            keep_ref = self.reference_horizontal_nms(boxes, scores, iou)
+            keep = nms_rotated(rotated_boxes, scores, iou)
+            self.assertTrue(torch.equal(keep, keep_ref), err_msg.format(iou))
+
+    def test_batched_nms_rotated_0_degree_cpu(self):
+        self._check_batched_nms_rotated_0_degree(torch.device("cpu"))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_batched_nms_rotated_0_degree_cuda(self):
+        self._check_batched_nms_rotated_0_degree(torch.device("cuda"))
+
+    def test_nms_rotated_0_degree_cpu(self):
+        self._check_nms_rotated_against_reference(angle=0, swap_wh=False, iou_thresholds=[0.5])
+
+    def test_nms_rotated_90_degrees_cpu(self):
+        # Note: widths and heights are intentionally swapped here for 90 degrees case
+        # so that the reference horizontal nms could be used
+        self._check_nms_rotated_against_reference(
+            angle=90, swap_wh=True, iou_thresholds=[0.2, 0.5, 0.8]
+        )
+
+    def test_nms_rotated_180_degrees_cpu(self):
+        self._check_nms_rotated_against_reference(
+            angle=180, swap_wh=False, iou_thresholds=[0.2, 0.5, 0.8]
+        )
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_nms_rotated_0_degree_cuda(self):
+        """``nms_rotated`` must keep the same boxes on CPU and on CUDA."""
+        N = 1000
+        boxes, scores = self._create_tensors(N)
+        rotated_boxes = self._to_rotated_boxes(boxes)
+        err_msg = "Rotated NMS incompatible between CPU and CUDA for IoU={}"
+
+        for iou in [0.2, 0.5, 0.8]:
+            r_cpu = nms_rotated(rotated_boxes, scores, iou)
+            r_cuda = nms_rotated(rotated_boxes.cuda(), scores.cuda(), iou)
+
+            self.assertTrue(torch.equal(r_cpu, r_cuda.cpu()), err_msg.format(iou))
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_roi_align.py b/tests/test_roi_align.py
new file mode 100644
index 0000000..a745678
--- /dev/null
+++ b/tests/test_roi_align.py
@@ -0,0 +1,86 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import numpy as np
+import unittest
+import cv2
+import torch
+
+from detectron2.layers.roi_align import ROIAlign
+
+
+class ROIAlignTest(unittest.TestCase):
+    """Tests for the ``ROIAlign`` layer with and without the ``aligned`` correction."""
+
+    def test_forward_output(self):
+        """Pool box [1, 1, 3, 3] of a 5x5 ramp image to 4x4 with both ``aligned`` settings."""
+        image = np.arange(25).reshape(5, 5).astype("float32")
+        # The image is:
+        #  0  1  2  3  4
+        #  5  6  7  8  9
+        # 10 11 12 13 14
+        # 15 16 17 18 19
+        # 20 21 22 23 24
+
+        output = self._simple_roialign(image, [1, 1, 3, 3], (4, 4), aligned=False)
+        output_correct = self._simple_roialign(image, [1, 1, 3, 3], (4, 4), aligned=True)
+
+        # without correction:
+        old_results = [
+            [7.5, 8, 8.5, 9],
+            [10, 10.5, 11, 11.5],
+            [12.5, 13, 13.5, 14],
+            [15, 15.5, 16, 16.5],
+        ]
+
+        # with 0.5 correction:
+        correct_results = [
+            [4.5, 5.0, 5.5, 6.0],
+            [7.0, 7.5, 8.0, 8.5],
+            [9.5, 10.0, 10.5, 11.0],
+            [12.0, 12.5, 13.0, 13.5],
+        ]
+        # This is an upsampled version of [[6, 7], [11, 12]]
+
+        self.assertTrue(np.allclose(output.flatten(), np.asarray(old_results).flatten()))
+        self.assertTrue(
+            np.allclose(output_correct.flatten(), np.asarray(correct_results).flatten())
+        )
+
+        # Also see similar issues in tensorflow at
+        # https://github.com/tensorflow/tensorflow/issues/26278
+
+    def test_resize(self):
+        """Aligned ROIAlign gives the same output on an image and on its half-size resize."""
+        H, W = 30, 30
+        image = np.random.rand(H, W).astype("float32") * 100
+        box = [10, 10, 20, 20]
+        output = self._simple_roialign(image, box, (5, 5), aligned=True)
+
+        # Half-resolution image with the box coordinates halved accordingly.
+        image_half = cv2.resize(image, (W // 2, H // 2), interpolation=cv2.INTER_LINEAR)
+        box_half = [x / 2 for x in box]
+        output_half = self._simple_roialign(image_half, box_half, (5, 5), aligned=True)
+        diff = np.abs(output_half - output)
+        # assertLess prints both values on failure, unlike assertTrue(a < b).
+        self.assertLess(diff.max(), 1e-4)
+
+    def _simple_roialign(self, img, box, resolution, aligned=True):
+        """
+        RoiAlign with scale 1.0 and 0 sample ratio.
+
+        Args:
+            img: 2D numpy array (H, W).
+            box: four box coordinates; batch index 0 is prepended to form the roi.
+            resolution: output size, an int or an (h, w) tuple.
+            aligned (bool): forwarded to ``ROIAlign``.
+        Returns:
+            2D numpy array with the pooled output for the single box.
+
+        When CUDA is available the CUDA result is also computed and checked
+        against the CPU result.
+        """
+        if isinstance(resolution, int):
+            resolution = (resolution, resolution)
+        op = ROIAlign(resolution, 1.0, 0, aligned=aligned)
+        # Named ``features`` rather than ``input`` to avoid shadowing the builtin.
+        features = torch.from_numpy(img[None, None, :, :].astype("float32"))
+
+        rois = [0] + list(box)
+        rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
+        output = op.forward(features, rois).numpy()
+        if torch.cuda.is_available():
+            output_cuda = op.forward(features.cuda(), rois.cuda()).cpu().numpy()
+            self.assertTrue(np.allclose(output, output_cuda))
+        return output[0, 0]
+
+    def test_empty_box(self):
+        """A box with zero height (y0 == y1) produces an all-zero output."""
+        img = np.random.rand(5, 5)
+        box = [3, 4, 5, 4]
+        o = self._simple_roialign(img, box, 7)
+        self.assertTrue((o == 0).all())
+
+
+# Run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_roi_align_rotated.py b/tests/test_roi_align_rotated.py
new file mode 100644
index 0000000..d790067
--- /dev/null
+++ b/tests/test_roi_align_rotated.py
@@ -0,0 +1,176 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import cv2
+import torch
+from torch.autograd import Variable, gradcheck
+
+from detectron2.layers.roi_align import ROIAlign
+from detectron2.layers.roi_align_rotated import ROIAlignRotated
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class ROIAlignRotatedTest(unittest.TestCase):
+    def _box_to_rotated_box(self, box, angle):
+        # Convert (x1, y1, x2, y2) into [x_center, y_center, width, height, angle].
+        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
+        center_x = (x1 + x2) / 2.0
+        center_y = (y1 + y2) / 2.0
+        width = x2 - x1
+        height = y2 - y1
+        return [center_x, center_y, width, height, angle]
+
+    def _rot90(self, img, num):
+        # Apply `num` quarter turns; Python's modulo maps negative counts into 0..3 (-1 -> 3).
+        for _ in range(num % 4):
+            img = img.transpose(0, 1).flip(0)  # transpose + row flip = one 90-degree CCW turn
+        return img
+
+    def test_forward_output_0_90_180_270(self):
+        for quarter_turns in range(4):
+            # quarter_turns = 0, 1, 2, 3 gives box angles of 0, 90, 180, 270 degrees
+            img = torch.arange(25, dtype=torch.float32).reshape(5, 5)
+            """
+            0  1  2   3 4
+            5  6  7   8 9
+            10 11 12 13 14
+            15 16 17 18 19
+            20 21 22 23 24
+            """
+            angle = 90 * quarter_turns
+            rotated_box = self._box_to_rotated_box(box=[1, 1, 3, 3], angle=angle)
+
+            actual = self._simple_roi_align_rotated(img=img, box=rotated_box, resolution=(4, 4))
+
+            # Worked example for the 0 degree case:
+            # pixel 0 of the input is centered at (0.5, 0.5)
+            # (the middle of the cell [0, 1] x [0, 1]),
+            # pixel 1 is centered at (1.5, 0.5), and so on.
+            # A (4, 4) resolution splits the box [1, 3] x [1, 3]
+            # into 4 x 4 equal bins,
+            # so the top-left bin is [1, 1.5] x [1, 1.5]; its center
+            # (1.25, 1.25) sits 3/4 of the way
+            # from pixel 0 to pixel 1 and from pixel 5 to pixel 6 (horizontally),
+            # and from pixel 0 to pixel 5 and pixel 1 to pixel 6 (vertically), giving
+            # 0.25*(0*0.25+1*0.75)+(5*0.25+6*0.75)*0.75 = 4.5
+            upright_expected = torch.tensor(
+                [
+                    [4.5, 5.0, 5.5, 6.0],
+                    [7.0, 7.5, 8.0, 8.5],
+                    [9.5, 10.0, 10.5, 11.0],
+                    [12.0, 12.5, 13.0, 13.5],
+                ]
+            )
+            # This is also an upsampled version of [[6, 7], [11, 12]]
+
+            # Rotating the box by 90 degrees CCW rotates the pooled output
+            # by 90 degrees CW, hence the negated turn count.
+            expected = self._rot90(upright_expected, -quarter_turns)
+
+            assert torch.allclose(actual, expected)
+
+    def test_resize(self):
+        # A box pooled from an image should match the half-size box pooled from the
+        # bilinearly downsampled image.
+        height, width = 30, 30
+        image = torch.rand(height, width) * 100
+        roi = [10, 10, 20, 20]
+        full_box = self._box_to_rotated_box(roi, angle=0)
+        full_res = self._simple_roi_align_rotated(img=image, box=full_box, resolution=(5, 5))
+        small = cv2.resize(image.numpy(), (width // 2, height // 2), interpolation=cv2.INTER_LINEAR)
+        image_half = torch.from_numpy(small)
+        half_box = self._box_to_rotated_box([coord / 2 for coord in roi], angle=0)
+        half_res = self._simple_roi_align_rotated(img=image_half, box=half_box, resolution=(5, 5))
+        assert torch.allclose(half_res, full_res)
+
+    def _simple_roi_align_rotated(self, img, box, resolution):
+        """
+        Pool one rotated box from a 2D tensor with ROIAlignRotated (scale 1.0,
+        sampling ratio 0). If CUDA is available, the GPU output is asserted to
+        match the CPU output. Returns the pooled 2D tensor.
+        """
+        pool = ROIAlignRotated(output_size=resolution, spatial_scale=1.0, sampling_ratio=0)
+        feature = img[None, None, :, :]
+        roi = torch.tensor([0] + list(box), dtype=torch.float32)[None, :]
+        pooled = pool.forward(feature, roi)
+        if torch.cuda.is_available():
+            pooled_gpu = pool.forward(feature.cuda(), roi.cuda()).cpu()
+            assert torch.allclose(pooled, pooled_gpu)
+        return pooled[0, 0]
+
+    def test_empty_box(self):
+        # A rotated box with zero width and height must pool to all zeros.
+        pooled = self._simple_roi_align_rotated(torch.rand(5, 5), [2, 3, 0, 0, 0], (7, 7))
+        self.assertTrue((pooled == 0).all())
+
+    def test_roi_align_rotated_gradcheck_cpu(self):
+        # Numerically verify the CPU backward pass in double precision.
+        tensor_opts = {"dtype": torch.float64, "device": torch.device("cpu")}
+        pool = ROIAlignRotated(output_size=(5, 5), spatial_scale=0.5, sampling_ratio=1)
+        pool = pool.to(**tensor_opts)
+        feature = torch.rand(1, 1, 10, 10, requires_grad=True, **tensor_opts)
+        # Each ROI row is (batch index, x_center, y_center, width, height, angle).
+        roi_rows = [
+            [0, 4.5, 4.5, 9, 9, 0],
+            [0, 2, 7, 4, 4, 0],
+            [0, 7, 7, 4, 4, 0],
+        ]
+        rois = torch.tensor(roi_rows, **tensor_opts)
+        failure = "gradcheck failed for RoIAlignRotated CPU"
+
+        # Check the input as created and a transposed view of it.
+        for candidate in (feature, feature.transpose(2, 3)):
+            passed = gradcheck(lambda inp: pool(inp, rois), (candidate,))
+            assert passed, failure
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_roi_align_rotated_gradient_cuda(self):
+        """
+        Compute gradients for ROIAlignRotated with multiple bounding boxes on the GPU,
+        and compare the result with ROIAlign
+        """
+        # Inputs are unseeded random values; only the two gradients are compared.
+        dtype = torch.float64
+        device = torch.device("cuda")
+        pool_h, pool_w = (5, 5)
+
+        roi_align = ROIAlign(output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to(
+            device=device
+        )
+
+        roi_align_rotated = ROIAlignRotated(
+            output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2
+        ).to(device=device)
+
+        x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True)
+        # x.clone() alone would not be a leaf (grad_fn=CloneBackward); detach first.
+        x_rotated = x.detach().clone().requires_grad_(True)
+
+        # roi_rotated format is (batch index, x_center, y_center, width, height, angle)
+        rois_rotated = torch.tensor(
+            [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]],
+            dtype=dtype,
+            device=device,
+        )
+
+        y_rotated = roi_align_rotated(x_rotated, rois_rotated)
+        s_rotated = y_rotated.sum()
+        s_rotated.backward()
+
+        # roi format is (batch index, x1, y1, x2, y2); same boxes as rois_rotated above
+        rois = torch.tensor(
+            [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9]], dtype=dtype, device=device
+        )
+
+        y = roi_align(x, rois)
+        s = y.sum()
+        s.backward()
+
+        assert torch.allclose(
+            x.grad, x_rotated.grad
+        ), "gradients for ROIAlign and ROIAlignRotated mismatch on CUDA"
+
+
+if __name__ == "__main__":  # run this file's tests when it is executed as a script
+    unittest.main()
diff --git a/tests/test_roi_heads.py b/tests/test_roi_heads.py
new file mode 100644
index 0000000..70da179
--- /dev/null
+++ b/tests/test_roi_heads.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.proposal_generator.build import build_proposal_generator
+from detectron2.modeling.roi_heads import build_roi_heads
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class ROIHeadsTest(unittest.TestCase):
+    def test_roi_heads(self):
+        torch.manual_seed(121)  # fixed seed so the random inputs below are reproducible
+        cfg = get_cfg()
+        cfg.MODEL.ROI_HEADS.NAME = "StandardROIHeads"
+        cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead"
+        cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
+        cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5)
+        backbone = build_backbone(cfg)
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        features = {"res4": torch.rand(num_images, 1024, 1, 2)}  # 1024 feature channels
+
+        image_shape = (15, 15)
+        gt_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        gt_instance0 = Instances(image_shape)
+        gt_instance0.gt_boxes = Boxes(gt_boxes0)
+        gt_instance0.gt_classes = torch.tensor([2, 1])
+        gt_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32)
+        gt_instance1 = Instances(image_shape)
+        gt_instance1.gt_boxes = Boxes(gt_boxes1)
+        gt_instance1.gt_classes = torch.tensor([1, 2])
+        gt_instances = [gt_instance0, gt_instance1]
+
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        roi_heads = build_roi_heads(cfg, backbone.output_shape())
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, _ = proposal_generator(images, features, gt_instances)
+            _, detector_losses = roi_heads(images, features, proposals, gt_instances)
+
+        expected_losses = {
+            "loss_cls": torch.tensor(4.4236516953),
+            "loss_box_reg": torch.tensor(0.0091214813),
+        }
+        for name, want in expected_losses.items():
+            got = detector_losses[name]
+            msg = "detector_losses[{}] = {}, expected losses = {}".format(name, got, want)
+            assert torch.allclose(got, want), msg
+
+    def test_rroi_heads(self):
+        # Seeded check of the losses produced by RRPN proposals + RROIHeads on random inputs.
+        torch.manual_seed(121)
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
+        cfg.MODEL.ROI_HEADS.NAME = "RROIHeads"
+        cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead"
+        cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
+        cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
+        cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated"
+        cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1)
+        backbone = build_backbone(cfg)
+        num_images = 2
+        images = ImageList(torch.rand(num_images, 20, 30), [(10, 10), (20, 30)])
+        features = {"res4": torch.rand(num_images, 1024, 1, 2)}
+
+        image_shape = (15, 15)
+        # Ground truth per image: rotated box rows and the matching class ids.
+        gt_specs = [
+            ([[2, 2, 2, 2, 30], [4, 4, 4, 4, 0]], [2, 1]),
+            ([[1.5, 5.5, 1, 3, 0], [8.5, 4, 3, 2, -50]], [1, 2]),
+        ]
+        gt_instances = []
+        for box_rows, class_ids in gt_specs:
+            instance = Instances(image_shape)
+            instance.gt_boxes = RotatedBoxes(torch.tensor(box_rows, dtype=torch.float32))
+            instance.gt_classes = torch.tensor(class_ids)
+            gt_instances.append(instance)
+
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        roi_heads = build_roi_heads(cfg, backbone.output_shape())
+
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(images, features, gt_instances)
+            _, detector_losses = roi_heads(images, features, proposals, gt_instances)
+
+        expected_losses = {
+            "loss_cls": torch.tensor(4.381443977355957),
+            "loss_box_reg": torch.tensor(0.0011560433777049184),
+        }
+        for name, expected in expected_losses.items():
+            err_msg = "detector_losses[{}] = {}, expected losses = {}".format(
+                name, detector_losses[name], expected
+            )
+            assert torch.allclose(detector_losses[name], expected), err_msg
+
+
+if __name__ == "__main__":  # run this file's tests when it is executed as a script
+    unittest.main()
diff --git a/tests/test_roi_pooler.py b/tests/test_roi_pooler.py
new file mode 100644
index 0000000..127498f
--- /dev/null
+++ b/tests/test_roi_pooler.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.structures import Boxes, RotatedBoxes
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class TestROIPooler(unittest.TestCase):
+    def _rand_boxes(self, num_boxes, x_max, y_max):
+        """
+        Sample ``num_boxes`` random (x1, y1, x2, y2) boxes with x1 <= x2 and y1 <= y2;
+        x values are uniform draws scaled by x_max, y values by y_max.
+        """
+        corners = torch.rand(num_boxes, 4)
+        corners[:, 0::2] *= x_max
+        corners[:, 1::2] *= y_max
+        boxes = torch.zeros(num_boxes, 4)
+        boxes[:, :2] = torch.min(corners[:, :2], corners[:, 2:])
+        boxes[:, 2:] = torch.max(corners[:, :2], corners[:, 2:])
+        return boxes
+
+    def _test_roialignv2_roialignrotated_match(self, device):
+        # ROIAlignV2 on axis-aligned boxes and ROIAlignRotated on the same boxes written
+        # as zero-angle rotated boxes should give matching outputs on `device`.
+        pooler_resolution = 14
+        canonical_level = 4
+        canonical_scale_factor = 2 ** canonical_level
+        pooler_scales = (1.0 / canonical_scale_factor,)
+        sampling_ratio = 0
+
+        N, C, H, W = 2, 4, 10, 8
+        N_rois = 10
+        std = 11
+        mean = 0
+        noise = (torch.rand(N, C, H, W) - 0.5) * 2 * std + mean
+        features = [noise.to(device)]
+
+        rois = []
+        rois_rotated = []
+        for _ in range(N):
+            # Box coordinates span the feature map size times the canonical scale factor.
+            boxes = self._rand_boxes(
+                num_boxes=N_rois, x_max=W * canonical_scale_factor, y_max=H * canonical_scale_factor
+            )
+            rois.append(Boxes(boxes).to(device))
+
+            # (x1, y1, x2, y2) -> (x_center, y_center, width, height, angle = 0)
+            rotated_boxes = torch.zeros(N_rois, 5)
+            rotated_boxes[:, :2] = (boxes[:, :2] + boxes[:, 2:]) / 2.0
+            rotated_boxes[:, 2:4] = boxes[:, 2:] - boxes[:, :2]
+            rois_rotated.append(RotatedBoxes(rotated_boxes).to(device))
+
+        # Identical pooler settings; only the pooler type differs.
+        roialignv2_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type="ROIAlignV2",
+        )
+        roialignv2_out = roialignv2_pooler(features, rois)
+
+        roialignrotated_pooler = ROIPooler(
+            output_size=pooler_resolution,
+            scales=pooler_scales,
+            sampling_ratio=sampling_ratio,
+            pooler_type="ROIAlignRotated",
+        )
+        roialignrotated_out = roialignrotated_pooler(features, rois_rotated)
+
+        assert torch.allclose(roialignv2_out, roialignrotated_out, atol=1e-4)
+
+    def test_roialignv2_roialignrotated_match_cpu(self):  # CPU run of the shared check
+        self._test_roialignv2_roialignrotated_match(device="cpu")
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_roialignv2_roialignrotated_match_cuda(self):  # GPU run of the shared check
+        self._test_roialignv2_roialignrotated_match(device="cuda")
+
+
+if __name__ == "__main__":  # run this file's tests when it is executed as a script
+    unittest.main()
diff --git a/tests/test_rotated_boxes.py b/tests/test_rotated_boxes.py
new file mode 100644
index 0000000..052ad8b
--- /dev/null
+++ b/tests/test_rotated_boxes.py
@@ -0,0 +1,578 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+import logging
+import math
+import random
+import unittest
+import torch
+from fvcore.common.benchmark import benchmark
+
+from detectron2.layers.rotated_boxes import pairwise_iou_rotated
+from detectron2.structures.boxes import Boxes
+from detectron2.structures.rotated_boxes import RotatedBoxes, pairwise_iou
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class TestRotatedBoxesLayer(unittest.TestCase):
+    def test_iou_0_dim_cpu(self):
+        # With no boxes on one side, the IoU matrix must be empty with shape (N, M).
+        # torch.allclose() broadcasts and is vacuously true for empty tensors, so it
+        # cannot catch a wrong output shape on its own; the shape is checked explicitly.
+        for num_boxes1, num_boxes2 in [(0, 10), (10, 0)]:
+            boxes1 = torch.rand(num_boxes1, 5, dtype=torch.float32)
+            boxes2 = torch.rand(num_boxes2, 5, dtype=torch.float32)
+            expected_ious = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32)
+            ious = pairwise_iou_rotated(boxes1, boxes2)
+            err_msg = "expected shape {}, got {}".format(expected_ious.shape, ious.shape)
+            assert ious.shape == expected_ious.shape, err_msg
+            assert torch.allclose(ious, expected_ious)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_0_dim_cuda(self):
+        # With no boxes on one side, the IoU matrix must be empty with shape (N, M).
+        # torch.allclose() broadcasts and is vacuously true for empty tensors, so it
+        # cannot catch a wrong output shape on its own; the shape is checked explicitly.
+        for num_boxes1, num_boxes2 in [(0, 10), (10, 0)]:
+            boxes1 = torch.rand(num_boxes1, 5, dtype=torch.float32)
+            boxes2 = torch.rand(num_boxes2, 5, dtype=torch.float32)
+            expected_ious = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32)
+            ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda())
+            err_msg = "expected shape {}, got {}".format(expected_ious.shape, ious_cuda.shape)
+            assert ious_cuda.shape == expected_ious.shape, err_msg
+            assert torch.allclose(ious_cuda.cpu(), expected_ious)
+
+    def test_iou_half_overlap_cpu(self):
+        # The second box is the low-x half of the unit square, so IoU = 0.5 / 1.0.
+        unit_square = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32)
+        half_box = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(unit_square, half_box)
+        assert torch.allclose(actual, torch.tensor([[0.5]], dtype=torch.float32))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_half_overlap_cuda(self):
+        # GPU variant: the second box is the low-x half of the unit square, so IoU = 0.5.
+        unit_square = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32)
+        half_box = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(unit_square.cuda(), half_box.cuda()).cpu()
+        assert torch.allclose(actual, torch.tensor([[0.5]], dtype=torch.float32))
+
+    def test_iou_0_degree_cpu(self):
+        # Two copies of the unit square are compared against six axis-aligned boxes
+        # given as (x_center, y_center, width, height, angle).
+        unit_square = [0.5, 0.5, 1.0, 1.0, 0.0]
+        queries = torch.tensor([unit_square, unit_square], dtype=torch.float32)
+
+        targets = torch.tensor(
+            [
+                [0.5, 0.5, 1.0, 1.0, 0.0],  # the unit square itself
+                [0.25, 0.5, 0.5, 1.0, 0.0],  # its low-x half
+                [0.5, 0.25, 1.0, 0.5, 0.0],  # its low-y half
+                [0.25, 0.25, 0.5, 0.5, 0.0],  # its low-x, low-y quarter
+                [0.75, 0.75, 0.5, 0.5, 0.0],  # its high-x, high-y quarter
+                [1.0, 1.0, 1.0, 1.0, 0.0],  # unit square shifted by (0.5, 0.5)
+            ],
+            dtype=torch.float32,
+        )
+
+        # The shifted square overlaps in a 0.5 x 0.5 patch: IoU = 0.25 / (1 + 1 - 0.25).
+        iou_row = [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)]
+        expected = torch.tensor([iou_row, iou_row], dtype=torch.float32)
+
+        actual = pairwise_iou_rotated(queries, targets)
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_0_degree_cuda(self):
+        # GPU variant: two copies of the unit square are compared against six
+        # axis-aligned boxes given as (x_center, y_center, width, height, angle).
+        unit_square = [0.5, 0.5, 1.0, 1.0, 0.0]
+        queries = torch.tensor([unit_square, unit_square], dtype=torch.float32)
+
+        targets = torch.tensor(
+            [
+                [0.5, 0.5, 1.0, 1.0, 0.0],  # the unit square itself
+                [0.25, 0.5, 0.5, 1.0, 0.0],  # its low-x half
+                [0.5, 0.25, 1.0, 0.5, 0.0],  # its low-y half
+                [0.25, 0.25, 0.5, 0.5, 0.0],  # its low-x, low-y quarter
+                [0.75, 0.75, 0.5, 0.5, 0.0],  # its high-x, high-y quarter
+                [1.0, 1.0, 1.0, 1.0, 0.0],  # unit square shifted by (0.5, 0.5)
+            ],
+            dtype=torch.float32,
+        )
+
+        # The shifted square overlaps in a 0.5 x 0.5 patch: IoU = 0.25 / (1 + 1 - 0.25).
+        iou_row = [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)]
+        expected = torch.tensor([iou_row, iou_row], dtype=torch.float32)
+
+        actual = pairwise_iou_rotated(queries.cuda(), targets.cuda()).cpu()
+        assert torch.allclose(actual, expected)
+
+    def test_iou_45_degrees_cpu(self):
+        # Squares rotated by +/-45 degrees around (1, 1) versus the axis-aligned 2x2 square:
+        # the sqrt(2) square is inscribed in it (area 2 of 4); the 2*sqrt(2) square
+        # circumscribes it (area 4 of 8). Both IoUs are therefore 0.5.
+        side = math.sqrt(2)
+        rotated = torch.tensor(
+            [[1, 1, side, side, 45], [1, 1, 2 * side, 2 * side, -45]], dtype=torch.float32
+        )
+        upright = torch.tensor([[1, 1, 2, 2, 0]], dtype=torch.float32)
+        expected = torch.tensor([[0.5], [0.5]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(rotated, upright)
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_45_degrees_cuda(self):
+        # Squares rotated by +/-45 degrees around (1, 1) versus the axis-aligned 2x2 square:
+        # the sqrt(2) square is inscribed in it (area 2 of 4); the 2*sqrt(2) square
+        # circumscribes it (area 4 of 8). Both IoUs are therefore 0.5.
+        side = math.sqrt(2)
+        rotated = torch.tensor(
+            [[1, 1, side, side, 45], [1, 1, 2 * side, 2 * side, -45]], dtype=torch.float32
+        )
+        upright = torch.tensor([[1, 1, 2, 2, 0]], dtype=torch.float32)
+        expected = torch.tensor([[0.5], [0.5]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(rotated.cuda(), upright.cuda()).cpu()
+        assert torch.allclose(actual, expected)
+
+    def test_iou_perpendicular_cpu(self):
+        # Two concentric 10x6 boxes 90 degrees apart overlap in a 6x6 square (a plus shape).
+        first = torch.tensor([[5, 5, 10.0, 6, 55]], dtype=torch.float32)
+        second = torch.tensor([[5, 5, 10.0, 6, -35]], dtype=torch.float32)
+        overlap, arm = 6.0 * 6.0, 4.0 * 6.0
+        expected = torch.tensor([[overlap / (overlap + arm + arm)]], dtype=torch.float32)
+        assert torch.allclose(pairwise_iou_rotated(first, second), expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
+    def test_iou_perpendicular_cuda(self):
+        # Two concentric 10x6 boxes 90 degrees apart overlap in a 6x6 square (a plus shape).
+        first = torch.tensor([[5, 5, 10.0, 6, 55]], dtype=torch.float32)
+        second = torch.tensor([[5, 5, 10.0, 6, -35]], dtype=torch.float32)
+        overlap, arm = 6.0 * 6.0, 4.0 * 6.0
+        expected = torch.tensor([[overlap / (overlap + arm + arm)]], dtype=torch.float32)
+        assert torch.allclose(pairwise_iou_rotated(first.cuda(), second.cuda()).cpu(), expected)
+
+    def test_iou_large_close_boxes_cpu(self):
+        # Large rotated boxes that differ only slightly in height: IoU is the height ratio.
+        taller_height, shorter_height = 364.259186, 364.259155
+        taller = torch.tensor(
+            [[299.500000, 417.370422, 600.000000, taller_height, 27.1828]], dtype=torch.float32
+        )
+        shorter = torch.tensor(
+            [[299.500000, 417.370422, 600.000000, shorter_height, 27.1828]], dtype=torch.float32
+        )
+        expected = torch.tensor([[shorter_height / taller_height]], dtype=torch.float32)
+        assert torch.allclose(pairwise_iou_rotated(taller, shorter), expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_large_close_boxes_cuda(self):
+        # Large rotated boxes that differ only slightly in height: IoU is the height ratio.
+        taller_height, shorter_height = 364.259186, 364.259155
+        taller = torch.tensor(
+            [[299.500000, 417.370422, 600.000000, taller_height, 27.1828]], dtype=torch.float32
+        )
+        shorter = torch.tensor(
+            [[299.500000, 417.370422, 600.000000, shorter_height, 27.1828]], dtype=torch.float32
+        )
+        expected = torch.tensor([[shorter_height / taller_height]], dtype=torch.float32)
+        assert torch.allclose(pairwise_iou_rotated(taller.cuda(), shorter.cuda()).cpu(), expected)
+
+    def test_iou_precision_cpu(self):
+        # Small boxes far from the origin: same center and width, heights 10 and 8.3.
+        full = torch.tensor([[565, 565, 10, 10, 0]], dtype=torch.float32)
+        shorter = torch.tensor([[565, 565, 10, 8.3, 0]], dtype=torch.float32)
+        expected = torch.tensor([[8.3 / 10.0]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(full, shorter)
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_precision_cuda(self):
+        # Small boxes far from the origin: same center and width, heights 10 and 8.3.
+        full = torch.tensor([[565, 565, 10, 10, 0]], dtype=torch.float32)
+        shorter = torch.tensor([[565, 565, 10, 8.3, 0]], dtype=torch.float32)
+        expected = torch.tensor([[8.3 / 10.0]], dtype=torch.float32)
+        actual = pairwise_iou_rotated(full.cuda(), shorter.cuda()).cpu()
+        assert torch.allclose(actual, expected)
+
+    def test_iou_many_boxes_cpu(self):
+        # 100 squares along the diagonal versus 200 boxes with the same centers and
+        # width but growing heights; centers are 20 apart, so only box i of each set
+        # overlaps, and its IoU is the height ratio.
+        num_boxes1 = 100
+        num_boxes2 = 200
+
+        def height(i):
+            return 1 + 9 * i / num_boxes2
+
+        squares = torch.tensor(
+            [[5 + 20 * i, 5 + 20 * i, 10, 10, 0] for i in range(num_boxes1)], dtype=torch.float32
+        )
+        thin_boxes = torch.tensor(
+            [[5 + 20 * i, 5 + 20 * i, 10, height(i), 0] for i in range(num_boxes2)],
+            dtype=torch.float32,
+        )
+        expected = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32)
+        for i in range(min(num_boxes1, num_boxes2)):
+            expected[i, i] = height(i) / 10.0
+        actual = pairwise_iou_rotated(squares, thin_boxes)
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_iou_many_boxes_cuda(self):
+        # 100 squares along the diagonal versus 200 boxes with the same centers and
+        # width but growing heights; centers are 20 apart, so only box i of each set
+        # overlaps, and its IoU is the height ratio.
+        num_boxes1 = 100
+        num_boxes2 = 200
+
+        def height(i):
+            return 1 + 9 * i / num_boxes2
+
+        squares = torch.tensor(
+            [[5 + 20 * i, 5 + 20 * i, 10, 10, 0] for i in range(num_boxes1)], dtype=torch.float32
+        )
+        thin_boxes = torch.tensor(
+            [[5 + 20 * i, 5 + 20 * i, 10, height(i), 0] for i in range(num_boxes2)],
+            dtype=torch.float32,
+        )
+        expected = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32)
+        for i in range(min(num_boxes1, num_boxes2)):
+            expected[i, i] = height(i) / 10.0
+        actual = pairwise_iou_rotated(squares.cuda(), thin_boxes.cuda()).cpu()
+        assert torch.allclose(actual, expected)
+
+
+class TestRotatedBoxesStructure(unittest.TestCase):
+    def test_clip_area_0_degree(self):
+        # With a zero angle, RotatedBoxes and Boxes should report (nearly) equal
+        # areas both before and after clipping to the image.
+        num_boxes = 100
+        image_size = (500, 600)
+        for _ in range(50):
+            boxes_5d = torch.zeros(num_boxes, 5)  # the angle column stays 0
+            boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+
+            # (x_ctr, y_ctr, w, h, 0) -> (x1, y1, x2, y2)
+            half_extent = boxes_5d[:, 2:4] / 2.0
+            boxes_4d = torch.zeros(num_boxes, 4)
+            boxes_4d[:, :2] = boxes_5d[:, :2] - half_extent
+            boxes_4d[:, 2:] = boxes_5d[:, :2] + half_extent
+
+            axis_aligned = Boxes(boxes_4d)
+            rotated = RotatedBoxes(boxes_5d)
+            tolerance = {"atol": 1e-1, "rtol": 1e-5}
+            # Areas must agree on the raw boxes ...
+            assert torch.allclose(axis_aligned.area(), rotated.area(), **tolerance)
+
+            # ... and again once both have been clipped in place.
+            axis_aligned.clip(image_size)
+            rotated.clip(image_size)
+            assert torch.allclose(axis_aligned.area(), rotated.area(), **tolerance)
+
+    def test_clip_area_arbitrary_angle(self):
+        # Clipping must never grow a box, and may only shrink boxes within the angle threshold.
+        num_boxes = 100
+        boxes_5d = torch.zeros(num_boxes, 5)
+        boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+        boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+        boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+        boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+        boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+        clip_angle_threshold = random.uniform(0, 180)
+
+        image_size = (500, 600)
+        test_boxes_5d = RotatedBoxes(boxes_5d)
+        # Before clip
+        areas_before = test_boxes_5d.area()
+        # After clip
+        test_boxes_5d.clip(image_size, clip_angle_threshold)
+        areas_diff = test_boxes_5d.area() - areas_before
+
+        # the areas should only decrease after clipping
+        assert torch.all(areas_diff <= 0)
+        # Whenever a box is clipped (its area shrinks), its angle must be within
+        # clip_angle_threshold. clip() is expected to normalize angles to (-180, 180],
+        # so read them from the clipped structure: the raw input (up to +/-1800 degrees)
+        # only holds normalized angles if RotatedBoxes happens to share its storage.
+        clipped_angles = test_boxes_5d.tensor[:, 4][areas_diff < 0]
+        assert torch.all(torch.abs(clipped_angles) < clip_angle_threshold)
+
+    def test_normalize_angles(self):
+        # normalize_angles() must wrap angles into (-180, 180] without changing the
+        # boxes' geometry: same x, y, w, h and the same direction (cos/sin).
+        num_boxes = 100
+        for _ in range(50):
+            boxes_5d = torch.zeros(num_boxes, 5)
+            boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500)
+            boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500)
+            boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+            rotated_boxes = RotatedBoxes(boxes_5d)
+            normalized_boxes = rotated_boxes.clone()
+            normalized_boxes.normalize_angles()
+
+            angles_before = boxes_5d[:, 4]
+            angles_after = normalized_boxes.tensor[:, 4]
+            assert torch.all(angles_after > -180)
+            assert torch.all(angles_after <= 180)
+
+            # x, y, w, h should not change
+            assert torch.allclose(boxes_5d[:, :4], normalized_boxes.tensor[:, :4])
+
+            # the cos/sin values of the angles should stay the same
+            for trig in (torch.cos, torch.sin):
+                assert torch.allclose(
+                    trig(angles_before * math.pi / 180),
+                    trig(angles_after * math.pi / 180),
+                    atol=1e-5,
+                )
+
+    def test_pairwise_iou_0_degree_cpu(self):
+        # Structure-level variant of the 0-degree IoU check: two copies of the unit
+        # square are compared against six axis-aligned boxes given as
+        # (x_center, y_center, width, height, angle).
+        device = torch.device("cpu")
+        unit_square = [0.5, 0.5, 1.0, 1.0, 0.0]
+        queries = torch.tensor([unit_square, unit_square], dtype=torch.float32, device=device)
+
+        targets = torch.tensor(
+            [
+                [0.5, 0.5, 1.0, 1.0, 0.0],  # the unit square itself
+                [0.25, 0.5, 0.5, 1.0, 0.0],  # its low-x half
+                [0.5, 0.25, 1.0, 0.5, 0.0],  # its low-y half
+                [0.25, 0.25, 0.5, 0.5, 0.0],  # its low-x, low-y quarter
+                [0.75, 0.75, 0.5, 0.5, 0.0],  # its high-x, high-y quarter
+                [1.0, 1.0, 1.0, 1.0, 0.0],  # unit square shifted by (0.5, 0.5)
+            ],
+            dtype=torch.float32,
+            device=device,
+        )
+
+        # The shifted square overlaps in a 0.5 x 0.5 patch, so its
+        # IoU is 0.25 / (1 + 1 - 0.25).
+        iou_row = [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)]
+        expected = torch.tensor([iou_row, iou_row], dtype=torch.float32, device=device)
+
+        # pairwise_iou takes RotatedBoxes structures rather than raw tensors.
+        actual = pairwise_iou(RotatedBoxes(queries), RotatedBoxes(targets))
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pairwise_iou_0_degree_cuda(self):
+        # GPU, structure-level variant of the 0-degree IoU check: two copies of the
+        # unit square are compared against six axis-aligned boxes given as
+        # (x_center, y_center, width, height, angle).
+        device = torch.device("cuda")
+        unit_square = [0.5, 0.5, 1.0, 1.0, 0.0]
+        queries = torch.tensor([unit_square, unit_square], dtype=torch.float32, device=device)
+
+        targets = torch.tensor(
+            [
+                [0.5, 0.5, 1.0, 1.0, 0.0],  # the unit square itself
+                [0.25, 0.5, 0.5, 1.0, 0.0],  # its low-x half
+                [0.5, 0.25, 1.0, 0.5, 0.0],  # its low-y half
+                [0.25, 0.25, 0.5, 0.5, 0.0],  # its low-x, low-y quarter
+                [0.75, 0.75, 0.5, 0.5, 0.0],  # its high-x, high-y quarter
+                [1.0, 1.0, 1.0, 1.0, 0.0],  # unit square shifted by (0.5, 0.5)
+            ],
+            dtype=torch.float32,
+            device=device,
+        )
+
+        # The shifted square overlaps in a 0.5 x 0.5 patch, so its
+        # IoU is 0.25 / (1 + 1 - 0.25).
+        iou_row = [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)]
+        expected = torch.tensor([iou_row, iou_row], dtype=torch.float32, device=device)
+
+        # pairwise_iou takes RotatedBoxes structures rather than raw tensors.
+        actual = pairwise_iou(RotatedBoxes(queries), RotatedBoxes(targets))
+        assert torch.allclose(actual, expected)
+
+    def test_pairwise_iou_45_degrees_cpu(self):
+        """Compare boxes with angle entries 45 and -45 against a 0-angle box on CPU."""
+        cpu = torch.device("cpu")
+        side = math.sqrt(2)
+        tilted = torch.tensor(
+            [[1, 1, side, side, 45], [1, 1, 2 * side, 2 * side, -45]],
+            dtype=torch.float32,
+            device=cpu,
+        )
+        upright = torch.tensor([[1, 1, 2, 2, 0]], dtype=torch.float32, device=cpu)
+        # Each tilted box is expected to have an IoU of 0.5 with the upright box.
+        half = torch.tensor([[0.5], [0.5]], dtype=torch.float32, device=cpu)
+        result = pairwise_iou(RotatedBoxes(tilted), RotatedBoxes(upright))
+        assert torch.allclose(result, half)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pairwise_iou_45_degrees_cuda(self):
+        """CUDA variant: boxes with angle entries 45 and -45 versus a 0-angle box."""
+        opts = dict(dtype=torch.float32, device=torch.device("cuda"))
+        root2 = math.sqrt(2)
+        rotated = torch.tensor(
+            [
+                [1, 1, root2, root2, 45],
+                [1, 1, 2 * root2, 2 * root2, -45],
+            ],
+            **opts,
+        )
+        plain = torch.tensor([[1, 1, 2, 2, 0]], **opts)
+        target = torch.tensor([[0.5], [0.5]], **opts)  # one expected IoU per rotated box
+        assert torch.allclose(pairwise_iou(RotatedBoxes(rotated), RotatedBoxes(plain)), target)
+
+    def test_pairwise_iou_orthogonal_cpu(self):
+        """Two equal-size boxes whose angle entries differ by 90 (55 vs -35), on CPU."""
+        kw = {"dtype": torch.float32, "device": torch.device("cpu")}
+        box_a, box_b = (torch.tensor([[5, 5, 10, 6, deg]], **kw) for deg in (55, -35))
+        shared, extra = 6.0 * 6.0, 4.0 * 6.0  # presumably overlap area / per-box leftover area
+        target = torch.tensor([[shared / (shared + extra + extra)]], **kw)
+        measured = pairwise_iou(RotatedBoxes(box_a), RotatedBoxes(box_b))
+        assert torch.allclose(measured, target)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pairwise_iou_orthogonal_cuda(self):
+        """CUDA variant of the 55 vs -35 angle test for two boxes of equal size."""
+        gpu = torch.device("cuda")
+        lhs = RotatedBoxes(torch.tensor([[5, 5, 10, 6, 55]], dtype=torch.float32, device=gpu))
+        rhs = RotatedBoxes(torch.tensor([[5, 5, 10, 6, -35]], dtype=torch.float32, device=gpu))
+        ratio = (6.0 * 6.0) / (6.0 * 6.0 + 4.0 * 6.0 + 4.0 * 6.0)  # 36 / 84
+        wanted = torch.tensor([[ratio]], dtype=torch.float32, device=gpu)
+        assert torch.allclose(pairwise_iou(lhs, rhs), wanted)
+
+    def test_pairwise_iou_large_close_boxes_cpu(self):
+        """Two large boxes that differ only slightly in their fourth entry, on CPU."""
+        cpu = torch.device("cpu")
+        tall, short = 364.259186, 364.259155
+
+        def make_box(fourth):
+            # Every entry except the fourth one is shared by the two boxes.
+            row = [299.500000, 417.370422, 600.000000, fourth, 27.1828]
+            return torch.tensor([row], dtype=torch.float32, device=cpu)
+
+        box_tall = make_box(tall)
+        box_short = make_box(short)
+        # The expected IoU is the ratio of the two differing entries.
+        target = torch.tensor([[short / tall]], dtype=torch.float32, device=cpu)
+        output = pairwise_iou(RotatedBoxes(box_tall), RotatedBoxes(box_short))
+        assert torch.allclose(output, target)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pairwise_iou_large_close_boxes_cuda(self):
+        """CUDA variant: two large boxes whose fourth entries are almost equal."""
+        fmt = dict(dtype=torch.float32, device=torch.device("cuda"))
+        larger = RotatedBoxes(
+            torch.tensor([[299.500000, 417.370422, 600.000000, 364.259186, 27.1828]], **fmt)
+        )
+        smaller = RotatedBoxes(
+            torch.tensor([[299.500000, 417.370422, 600.000000, 364.259155, 27.1828]], **fmt)
+        )
+        # Only the fourth entry differs, so the reference IoU is their quotient.
+        quotient = 364.259155 / 364.259186
+        reference = torch.tensor([[quotient]], **fmt)
+
+        computed = pairwise_iou(larger, smaller)
+        # Default torch.allclose tolerances are used here.
+        assert torch.allclose(computed, reference)
+
+    def test_pairwise_iou_many_boxes_cpu(self):
+        """
+        Check a 100 x 200 IoU matrix on CPU.
+
+        Row k of both sets starts with (5 + 20 * k, 5 + 20 * k, 10, ...); the
+        expected matrix is zero everywhere except on entries [k][k].
+        """
+        cpu = torch.device("cpu")
+        count1, count2 = 100, 200
+
+        def fourth(k):
+            # Fourth entry of row k in the second set.
+            return 1 + 9 * k / count2
+
+        # Plain Python rows first; each one becomes a float32 tensor below.
+        rows1 = [[5 + 20 * k, 5 + 20 * k, 10, 10, 0] for k in range(count1)]
+        rows2 = [[5 + 20 * k, 5 + 20 * k, 10, fourth(k), 0] for k in range(count2)]
+        set1 = torch.stack([torch.tensor(r, dtype=torch.float32, device=cpu) for r in rows1])
+        set2 = torch.stack([torch.tensor(r, dtype=torch.float32, device=cpu) for r in rows2])
+
+        expected = torch.zeros(count1, count2, dtype=torch.float32, device=cpu)
+        for k in range(min(count1, count2)):
+            expected[k][k] = fourth(k) / 10.0
+
+        actual = pairwise_iou(RotatedBoxes(set1), RotatedBoxes(set2))
+        # Default torch.allclose tolerances are used here.
+        assert torch.allclose(actual, expected)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pairwise_iou_many_boxes_cuda(self):
+        """
+        CUDA variant of the 100 x 200 IoU matrix check.
+
+        The expected matrix is zero everywhere except on entries [i][i].
+        """
+        gpu = torch.device("cuda")
+        n_first = 100
+        n_second = 200
+        first_rows = []
+        for idx in range(n_first):
+            entries = [5 + 20 * idx, 5 + 20 * idx, 10, 10, 0]
+            first_rows.append(torch.tensor(entries, dtype=torch.float32, device=gpu))
+        second_rows = []
+        for idx in range(n_second):
+            entries = [5 + 20 * idx, 5 + 20 * idx, 10, 1 + 9 * idx / n_second, 0]
+            second_rows.append(torch.tensor(entries, dtype=torch.float32, device=gpu))
+        first = torch.stack(first_rows)
+        second = torch.stack(second_rows)
+
+        wanted = torch.zeros(n_first, n_second, dtype=torch.float32, device=gpu)
+        # Only the first min(n_first, n_second) diagonal entries are filled in.
+        for idx in range(min(n_first, n_second)):
+            wanted[idx][idx] = (1 + 9 * idx / n_second) / 10.0
+
+        got = pairwise_iou(RotatedBoxes(first), RotatedBoxes(second))
+        assert torch.allclose(got, wanted)
+
+
+def benchmark_rotated_iou():
+    """
+    Time pairwise_iou_rotated on 200 x 500 boxes through the `benchmark` helper.
+
+    A CPU configuration is always registered; a CUDA configuration is added
+    when torch.cuda.is_available() is True.
+    """
+    count1, count2 = 200, 500
+    # Plain Python rows first; they are turned into float32 tensors right below.
+    rows1 = [[5 + 20 * k, 5 + 20 * k, 10, 10, 0] for k in range(count1)]
+    rows2 = [[5 + 20 * k, 5 + 20 * k, 10, 1 + 9 * k / count2, 0] for k in range(count2)]
+    boxes1 = torch.stack([torch.tensor(row, dtype=torch.float32) for row in rows1])
+    boxes2 = torch.stack([torch.tensor(row, dtype=torch.float32) for row in rows2])
+
+    def func(dev, n=1):
+        # Move the inputs to `dev` once, outside of the closure that gets called.
+        moved1, moved2 = boxes1.to(device=dev), boxes2.to(device=dev)
+        uses_cuda = dev.type == "cuda"
+
+        def bench():
+            for _ in range(n):
+                pairwise_iou_rotated(moved1, moved2)
+            if uses_cuda:
+                # Block until queued CUDA work has finished.
+                torch.cuda.synchronize()
+
+        return bench
+
+    # only run it once per timed loop, since it's slow
+    configs = [{"dev": torch.device("cpu"), "n": 1}]
+    if torch.cuda.is_available():
+        configs += [{"dev": torch.device("cuda"), "n": 10}]
+
+    # NOTE(review): presumably each dict is passed to `func` as keyword arguments
+    # by `benchmark` - confirm against its definition.
+    benchmark(func, "rotated_iou", configs, warmup_iters=3)
+
+
+if __name__ == "__main__":
+    benchmark_rotated_iou()  # run first: unittest.main() exits the process when it finishes
+    unittest.main()
diff --git a/tests/test_rpn.py b/tests/test_rpn.py
new file mode 100644
index 0000000..e50f1a9
--- /dev/null
+++ b/tests/test_rpn.py
@@ -0,0 +1,210 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import logging
+import unittest
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.proposal_generator.build import build_proposal_generator
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+from detectron2.utils.events import EventStorage
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+class RPNTest(unittest.TestCase):
+    def test_rpn(self):
+        """Compare the "RPN" generator's losses and proposals with stored reference values."""
+        torch.manual_seed(121)  # seed the RNG: the expected values below are hard-coded
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1)
+        backbone = build_backbone(cfg)
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        image_shape = (15, 15)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+        gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
+        gt_instances = Instances(image_shape)
+        gt_instances.gt_boxes = Boxes(gt_boxes)
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(
+                images, features, [gt_instances[0], gt_instances[1]]
+            )
+
+        expected_losses = {
+            "loss_rpn_cls": torch.tensor(0.0804563984),
+            "loss_rpn_loc": torch.tensor(0.0990132466),
+        }
+        for name in expected_losses.keys():
+            assert torch.allclose(proposal_losses[name], expected_losses[name])
+
+        expected_proposal_boxes = [
+            Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
+            Boxes(
+                torch.tensor(
+                    [
+                        [0, 0, 30, 20],
+                        [0, 0, 16.7862777710, 13.1362524033],
+                        [0, 0, 30, 13.3173446655],
+                        [0, 0, 10.8602609634, 20],
+                        [7.7165775299, 0, 27.3875980377, 20],
+                    ]
+                )
+            ),
+        ]
+
+        expected_objectness_logits = [
+            torch.tensor([0.1225359365, -0.0133192837]),
+            torch.tensor([0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837]),
+        ]
+
+        for i in range(len(image_sizes)):
+            assert len(proposals[i]) == len(expected_proposal_boxes[i])
+            assert proposals[i].image_size == (image_sizes[i][0], image_sizes[i][1])
+            assert torch.allclose(
+                proposals[i].proposal_boxes.tensor, expected_proposal_boxes[i].tensor
+            )
+            assert torch.allclose(proposals[i].objectness_logits, expected_objectness_logits[i])
+
+    def test_rrpn(self):
+        """Compare the "RRPN" generator's losses and proposals with stored reference values."""
+        torch.manual_seed(121)  # seed the RNG: the expected values below are hard-coded
+        cfg = get_cfg()
+        cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
+        cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
+        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
+        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]]
+        cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]]
+        cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
+        cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
+        backbone = build_backbone(cfg)
+        proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
+        num_images = 2
+        images_tensor = torch.rand(num_images, 20, 30)
+        image_sizes = [(10, 10), (20, 30)]
+        images = ImageList(images_tensor, image_sizes)
+        image_shape = (15, 15)
+        num_channels = 1024
+        features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
+        gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32)
+        gt_instances = Instances(image_shape)
+        gt_instances.gt_boxes = RotatedBoxes(gt_boxes)
+        with EventStorage():  # capture events in a new storage to discard them
+            proposals, proposal_losses = proposal_generator(
+                images, features, [gt_instances[0], gt_instances[1]]
+            )
+
+        expected_losses = {
+            "loss_rpn_cls": torch.tensor(0.0432923734),
+            "loss_rpn_loc": torch.tensor(0.1552739739),
+        }
+        for name in expected_losses.keys():
+            assert torch.allclose(proposal_losses[name], expected_losses[name])
+
+        expected_proposal_boxes = [
+            RotatedBoxes(
+                torch.tensor(
+                    [
+                        [0.60189795, 1.24095452, 61.98131943, 18.03621292, -4.07244873],
+                        [15.64940453, 1.69624567, 59.59749603, 16.34339333, 2.62692475],
+                        [-3.02982378, -2.69752932, 67.90952301, 59.62455750, 59.97010040],
+                        [16.71863365, 1.98309708, 35.61507797, 32.81484985, 62.92267227],
+                        [0.49432933, -7.92979717, 67.77606201, 62.93098450, -1.85656738],
+                        [8.00880814, 1.36017394, 121.81007385, 32.74150467, 50.44297409],
+                        [16.44299889, -4.82221127, 63.39775848, 61.22503662, 54.12270737],
+                        [5.00000000, 5.00000000, 10.00000000, 10.00000000, -0.76943970],
+                        [17.64130402, -0.98095351, 61.40377808, 16.28918839, 55.53118134],
+                        [0.13016054, 4.60568953, 35.80157471, 32.30180359, 62.52872086],
+                        [-4.26460743, 0.39604485, 124.30079651, 31.84611320, -1.58203125],
+                        [7.52815342, -0.91636634, 62.39784622, 15.45565224, 60.79549789],
+                    ]
+                )
+            ),
+            RotatedBoxes(
+                torch.tensor(
+                    [
+                        [0.07734215, 0.81635046, 65.33510590, 17.34688377, -1.51821899],
+                        [-3.41833067, -3.11320257, 64.17595673, 60.55617905, 58.27033234],
+                        [20.67383385, -6.16561556, 63.60531998, 62.52315903, 54.85546494],
+                        [15.00000000, 10.00000000, 30.00000000, 20.00000000, -0.18218994],
+                        [9.22646523, -6.84775209, 62.09895706, 65.46472931, -2.74307251],
+                        [15.00000000, 4.93451595, 30.00000000, 9.86903191, -0.60272217],
+                        [8.88342094, 2.65560246, 120.95362854, 32.45022202, 55.75970078],
+                        [16.39088631, 2.33887148, 34.78761292, 35.61492920, 60.81977463],
+                        [9.78298569, 10.00000000, 19.56597137, 20.00000000, -0.86660767],
+                        [1.28576660, 5.49873352, 34.93610382, 33.22600174, 60.51599884],
+                        [17.58912468, -1.63270092, 62.96052551, 16.45713997, 52.91245270],
+                        [5.64749718, -1.90428460, 62.37649155, 16.19474792, 61.09543991],
+                        [0.82255805, 2.34931135, 118.83985901, 32.83671188, 56.50753784],
+                        [-5.33874989, 1.64404404, 125.28501892, 33.35424042, -2.80731201],
+                    ]
+                )
+            ),
+        ]
+
+        expected_objectness_logits = [
+            torch.tensor(
+                [
+                    0.10111768,
+                    0.09112845,
+                    0.08466332,
+                    0.07589971,
+                    0.06650183,
+                    0.06350251,
+                    0.04299347,
+                    0.01864817,
+                    0.00986163,
+                    0.00078543,
+                    -0.04573630,
+                    -0.04799230,
+                ]
+            ),
+            torch.tensor(
+                [
+                    0.11373727,
+                    0.09377633,
+                    0.05281663,
+                    0.05143715,
+                    0.04040275,
+                    0.03250912,
+                    0.01307789,
+                    0.01177734,
+                    0.00038105,
+                    -0.00540255,
+                    -0.01194804,
+                    -0.01461012,
+                    -0.03061717,
+                    -0.03599222,
+                ]
+            ),
+        ]
+
+        torch.set_printoptions(precision=8, sci_mode=False)
+
+        for i in range(len(image_sizes)):
+            assert len(proposals[i]) == len(expected_proposal_boxes[i])
+            assert proposals[i].image_size == (image_sizes[i][0], image_sizes[i][1])
+            # The results are reproducible on one machine (identical over 100 local runs),
+            # but other machines may give slightly different values, hence the atol.
+            err_msg = "computed proposal boxes = {}, expected {}".format(
+                proposals[i].proposal_boxes.tensor, expected_proposal_boxes[i].tensor
+            )
+            assert torch.allclose(
+                proposals[i].proposal_boxes.tensor, expected_proposal_boxes[i].tensor, atol=1e-5
+            ), err_msg
+            err_msg = "computed objectness logits = {}, expected {}".format(
+                proposals[i].objectness_logits, expected_objectness_logits[i]
+            )
+            assert torch.allclose(
+                proposals[i].objectness_logits, expected_objectness_logits[i], atol=1e-5
+            ), err_msg
+
+
+if __name__ == "__main__":
+    unittest.main()  # run the tests in this file when it is executed directly
diff --git a/tests/test_sampler.py b/tests/test_sampler.py
new file mode 100644
index 0000000..7738341
--- /dev/null
+++ b/tests/test_sampler.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+import unittest
+from torch.utils.data.sampler import SequentialSampler
+
+from detectron2.data.samplers import GroupedBatchSampler
+
+
+class TestGroupedBatchSampler(unittest.TestCase):
+    """Batch-level checks for GroupedBatchSampler with a batch size of 2."""
+
+    def test_missing_group_id(self):
+        """Every sample has group id 1 (id 0 is never used); batches must have size 2."""
+        ids = [1] * 100
+        batches = GroupedBatchSampler(SequentialSampler(list(range(100))), ids, 2)
+        for batch in batches:
+            self.assertEqual(len(batch), 2)
+
+    def test_groups(self):
+        """Group ids alternate 1, 0, so both entries of a batch must share their parity."""
+        ids = [1, 0] * 50
+        batches = GroupedBatchSampler(SequentialSampler(list(range(100))), ids, 2)
+        for batch in batches:
+            self.assertTrue((batch[0] + batch[1]) % 2 == 0)
diff --git a/tests/test_visualizer.py b/tests/test_visualizer.py
new file mode 100644
index 0000000..1cdeddc
--- /dev/null
+++ b/tests/test_visualizer.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# File:
+
+import numpy as np
+import unittest
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BoxMode, Instances, RotatedBoxes
+from detectron2.utils.visualizer import Visualizer
+
+
+class TestVisualizer(unittest.TestCase):
+    """Smoke tests for Visualizer; most of them only check that drawing does not raise."""
+
+    def _random_data(self):
+        """Return a random 100x100 image plus 10 boxes, labels, polygon lists and masks."""
+        H, W = 100, 100
+        N = 10
+        img = np.random.rand(H, W, 3) * 255
+        boxxy = np.random.rand(N, 2) * (H // 2)
+        boxes = np.concatenate((boxxy, boxxy + H // 2), axis=1)
+
+        def _rand_poly():
+            return np.random.rand(3, 2).flatten() * H
+
+        polygons = [[_rand_poly() for _ in range(np.random.randint(1, 5))] for _ in range(N)]
+
+        mask = np.zeros_like(img[:, :, 0], dtype=bool)  # np.bool was removed in NumPy 1.24
+        mask[:10, 10:20] = 1
+        labels = [str(i) for i in range(N)]
+        return img, boxes, labels, polygons, [mask] * N
+
+    @property
+    def metadata(self):
+        """Metadata registered under "coco_2017_train", used by most tests."""
+        return MetadataCatalog.get("coco_2017_train")
+
+    def _random_predictions(self, img, boxes, masks):
+        """Wrap `boxes` and `masks` in an Instances object with random classes and scores."""
+        inst = Instances((img.shape[0], img.shape[1]))
+        inst.pred_classes = torch.randint(0, 80, size=(len(boxes),))
+        inst.scores = torch.rand(len(boxes))
+        inst.pred_boxes = torch.from_numpy(boxes)
+        inst.pred_masks = torch.from_numpy(masks)
+        return inst
+
+    def test_draw_dataset_dict(self):
+        """Drawing a dataset dict with one crowd annotation must not raise."""
+        img = np.random.rand(512, 512, 3) * 255
+        dic = {
+            "annotations": [
+                {
+                    "bbox": [
+                        368.9946492271106,
+                        330.891438763377,
+                        13.148537455410235,
+                        13.644708680142685,
+                    ],
+                    "bbox_mode": BoxMode.XYWH_ABS,
+                    "category_id": 0,
+                    "iscrowd": 1,
+                    "segmentation": {
+                        "counts": "_jh52m?2N2N2N2O100O10O001N1O2MceP2",
+                        "size": [512, 512],
+                    },
+                }
+            ],
+            "height": 512,
+            "image_id": 1,
+            "width": 512,
+        }
+        v = Visualizer(img, self.metadata)
+        v.draw_dataset_dict(dic)
+
+    def test_overlay_instances(self):
+        """The overlay keeps the image shape, or doubles its height when scale=2.0."""
+        img, boxes, labels, polygons, masks = self._random_data()
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+        # Test 2x scaling
+        v = Visualizer(img, self.metadata, scale=2.0)
+        output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape[0], img.shape[0] * 2)
+
+        # Test overlay masks
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(masks=masks, boxes=boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+    def test_overlay_instances_no_boxes(self):
+        """Overlaying polygons with boxes=None must not raise."""
+        img, _, labels, polygons, _ = self._random_data()
+        v = Visualizer(img, self.metadata)
+        v.overlay_instances(masks=polygons, boxes=None, labels=labels).get_image()
+
+    def test_draw_instance_predictions(self):
+        """Drawing predictions that carry the random-data masks must not raise."""
+        img, boxes, _, _, masks = self._random_data()
+        inst = self._random_predictions(img, boxes, np.asarray(masks))
+        v = Visualizer(img, self.metadata)
+        v.draw_instance_predictions(inst)
+
+    def test_draw_empty_mask_predictions(self):
+        """Drawing predictions whose masks are all zero must not raise."""
+        img, boxes, _, _, masks = self._random_data()
+        inst = self._random_predictions(img, boxes, np.zeros_like(np.asarray(masks)))
+        v = Visualizer(img, self.metadata)
+        v.draw_instance_predictions(inst)
+
+    def test_correct_output_shape(self):
+        """The untouched output image must have the same shape as the input image."""
+        img = np.random.rand(928, 928, 3) * 255
+        v = Visualizer(img, self.metadata)
+        out = v.output.get_image()
+        self.assertEqual(out.shape, img.shape)
+
+    def test_overlay_rotated_instances(self):
+        """Overlaying 50 random RotatedBoxes must keep the image shape."""
+        H, W = 100, 150
+        img = np.random.rand(H, W, 3) * 255
+        num_boxes = 50
+        boxes_5d = torch.zeros(num_boxes, 5)
+        boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-0.1 * W, 1.1 * W)
+        boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-0.1 * H, 1.1 * H)
+        boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H))
+        boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H))
+        boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800)
+        rotated_boxes = RotatedBoxes(boxes_5d)
+        labels = [str(i) for i in range(num_boxes)]
+        v = Visualizer(img, self.metadata)
+        output = v.overlay_instances(boxes=rotated_boxes, labels=labels).get_image()
+        self.assertEqual(output.shape, img.shape)
+
+    def test_draw_no_metadata(self):
+        """Drawing with metadata fetched under the name "asdfasdf" must not raise."""
+        img, boxes, _, _, masks = self._random_data()
+        inst = self._random_predictions(img, boxes, np.asarray(masks))
+        v = Visualizer(img, MetadataCatalog.get("asdfasdf"))
+        v.draw_instance_predictions(inst)
diff --git a/tools/benchmark.py b/tools/benchmark.py
new file mode 100644
index 0000000..33d7669
--- /dev/null
+++ b/tools/benchmark.py
@@ -0,0 +1,151 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+A script to benchmark builtin models.
+
+Note: this script has an extra dependency on psutil.
+"""
+
+import itertools
+import logging
+import psutil
+import torch
+import tqdm
+from fvcore.common.timer import Timer
+from torch.nn.parallel import DistributedDataParallel
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import (
+    DatasetFromList,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.engine import SimpleTrainer, default_argument_parser, hooks, launch
+from detectron2.modeling import build_model
+from detectron2.solver import build_optimizer
+from detectron2.utils import comm
+from detectron2.utils.events import CommonMetricPrinter
+from detectron2.utils.logger import setup_logger
+
+logger = logging.getLogger("detectron2")  # all benchmark output goes through this logger
+
+
+def setup(args):  # build the frozen config shared by the benchmark tasks below
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.SOLVER.BASE_LR = 0.001  # Avoid NaNs. Not useful in this script anyway.
+    cfg.merge_from_list(args.opts)  # merged last, so args.opts wins over the settings above
+    cfg.freeze()
+    setup_logger(distributed_rank=comm.get_rank())
+    return cfg
+
+
+def benchmark_data(args):
+    """Log the training data loader's startup time, its speed over 1000 batches, and RAM use."""
+    cfg = setup(args)
+    loader = build_detection_train_loader(cfg)
+
+    startup_timer = Timer()
+    batches = iter(loader)
+    next(batches)  # first batch: its latency is what gets reported as startup time
+    startup_time = startup_timer.seconds()
+    for _ in range(9):  # remaining warmup batches (10 in total)
+        next(batches)
+
+    num_iters = 1000
+    run_timer = Timer()
+    for _ in tqdm.trange(num_iters):
+        next(batches)
+    logger.info(
+        "{} iters ({} images) in {} seconds.".format(
+            num_iters, num_iters * cfg.SOLVER.IMS_PER_BATCH, run_timer.seconds()
+        )
+    )
+    logger.info("Startup time: {} seconds".format(startup_time))
+
+    mem = psutil.virtual_memory()
+    gib = 1024 ** 3
+    used_gb, total_gb = (mem.total - mem.available) / gib, mem.total / gib
+    logger.info("RAM Usage: {:.2f}/{:.2f} GB".format(used_gb, total_gb))
+
+
+def benchmark_train(args):
+    """Run 400 SimpleTrainer iterations over up to 100 cached training batches."""
+    cfg = setup(args)
+    model = build_model(cfg)
+    logger.info("Model:\n{}".format(model))
+    distributed = comm.get_world_size() > 1
+    if distributed:
+        model = DistributedDataParallel(
+            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+        )
+    optimizer = build_optimizer(cfg, model)
+    DetectionCheckpointer(model, optimizer=optimizer).load(cfg.MODEL.WEIGHTS)
+
+    # Materialize up to 100 batches; the trainer then replays them in an endless loop.
+    cfg.defrost()
+    cfg.DATALOADER.NUM_WORKERS = 0
+    cached_batches = list(itertools.islice(build_detection_train_loader(cfg), 100))
+
+    def endless_batches():
+        while True:
+            yield from DatasetFromList(cached_batches, copy=False)
+
+    total_iters = 400
+    trainer = SimpleTrainer(model, endless_batches(), optimizer)
+    hook_list = [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(total_iters)])]
+    trainer.register_hooks(hook_list)
+    trainer.train(1, total_iters)
+
+
+@torch.no_grad()
+def benchmark_eval(args):
+    """Time 400 forward passes in eval mode over up to 100 cached test batches."""
+    cfg = setup(args)
+    model = build_model(cfg)
+    model.eval()
+    logger.info("Model:\n{}".format(model))
+    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+
+    cfg.defrost()
+    cfg.DATALOADER.NUM_WORKERS = 0
+    test_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+    cached_batches = list(itertools.islice(test_loader, 100))
+
+    def endless_batches():
+        # Replay the cached batches forever; the caller decides when to stop.
+        while True:
+            yield from DatasetFromList(cached_batches, copy=False)
+
+    for _ in range(5):  # warmup
+        model(cached_batches[0])
+
+    total_iters = 400
+    timer = Timer()
+    with tqdm.tqdm(total=total_iters) as progress:
+        for batch in itertools.islice(endless_batches(), total_iters):
+            model(batch)
+            progress.update()
+    logger.info("{} iters in {} seconds.".format(total_iters, timer.seconds()))
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    parser.add_argument("--task", choices=["train", "eval", "data"], required=True)
+    args = parser.parse_args()
+    # parser.error is used instead of assert, which would be skipped under `python -O`.
+    if args.eval_only:
+        parser.error("eval_only is not supported by this script; pick the task with --task")
+
+    if args.task == "data":
+        f = benchmark_data
+    elif args.task == "train":
+        # Note: training speed may not be representative. The training cost of a R-CNN model
+        # varies with the content of the data and the quality of the model.
+        f = benchmark_train
+    elif args.task == "eval":
+        f = benchmark_eval
+        # only benchmark single-GPU inference.
+        if args.num_gpus != 1 or args.num_machines != 1:
+            parser.error("--task eval requires num_gpus == 1 and num_machines == 1")
+    launch(f, args.num_gpus, args.num_machines, args.machine_rank, args.dist_url, args=(args,))
diff --git a/tools/plain_train_net.py b/tools/plain_train_net.py
new file mode 100644
index 0000000..8c5b453
--- /dev/null
+++ b/tools/plain_train_net.py
@@ -0,0 +1,230 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detectron2 training script with a plain training loop.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is able to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your datasets and other customizations.
+
+Compared to "train_net.py", this script supports fewer features, and also
+includes fewer abstractions.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+from torch.nn.parallel import DistributedDataParallel
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import (
+    MetadataCatalog,
+    build_detection_test_loader,
+    build_detection_train_loader,
+)
+from detectron2.engine import default_argument_parser, default_setup, launch
+from detectron2.evaluation import (
+    CityscapesEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    inference_on_dataset,
+    print_csv_format,
+)
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils.events import (
+    CommonMetricPrinter,
+    EventStorage,
+    JSONWriter,
+    TensorboardXWriter,
+)
+
+logger = logging.getLogger("detectron2")
+
+
+def get_evaluator(cfg, dataset_name, output_folder=None):
+    """
+    Create evaluator(s) for a dataset from its "evaluator_type" metadata. `output_folder`
+    (default: "<cfg.OUTPUT_DIR>/inference") is passed to the evaluators that accept it.
+    Returns a single evaluator, or a DatasetEvaluators wrapping several; raises
+    NotImplementedError if none applies. For your own dataset, build one manually instead.
+    """
+    if output_folder is None:
+        output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+    evaluator_list = []
+    evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+    if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+        evaluator_list.append(
+            SemSegEvaluator(
+                dataset_name,
+                distributed=True,
+                num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                output_dir=output_folder,
+            )
+        )
+    if evaluator_type in ["coco", "coco_panoptic_seg"]:
+        evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
+    if evaluator_type == "coco_panoptic_seg":
+        evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+    if evaluator_type == "cityscapes":
+        assert (  # strict: a rank equal to the local GPU count is not on this machine
+            torch.cuda.device_count() > comm.get_rank()
+        ), "CityscapesEvaluator currently do not work with multiple machines."
+        return CityscapesEvaluator(dataset_name)
+    if evaluator_type == "pascal_voc":
+        return PascalVOCDetectionEvaluator(dataset_name)
+    if evaluator_type == "lvis":
+        return LVISEvaluator(dataset_name, cfg, True, output_folder)
+    if len(evaluator_list) == 0:
+        raise NotImplementedError(
+            "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type)
+        )
+    if len(evaluator_list) == 1:
+        return evaluator_list[0]
+    return DatasetEvaluators(evaluator_list)
+
+
+def do_test(cfg, model):  # evaluate `model` on every dataset listed in cfg.DATASETS.TEST
+    results = OrderedDict()
+    for dataset_name in cfg.DATASETS.TEST:
+        data_loader = build_detection_test_loader(cfg, dataset_name)
+        evaluator = get_evaluator(  # each dataset gets its own output sub-folder
+            cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
+        )
+        results_i = inference_on_dataset(model, data_loader, evaluator)
+        results[dataset_name] = results_i
+        if comm.is_main_process():  # print the results from the main process only
+            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+            print_csv_format(results_i)
+    if len(results) == 1:  # single dataset: return its results directly, not the mapping
+        results = list(results.values())[0]
+    return results
+
+
+def do_train(cfg, model, resume=False):  # plain loop: optimize, log, evaluate, checkpoint
+    model.train()
+    optimizer = build_optimizer(cfg, model)
+    scheduler = build_lr_scheduler(cfg, optimizer)
+
+    checkpointer = DetectionCheckpointer(
+        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
+    )
+    start_iter = (  # one past the stored "iteration"; 0 when the loaded state has none
+        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
+    )
+    max_iter = cfg.SOLVER.MAX_ITER
+
+    periodic_checkpointer = PeriodicCheckpointer(
+        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
+    )
+
+    writers = (  # metric writers exist only on the main process
+        [
+            CommonMetricPrinter(max_iter),
+            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
+            TensorboardXWriter(cfg.OUTPUT_DIR),
+        ]
+        if comm.is_main_process()
+        else []
+    )
+
+    # compared to "train_net.py", we do not support accurate timing and
+    # precise BN here, because they are not trivial to implement
+    data_loader = build_detection_train_loader(cfg)
+    logger.info("Starting training from iteration {}".format(start_iter))
+    with EventStorage(start_iter) as storage:
+        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
+            iteration = iteration + 1  # 1-based number of the iteration being run
+            storage.step()
+
+            loss_dict = model(data)
+            losses = sum(loss for loss in loss_dict.values())  # total loss that is back-propagated
+            assert torch.isfinite(losses).all(), loss_dict
+
+            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
+            losses_reduced = sum(loss for loss in loss_dict_reduced.values())  # for logging only
+            if comm.is_main_process():
+                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)
+
+            optimizer.zero_grad()
+            losses.backward()
+            optimizer.step()
+            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
+            scheduler.step()
+
+            if (  # periodic evaluation, skipped on the final iteration
+                cfg.TEST.EVAL_PERIOD > 0
+                and iteration % cfg.TEST.EVAL_PERIOD == 0
+                and iteration != max_iter
+            ):
+                do_test(cfg, model)
+                # Compared to "train_net.py", the test results are not dumped to EventStorage
+                comm.synchronize()
+
+            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
+                for writer in writers:  # written every 20 iterations and at the last one
+                    writer.write()
+            periodic_checkpointer.step(iteration)
+
+
+def setup(args):
+    """
+    Build a frozen config from args.config_file plus args.opts, then run default_setup.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)  # command-line overrides are applied after the file
+    cfg.freeze()
+    default_setup(
+        cfg, args
+    )  # if you don't like any of the default setup, write your own setup code
+    return cfg
+
+
+def main(args):  # build the model, then either evaluate only or train and evaluate
+    cfg = setup(args)
+
+    model = build_model(cfg)
+    logger.info("Model:\n{}".format(model))
+    if args.eval_only:
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        return do_test(cfg, model)
+
+    distributed = comm.get_world_size() > 1
+    if distributed:  # wrap for multi-process training; each process uses its local device
+        model = DistributedDataParallel(
+            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
+        )
+
+    do_train(cfg, model, resume=args.resume)  # forward --resume to do_train's checkpointer
+    return do_test(cfg, model)
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(  # runs main(args); NOTE(review): presumably one worker per GPU — confirm
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),  # positional arguments handed to main
+    )
diff --git a/tools/train_net.py b/tools/train_net.py
new file mode 100644
index 0000000..1a6c5f1
--- /dev/null
+++ b/tools/train_net.py
@@ -0,0 +1,162 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detection Training Script.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use detectron2 as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your datasets and other customizations.
+"""
+
+import logging
+import os
+from collections import OrderedDict
+import torch
+
+import detectron2.utils.comm as comm
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import get_cfg
+from detectron2.data import MetadataCatalog
+from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
+from detectron2.evaluation import (
+    CityscapesEvaluator,
+    COCOEvaluator,
+    COCOPanopticEvaluator,
+    DatasetEvaluators,
+    LVISEvaluator,
+    PascalVOCDetectionEvaluator,
+    SemSegEvaluator,
+    verify_results,
+)
+from detectron2.modeling import GeneralizedRCNNWithTTA
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer", which contains pre-defined default logic for the
+    standard training workflow. It may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                SemSegEvaluator(
+                    dataset_name,
+                    distributed=True,
+                    num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
+                    ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                    output_dir=output_folder,
+                )
+            )
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder))
+        if evaluator_type == "coco_panoptic_seg":
+            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
+        if evaluator_type == "cityscapes":
+            assert (  # strict: a rank equal to the local GPU count is not on this machine
+                torch.cuda.device_count() > comm.get_rank()
+            ), "CityscapesEvaluator currently do not work with multiple machines."
+            return CityscapesEvaluator(dataset_name)
+        if evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+        if evaluator_type == "lvis":
+            return LVISEvaluator(dataset_name, cfg, True, output_folder)
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        if len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):  # evaluate `model` wrapped in GeneralizedRCNNWithTTA
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [  # one evaluator per test dataset, all writing to "inference_TTA"
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})  # suffix keys with "_TTA"
+        return res
+
+
+def setup(args):
+    """
+    Build a frozen config from args.config_file plus args.opts, then run default_setup.
+    """
+    cfg = get_cfg()
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)  # command-line overrides are applied after the file
+    cfg.freeze()
+    default_setup(cfg, args)
+    return cfg
+
+
+def main(args):  # evaluate only (optionally with TTA), or run the Trainer workflow
+    cfg = setup(args)
+
+    if args.eval_only:
+        model = Trainer.build_model(cfg)
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():  # results are verified on the main process only
+            verify_results(cfg, res)
+        if cfg.TEST.AUG.ENABLED:  # merge the "_TTA"-suffixed results into the plain ones
+            res.update(Trainer.test_with_TTA(cfg, model))
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop or subclassing the trainer.
+    """
+    trainer = Trainer(cfg)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(  # NOTE(review): period 0 presumably means "after training only"
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(  # runs main(args); NOTE(review): presumably one worker per GPU — confirm
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),  # positional arguments handed to main
+    )
diff --git a/tools/visualize_data.py b/tools/visualize_data.py
new file mode 100644
index 0000000..c01114c
--- /dev/null
+++ b/tools/visualize_data.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import numpy as np
+import os
+from itertools import chain
+import cv2
+from PIL import Image
+
+from detectron2.config import get_cfg
+from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader
+from detectron2.data import detection_utils as utils
+from detectron2.data.build import filter_images_with_few_keypoints
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.visualizer import Visualizer
+
+
+def setup(args):  # build a frozen cfg from the optional config file plus CLI overrides
+    cfg = get_cfg()
+    if args.config_file:  # --config-file defaults to "", in which case only defaults are used
+        cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    cfg.freeze()
+    return cfg
+
+
+def parse_args(in_args=None):  # in_args=None makes argparse read sys.argv
+    parser = argparse.ArgumentParser(description="Visualize ground-truth data")
+    parser.add_argument(
+        "--source",
+        choices=["annotation", "dataloader"],
+        required=True,
+        help="visualize the annotations or the data loader (with pre-processing)",
+    )
+    parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
+    parser.add_argument("--output-dir", default="./", help="path to output directory")
+    parser.add_argument("--show", action="store_true", help="show output in a window")
+    parser.add_argument(
+        "opts",  # every remaining positional token is collected as a config override
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser.parse_args(in_args)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+    cfg = setup(args)
+
+    dirname = args.output_dir
+    os.makedirs(dirname, exist_ok=True)
+    metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])  # metadata of the first train set
+
+    def output(vis, fname):  # show the visualization in a window, or save it under dirname
+        if args.show:
+            print(fname)
+            cv2.imshow("window", vis.get_image()[:, :, ::-1])
+            cv2.waitKey()
+        else:
+            filepath = os.path.join(dirname, fname)
+            print("Saving to {} ...".format(filepath))
+            vis.save(filepath)
+
+    scale = 2.0 if args.show else 1.0
+    if args.source == "dataloader":
+        train_data_loader = build_detection_train_loader(cfg)
+        for batch in train_data_loader:
+            for per_image in batch:
+                # Pytorch tensor is in (C, H, W) format; PIL needs an (H, W, C) numpy array
+                img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy()
+                if cfg.INPUT.FORMAT == "BGR":
+                    img = img[:, :, [2, 1, 0]]
+                else:
+                    img = np.asarray(Image.fromarray(img, mode=cfg.INPUT.FORMAT).convert("RGB"))
+
+                visualizer = Visualizer(img, metadata=metadata, scale=scale)
+                target_fields = per_image["instances"].get_fields()
+                labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]]
+                vis = visualizer.overlay_instances(
+                    labels=labels,
+                    boxes=target_fields.get("gt_boxes", None),
+                    masks=target_fields.get("gt_masks", None),
+                    keypoints=target_fields.get("gt_keypoints", None),
+                )
+                output(vis, str(per_image["image_id"]) + ".jpg")
+    else:  # "annotation": draw the raw dataset dicts without any pre-processing
+        dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN]))
+        if cfg.MODEL.KEYPOINT_ON:
+            dicts = filter_images_with_few_keypoints(dicts, 1)
+        for dic in dicts:
+            img = utils.read_image(dic["file_name"], "RGB")
+            visualizer = Visualizer(img, metadata=metadata, scale=scale)
+            vis = visualizer.draw_dataset_dict(dic)
+            output(vis, os.path.basename(dic["file_name"]))
diff --git a/tools/visualize_json_results.py b/tools/visualize_json_results.py
new file mode 100644
index 0000000..5b369a1
--- /dev/null
+++ b/tools/visualize_json_results.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import argparse
+import json
+import numpy as np
+import os
+from collections import defaultdict
+import cv2
+import tqdm
+from fvcore.common.file_io import PathManager
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import Boxes, BoxMode, Instances
+from detectron2.utils.logger import setup_logger
+from detectron2.utils.visualizer import Visualizer
+
+
+def create_instances(predictions, image_size):  # reads module-level args and dataset_id_map
+    ret = Instances(image_size)
+
+    score = np.asarray([x["score"] for x in predictions])
+    chosen = (score > args.conf_threshold).nonzero()[0]  # indices above the threshold
+    score = score[chosen]
+    bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4)  # (0, 4) if empty
+    bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+
+    labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen])
+
+    ret.scores = score
+    ret.pred_boxes = Boxes(bbox)
+    ret.pred_classes = labels
+
+    try:
+        ret.pred_masks = [predictions[i]["segmentation"] for i in chosen]
+    except KeyError:  # predictions without "segmentation" simply get no masks
+        pass
+    return ret
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="A script that visualizes the json predictions from COCO or LVIS dataset."
+    )
+    parser.add_argument("--input", required=True, help="JSON file produced by the model")
+    parser.add_argument("--output", required=True, help="output directory")
+    parser.add_argument("--dataset", help="name of the dataset", default="coco_2017_val")
+    parser.add_argument("--conf-threshold", default=0.5, type=float, help="confidence threshold")
+    args = parser.parse_args()  # module-level: create_instances reads args.conf_threshold
+
+    logger = setup_logger()
+
+    with PathManager.open(args.input, "r") as f:
+        predictions = json.load(f)
+
+    pred_by_image = defaultdict(list)  # image_id -> list of that image's predictions
+    for p in predictions:
+        pred_by_image[p["image_id"]].append(p)
+
+    dicts = list(DatasetCatalog.get(args.dataset))
+    metadata = MetadataCatalog.get(args.dataset)
+    if hasattr(metadata, "thing_dataset_id_to_contiguous_id"):
+
+        def dataset_id_map(ds_id):  # look the contiguous id up in the dataset metadata
+            return metadata.thing_dataset_id_to_contiguous_id[ds_id]
+
+    elif "lvis" in args.dataset:
+        # LVIS results are in the same format as COCO results, but have a different
+        # mapping from dataset category id to contiguous category id in [0, #categories - 1]
+        def dataset_id_map(ds_id):
+            return ds_id - 1
+
+    else:
+        raise ValueError("Unsupported dataset: {}".format(args.dataset))
+
+    os.makedirs(args.output, exist_ok=True)
+
+    for dic in tqdm.tqdm(dicts):
+        img = cv2.imread(dic["file_name"], cv2.IMREAD_COLOR)[:, :, ::-1]  # cv2 BGR -> RGB
+        basename = os.path.basename(dic["file_name"])
+
+        predictions = create_instances(pred_by_image[dic["image_id"]], img.shape[:2])
+        vis = Visualizer(img, metadata)
+        vis_pred = vis.draw_instance_predictions(predictions).get_image()
+
+        vis = Visualizer(img, metadata)  # fresh visualizer for the ground-truth drawing
+        vis_gt = vis.draw_dataset_dict(dic).get_image()
+
+        concat = np.concatenate((vis_pred, vis_gt), axis=1)  # predictions left, GT right
+        cv2.imwrite(os.path.join(args.output, basename), concat[:, :, ::-1])  # back to BGR