You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

defaults.py 18 kB

3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
  3. """
  4. This file contains components with some default boilerplate logic that users may need
  5. in training / testing. They will not work for everyone, but many users may find them useful.
  6. The behavior of functions/classes in this file is subject to change,
  7. since they are meant to represent the "common default behavior" people need in their projects.
  8. """
  9. import argparse
  10. import logging
  11. import os
  12. from collections import OrderedDict
  13. import torch
  14. from fvcore.common.file_io import PathManager
  15. from fvcore.nn.precise_bn import get_bn_modules
  16. from torch.nn.parallel import DistributedDataParallel
  17. import detectron2.data.transforms as T
  18. from detectron2.checkpoint import DetectionCheckpointer
  19. from detectron2.data import (
  20. MetadataCatalog,
  21. build_detection_test_loader,
  22. build_detection_train_loader,
  23. )
  24. from detectron2.evaluation import (
  25. DatasetEvaluator,
  26. inference_on_dataset,
  27. print_csv_format,
  28. verify_results,
  29. )
  30. from detectron2.modeling import build_model
  31. from detectron2.solver import build_lr_scheduler, build_optimizer
  32. from detectron2.utils import comm
  33. from detectron2.utils.collect_env import collect_env_info
  34. from detectron2.utils.env import seed_all_rng
  35. from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
  36. from detectron2.utils.logger import setup_logger
  37. from . import hooks
  38. from .train_loop import SimpleTrainer
  39. __all__ = ["default_argument_parser", "default_setup", "DefaultPredictor", "DefaultTrainer"]
  40. def default_argument_parser():
  41. """
  42. Create a parser with some common arguments used by detectron2 users.
  43. Returns:
  44. argparse.ArgumentParser:
  45. """
  46. parser = argparse.ArgumentParser(description="Detectron2 Training")
  47. parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file")
  48. parser.add_argument(
  49. "--resume",
  50. action="store_true",
  51. help="whether to attempt to resume from the checkpoint directory",
  52. )
  53. parser.add_argument("--eval-only", action="store_true", help="perform evaluation only")
  54. parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*")
  55. parser.add_argument("--num-machines", type=int, default=1)
  56. parser.add_argument(
  57. "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)"
  58. )
  59. # PyTorch still may leave orphan processes in multi-gpu training.
  60. # Therefore we use a deterministic way to obtain port,
  61. # so that users are aware of orphan processes by seeing the port occupied.
  62. port = 2 ** 15 + 2 ** 14 + hash(os.getpid()) % 2 ** 14
  63. parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port))
  64. parser.add_argument(
  65. "opts",
  66. help="Modify config options using the command-line",
  67. default=None,
  68. nargs=argparse.REMAINDER,
  69. )
  70. return parser
  71. def default_setup(cfg, args):
  72. """
  73. Perform some basic common setups at the beginning of a job, including:
  74. 1. Set up the detectron2 logger
  75. 2. Log basic information about environment, cmdline arguments, and config
  76. 3. Backup the config to the output directory
  77. Args:
  78. cfg (CfgNode): the full config to be used
  79. args (argparse.NameSpace): the command line arguments to be logged
  80. """
  81. output_dir = cfg.OUTPUT_DIR
  82. if comm.is_main_process() and output_dir:
  83. PathManager.mkdirs(output_dir)
  84. rank = comm.get_rank()
  85. setup_logger(output_dir, distributed_rank=rank, name="fvcore")
  86. logger = setup_logger(output_dir, distributed_rank=rank)
  87. logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
  88. logger.info("Environment info:\n" + collect_env_info())
  89. logger.info("Command line arguments: " + str(args))
  90. if hasattr(args, "config_file"):
  91. logger.info(
  92. "Contents of args.config_file={}:\n{}".format(
  93. args.config_file, PathManager.open(args.config_file, "r").read()
  94. )
  95. )
  96. logger.info("Running with full config:\n{}".format(cfg))
  97. if comm.is_main_process() and output_dir:
  98. # Note: some of our scripts may expect the existence of
  99. # config.yaml in output directory
  100. path = os.path.join(output_dir, "config.yaml")
  101. with PathManager.open(path, "w") as f:
  102. f.write(cfg.dump())
  103. logger.info("Full config saved to {}".format(os.path.abspath(path)))
  104. # make sure each worker has a different, yet deterministic seed if specified
  105. seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
  106. # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
  107. # typical validation set.
  108. if not (hasattr(args, "eval_only") and args.eval_only):
  109. torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK
  110. class DefaultPredictor:
  111. """
  112. Create a simple end-to-end predictor with the given config.
  113. The predictor takes an BGR image, resizes it to the specified resolution,
  114. runs the model and produces a dict of predictions.
  115. This predictor takes care of model loading and input preprocessing for you.
  116. If you'd like to do anything more fancy, please refer to its source code
  117. as examples to build and use the model manually.
  118. Attributes:
  119. metadata (Metadata): the metadata of the underlying dataset, obtained from
  120. cfg.DATASETS.TEST.
  121. Examples:
  122. .. code-block:: python
  123. pred = DefaultPredictor(cfg)
  124. outputs = pred(inputs)
  125. """
  126. def __init__(self, cfg):
  127. self.cfg = cfg.clone() # cfg can be modified by model
  128. self.model = build_model(self.cfg)
  129. self.model.eval()
  130. self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
  131. checkpointer = DetectionCheckpointer(self.model)
  132. checkpointer.load(cfg.MODEL.WEIGHTS)
  133. self.transform_gen = T.ResizeShortestEdge(
  134. [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
  135. )
  136. self.input_format = cfg.INPUT.FORMAT
  137. assert self.input_format in ["RGB", "BGR"], self.input_format
  138. @torch.no_grad()
  139. def __call__(self, original_image):
  140. """
  141. Args:
  142. original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
  143. Returns:
  144. predictions (dict): the output of the model
  145. """
  146. # Apply pre-processing to image.
  147. if self.input_format == "RGB":
  148. # whether the model expects BGR inputs or RGB
  149. original_image = original_image[:, :, ::-1]
  150. height, width = original_image.shape[:2]
  151. image = self.transform_gen.get_transform(original_image).apply_image(original_image)
  152. image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
  153. inputs = {"image": image, "height": height, "width": width}
  154. predictions = self.model([inputs])[0]
  155. return predictions
  156. class DefaultTrainer(SimpleTrainer):
  157. """
  158. A trainer with default training logic. Compared to `SimpleTrainer`, it
  159. contains the following logic in addition:
  160. 1. Create model, optimizer, scheduler, dataloader from the given config.
  161. 2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists.
  162. 3. Register a few common hooks.
  163. It is created to simplify the **standard model training workflow** and reduce code boilerplate
  164. for users who only need the standard training workflow, with standard features.
  165. It means this class makes *many assumptions* about your training logic that
  166. may easily become invalid in a new research. In fact, any assumptions beyond those made in the
  167. :class:`SimpleTrainer` are too much for research.
  168. The code of this class has been annotated about restrictive assumptions it mades.
  169. When they do not work for you, you're encouraged to:
  170. 1. Overwrite methods of this class, OR:
  171. 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and
  172. nothing else. You can then add your own hooks if needed. OR:
  173. 3. Write your own training loop similar to `tools/plain_train_net.py`.
  174. Also note that the behavior of this class, like other functions/classes in
  175. this file, is not stable, since it is meant to represent the "common default behavior".
  176. It is only guaranteed to work well with the standard models and training workflow in detectron2.
  177. To obtain more stable behavior, write your own training logic with other public APIs.
  178. Attributes:
  179. scheduler:
  180. checkpointer (DetectionCheckpointer):
  181. cfg (CfgNode):
  182. Examples:
  183. .. code-block:: python
  184. trainer = DefaultTrainer(cfg)
  185. trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS
  186. trainer.train()
  187. """
  188. def __init__(self, cfg):
  189. """
  190. Args:
  191. cfg (CfgNode):
  192. """
  193. # Assume these objects must be constructed in this order.
  194. model = self.build_model(cfg)
  195. optimizer = self.build_optimizer(cfg, model)
  196. data_loader = self.build_train_loader(cfg)
  197. # For training, wrap with DDP. But don't need this for inference.
  198. if comm.get_world_size() > 1:
  199. model = DistributedDataParallel(
  200. model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
  201. )
  202. super().__init__(model, data_loader, optimizer)
  203. self.scheduler = self.build_lr_scheduler(cfg, optimizer)
  204. # Assume no other objects need to be checkpointed.
  205. # We can later make it checkpoint the stateful hooks
  206. self.checkpointer = DetectionCheckpointer(
  207. # Assume you want to save checkpoints together with logs/statistics
  208. model,
  209. cfg.OUTPUT_DIR,
  210. optimizer=optimizer,
  211. scheduler=self.scheduler,
  212. )
  213. self.start_iter = 0
  214. self.max_iter = cfg.SOLVER.MAX_ITER
  215. self.cfg = cfg
  216. self.register_hooks(self.build_hooks())
  217. def resume_or_load(self, resume=True):
  218. """
  219. If `resume==True`, and last checkpoint exists, resume from it.
  220. Otherwise, load a model specified by the config.
  221. Args:
  222. resume (bool): whether to do resume or not
  223. """
  224. # The checkpoint stores the training iteration that just finished, thus we start
  225. # at the next iteration (or iter zero if there's no checkpoint).
  226. self.start_iter = (
  227. self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume).get(
  228. "iteration", -1
  229. )
  230. + 1
  231. )
  232. def build_hooks(self):
  233. """
  234. Build a list of default hooks, including timing, evaluation,
  235. checkpointing, lr scheduling, precise BN, writing events.
  236. Returns:
  237. list[HookBase]:
  238. """
  239. cfg = self.cfg.clone()
  240. cfg.defrost()
  241. cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN
  242. ret = [
  243. hooks.IterationTimer(),
  244. hooks.LRScheduler(self.optimizer, self.scheduler),
  245. hooks.PreciseBN(
  246. # Run at the same freq as (but before) evaluation.
  247. cfg.TEST.EVAL_PERIOD,
  248. self.model,
  249. # Build a new data loader to not affect training
  250. self.build_train_loader(cfg),
  251. cfg.TEST.PRECISE_BN.NUM_ITER,
  252. )
  253. if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
  254. else None,
  255. ]
  256. # Do PreciseBN before checkpointer, because it updates the model and need to
  257. # be saved by checkpointer.
  258. # This is not always the best: if checkpointing has a different frequency,
  259. # some checkpoints may have more precise statistics than others.
  260. if comm.is_main_process():
  261. ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))
  262. def test_and_save_results():
  263. self._last_eval_results = self.test(self.cfg, self.model)
  264. return self._last_eval_results
  265. # Do evaluation after checkpointer, because then if it fails,
  266. # we can use the saved checkpoint to debug.
  267. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
  268. if comm.is_main_process():
  269. # run writers in the end, so that evaluation metrics are written
  270. ret.append(hooks.PeriodicWriter(self.build_writers()))
  271. return ret
  272. def build_writers(self):
  273. """
  274. Build a list of writers to be used. By default it contains
  275. writers that write metrics to the screen,
  276. a json file, and a tensorboard event file respectively.
  277. If you'd like a different list of writers, you can overwrite it in
  278. your trainer.
  279. Returns:
  280. list[EventWriter]: a list of :class:`EventWriter` objects.
  281. It is now implemented by:
  282. .. code-block:: python
  283. return [
  284. CommonMetricPrinter(self.max_iter),
  285. JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
  286. TensorboardXWriter(self.cfg.OUTPUT_DIR),
  287. ]
  288. """
  289. # Assume the default print/log frequency.
  290. return [
  291. # It may not always print what you want to see, since it prints "common" metrics only.
  292. CommonMetricPrinter(self.max_iter),
  293. JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")),
  294. TensorboardXWriter(self.cfg.OUTPUT_DIR),
  295. ]
  296. def train(self):
  297. """
  298. Run training.
  299. Returns:
  300. OrderedDict of results, if evaluation is enabled. Otherwise None.
  301. """
  302. super().train(self.start_iter, self.max_iter)
  303. if hasattr(self, "_last_eval_results") and comm.is_main_process():
  304. verify_results(self.cfg, self._last_eval_results)
  305. return self._last_eval_results
  306. @classmethod
  307. def build_model(cls, cfg):
  308. """
  309. Returns:
  310. torch.nn.Module:
  311. It now calls :func:`detectron2.modeling.build_model`.
  312. Overwrite it if you'd like a different model.
  313. """
  314. model = build_model(cfg)
  315. logger = logging.getLogger(__name__)
  316. logger.info("Model:\n{}".format(model))
  317. return model
  318. @classmethod
  319. def build_optimizer(cls, cfg, model):
  320. """
  321. Returns:
  322. torch.optim.Optimizer:
  323. It now calls :func:`detectron2.solver.build_optimizer`.
  324. Overwrite it if you'd like a different optimizer.
  325. """
  326. return build_optimizer(cfg, model)
  327. @classmethod
  328. def build_lr_scheduler(cls, cfg, optimizer):
  329. """
  330. It now calls :func:`detectron2.solver.build_lr_scheduler`.
  331. Overwrite it if you'd like a different scheduler.
  332. """
  333. return build_lr_scheduler(cfg, optimizer)
  334. @classmethod
  335. def build_train_loader(cls, cfg):
  336. """
  337. Returns:
  338. iterable
  339. It now calls :func:`detectron2.data.build_detection_train_loader`.
  340. Overwrite it if you'd like a different data loader.
  341. """
  342. return build_detection_train_loader(cfg)
  343. @classmethod
  344. def build_test_loader(cls, cfg, dataset_name):
  345. """
  346. Returns:
  347. iterable
  348. It now calls :func:`detectron2.data.build_detection_test_loader`.
  349. Overwrite it if you'd like a different data loader.
  350. """
  351. return build_detection_test_loader(cfg, dataset_name)
  352. @classmethod
  353. def build_evaluator(cls, cfg, dataset_name):
  354. """
  355. Returns:
  356. DatasetEvaluator
  357. It is not implemented by default.
  358. """
  359. raise NotImplementedError(
  360. "Please either implement `build_evaluator()` in subclasses, or pass "
  361. "your evaluator as arguments to `DefaultTrainer.test()`."
  362. )
  363. @classmethod
  364. def test(cls, cfg, model, evaluators=None):
  365. """
  366. Args:
  367. cfg (CfgNode):
  368. model (nn.Module):
  369. evaluators (list[DatasetEvaluator] or None): if None, will call
  370. :meth:`build_evaluator`. Otherwise, must have the same length as
  371. `cfg.DATASETS.TEST`.
  372. Returns:
  373. dict: a dict of result metrics
  374. """
  375. logger = logging.getLogger(__name__)
  376. if isinstance(evaluators, DatasetEvaluator):
  377. evaluators = [evaluators]
  378. if evaluators is not None:
  379. assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
  380. len(cfg.DATASETS.TEST), len(evaluators)
  381. )
  382. results = OrderedDict()
  383. for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
  384. data_loader = cls.build_test_loader(cfg, dataset_name)
  385. # When evaluators are passed in as arguments,
  386. # implicitly assume that evaluators can be created before data_loader.
  387. if evaluators is not None:
  388. evaluator = evaluators[idx]
  389. else:
  390. try:
  391. evaluator = cls.build_evaluator(cfg, dataset_name)
  392. except NotImplementedError:
  393. logger.warn(
  394. "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
  395. "or implement its `build_evaluator` method."
  396. )
  397. results[dataset_name] = {}
  398. continue
  399. results_i = inference_on_dataset(model, data_loader, evaluator)
  400. results[dataset_name] = results_i
  401. if comm.is_main_process():
  402. assert isinstance(
  403. results_i, dict
  404. ), "Evaluator must return a dict on the main process. Got {} instead.".format(
  405. results_i
  406. )
  407. logger.info("Evaluation results for {} in csv format:".format(dataset_name))
  408. print_csv_format(results_i)
  409. if len(results) == 1:
  410. results = list(results.values())[0]
  411. return results

No Description