You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

eval.py 10 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Eval"""
  16. import os
  17. import time
  18. import argparse
  19. import datetime
  20. import glob
  21. import numpy as np
  22. import mindspore.nn as nn
  23. from mindspore import Tensor, context
  24. from mindspore.communication.management import init, get_rank, get_group_size, release
  25. from mindspore.train.serialization import load_checkpoint, load_param_into_net
  26. from mindspore.ops import operations as P
  27. from mindspore.ops import functional as F
  28. from mindspore.common import dtype as mstype
  29. from src.utils.logging import get_logger
  30. from src.utils.auto_mixed_precision import auto_mixed_precision
  31. from src.image_classification import get_network
  32. from src.dataset import classification_dataset
  33. from src.config import config
  34. class ParameterReduce(nn.Cell):
  35. """ParameterReduce"""
  36. def __init__(self):
  37. super(ParameterReduce, self).__init__()
  38. self.cast = P.Cast()
  39. self.reduce = P.AllReduce()
  40. def construct(self, x):
  41. one = self.cast(F.scalar_to_array(1.0), mstype.float32)
  42. out = x * one
  43. ret = self.reduce(out)
  44. return ret
  45. def parse_args(cloud_args=None):
  46. """parse_args"""
  47. parser = argparse.ArgumentParser('mindspore classification test')
  48. parser.add_argument('--platform', type=str, default='Ascend', choices=('Ascend', 'GPU'), help='run platform')
  49. # dataset related
  50. parser.add_argument('--data_dir', type=str, default='/opt/npu/datasets/classification/val', help='eval data dir')
  51. parser.add_argument('--per_batch_size', default=32, type=int, help='batch size for per npu')
  52. # network related
  53. parser.add_argument('--graph_ckpt', type=int, default=1, help='graph ckpt or feed ckpt')
  54. parser.add_argument('--pretrained', default='', type=str, help='fully path of pretrained model to load. '
  55. 'If it is a direction, it will test all ckpt')
  56. # logging related
  57. parser.add_argument('--log_path', type=str, default='outputs/', help='path to save log')
  58. parser.add_argument('--is_distributed', type=int, default=0, help='if multi device')
  59. # roma obs
  60. parser.add_argument('--train_url', type=str, default="", help='train url')
  61. args, _ = parser.parse_known_args()
  62. args = merge_args(args, cloud_args)
  63. args.image_size = config.image_size
  64. args.num_classes = config.num_classes
  65. args.backbone = config.backbone
  66. args.rank = config.rank
  67. args.group_size = config.group_size
  68. args.image_size = list(map(int, args.image_size.split(',')))
  69. return args
  70. def get_top5_acc(top5_arg, gt_class):
  71. sub_count = 0
  72. for top5, gt in zip(top5_arg, gt_class):
  73. if gt in top5:
  74. sub_count += 1
  75. return sub_count
  76. def merge_args(args, cloud_args):
  77. """merge_args"""
  78. args_dict = vars(args)
  79. if isinstance(cloud_args, dict):
  80. for key in cloud_args.keys():
  81. val = cloud_args[key]
  82. if key in args_dict and val:
  83. arg_type = type(args_dict[key])
  84. if arg_type is not type(None):
  85. val = arg_type(val)
  86. args_dict[key] = val
  87. return args
  88. def test(cloud_args=None):
  89. """test"""
  90. args = parse_args(cloud_args)
  91. context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
  92. device_target=args.platform, save_graphs=False)
  93. if os.getenv('DEVICE_ID', "not_set").isdigit():
  94. context.set_context(device_id=int(os.getenv('DEVICE_ID')))
  95. # init distributed
  96. if args.is_distributed:
  97. if args.platform == "Ascend":
  98. init()
  99. elif args.platform == "GPU":
  100. init("nccl")
  101. args.rank = get_rank()
  102. args.group_size = get_group_size()
  103. parallel_mode = ParallelMode.DATA_PARALLEL
  104. context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
  105. parameter_broadcast=True, mirror_mean=True)
  106. else:
  107. args.rank = 0
  108. args.group_size = 1
  109. args.outputs_dir = os.path.join(args.log_path,
  110. datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
  111. args.logger = get_logger(args.outputs_dir, args.rank)
  112. args.logger.save_args(args)
  113. # network
  114. args.logger.important_info('start create network')
  115. if os.path.isdir(args.pretrained):
  116. models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
  117. print(models)
  118. if args.graph_ckpt:
  119. f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
  120. else:
  121. f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
  122. args.models = sorted(models, key=f)
  123. else:
  124. args.models = [args.pretrained,]
  125. for model in args.models:
  126. de_dataset = classification_dataset(args.data_dir, image_size=args.image_size,
  127. per_batch_size=args.per_batch_size,
  128. max_epoch=1, rank=args.rank, group_size=args.group_size,
  129. mode='eval')
  130. eval_dataloader = de_dataset.create_tuple_iterator()
  131. network = get_network(args.backbone, args.num_classes, platform=args.platform)
  132. if network is None:
  133. raise NotImplementedError('not implement {}'.format(args.backbone))
  134. param_dict = load_checkpoint(model)
  135. param_dict_new = {}
  136. for key, values in param_dict.items():
  137. if key.startswith('moments.'):
  138. continue
  139. elif key.startswith('network.'):
  140. param_dict_new[key[8:]] = values
  141. else:
  142. param_dict_new[key] = values
  143. load_param_into_net(network, param_dict_new)
  144. args.logger.info('load model {} success'.format(model))
  145. img_tot = 0
  146. top1_correct = 0
  147. top5_correct = 0
  148. if args.platform == "Ascend":
  149. network.to_float(mstype.float16)
  150. else:
  151. auto_mixed_precision(network)
  152. network.set_train(False)
  153. t_end = time.time()
  154. it = 0
  155. for data, gt_classes in eval_dataloader:
  156. output = network(Tensor(data, mstype.float32))
  157. output = output.asnumpy()
  158. top1_output = np.argmax(output, (-1))
  159. top5_output = np.argsort(output)[:, -5:]
  160. t1_correct = np.equal(top1_output, gt_classes).sum()
  161. top1_correct += t1_correct
  162. top5_correct += get_top5_acc(top5_output, gt_classes)
  163. img_tot += args.per_batch_size
  164. if args.rank == 0 and it == 0:
  165. t_end = time.time()
  166. it = 1
  167. if args.rank == 0:
  168. time_used = time.time() - t_end
  169. fps = (img_tot - args.per_batch_size) * args.group_size / time_used
  170. args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
  171. results = [[top1_correct], [top5_correct], [img_tot]]
  172. args.logger.info('before results={}'.format(results))
  173. if args.is_distributed:
  174. model_md5 = model.replace('/', '')
  175. tmp_dir = '/cache'
  176. if not os.path.exists(tmp_dir):
  177. os.mkdir(tmp_dir)
  178. top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(args.rank, model_md5)
  179. top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(args.rank, model_md5)
  180. img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(args.rank, model_md5)
  181. np.save(top1_correct_npy, top1_correct)
  182. np.save(top5_correct_npy, top5_correct)
  183. np.save(img_tot_npy, img_tot)
  184. while True:
  185. rank_ok = True
  186. for other_rank in range(args.group_size):
  187. top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(other_rank, model_md5)
  188. top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(other_rank, model_md5)
  189. img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(other_rank, model_md5)
  190. if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) or \
  191. not os.path.exists(img_tot_npy):
  192. rank_ok = False
  193. if rank_ok:
  194. break
  195. top1_correct_all = 0
  196. top5_correct_all = 0
  197. img_tot_all = 0
  198. for other_rank in range(args.group_size):
  199. top1_correct_npy = '/cache/top1_rank_{}_{}.npy'.format(other_rank, model_md5)
  200. top5_correct_npy = '/cache/top5_rank_{}_{}.npy'.format(other_rank, model_md5)
  201. img_tot_npy = '/cache/img_tot_rank_{}_{}.npy'.format(other_rank, model_md5)
  202. top1_correct_all += np.load(top1_correct_npy)
  203. top5_correct_all += np.load(top5_correct_npy)
  204. img_tot_all += np.load(img_tot_npy)
  205. results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
  206. results = np.array(results)
  207. else:
  208. results = np.array(results)
  209. args.logger.info('after results={}'.format(results))
  210. top1_correct = results[0, 0]
  211. top5_correct = results[1, 0]
  212. img_tot = results[2, 0]
  213. acc1 = 100.0 * top1_correct / img_tot
  214. acc5 = 100.0 * top5_correct / img_tot
  215. args.logger.info('after allreduce eval: top1_correct={}, tot={},'
  216. 'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
  217. args.logger.info('after allreduce eval: top5_correct={}, tot={},'
  218. 'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
  219. if args.is_distributed:
  220. release()
  221. if __name__ == "__main__":
  222. test()