You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

process_data.py 3.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Generate train and test dataset"""
  16. import os
  17. import shutil
  18. import math as m
  19. import random
  20. from multiprocessing import Process
  21. from captcha.image import ImageCaptcha
  22. def _generate_captcha_per_process(path, total, start, end, img_width, img_height, max_digits):
  23. captcha = ImageCaptcha(width=img_width, height=img_height)
  24. filename_head = '{:0>' + str(len(str(total))) + '}-'
  25. for i in range(start, end):
  26. digits = ''
  27. digits_length = random.randint(1, max_digits)
  28. for _ in range(0, digits_length):
  29. integer = random.randint(0, 9)
  30. digits += str(integer)
  31. captcha.write(digits, os.path.join(path, filename_head.format(i) + digits + '.png'))
  32. def generate_captcha(name, img_num, img_width, img_height, max_digits, process_num=16):
  33. """
  34. generate captcha images
  35. Args:
  36. name(str): name of folder, under which captcha images are saved in
  37. img_num(int): number of generated captcha images
  38. img_width(int): width of generated captcha images
  39. img_height(int): height of generated captcha images
  40. max_digits(int): max number of digits in each captcha images. For each captcha images, number of digits is in
  41. range [1,max_digits]
  42. process_num(int): number of process to generate captcha images, default is 16
  43. """
  44. cur_script_path = os.path.dirname(os.path.realpath(__file__))
  45. path_data = os.path.join(cur_script_path, "data")
  46. if not os.path.exists(path_data):
  47. os.mkdir(path_data)
  48. path = os.path.join(path_data, name)
  49. print("Generating dataset [{}] under {}...".format(name, path))
  50. if os.path.exists(path):
  51. shutil.rmtree(path)
  52. os.mkdir(path)
  53. img_num_per_thread = m.ceil(img_num / process_num)
  54. processes = []
  55. for i in range(process_num):
  56. start = i * img_num_per_thread
  57. end = start + img_num_per_thread if i != (process_num - 1) else img_num
  58. p = Process(target=_generate_captcha_per_process,
  59. args=(path, img_num, start, end, img_width, img_height, max_digits))
  60. p.start()
  61. processes.append(p)
  62. for p in processes:
  63. p.join()
  64. print("Generating dataset [{}] finished, total number is {}!".format(name, img_num))
  65. if __name__ == '__main__':
  66. generate_captcha("test", img_num=10000, img_width=160, img_height=64, max_digits=4)
  67. generate_captcha("train", img_num=50000, img_width=160, img_height=64, max_digits=4)