You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

process_data.py 3.0 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Generate train and test dataset"""
  16. import os
  17. import math as m
  18. import random
  19. from multiprocessing import Process
  20. from captcha.image import ImageCaptcha
  21. def _generate_captcha_per_process(path, total, start, end, img_width, img_height, max_digits):
  22. captcha = ImageCaptcha(width=img_width, height=img_height)
  23. filename_head = '{:0>' + str(len(str(total))) + '}-'
  24. for i in range(start, end):
  25. digits = ''
  26. digits_length = random.randint(1, max_digits)
  27. for _ in range(0, digits_length):
  28. integer = random.randint(0, 9)
  29. digits += str(integer)
  30. captcha.write(digits, os.path.join(path, filename_head.format(i) + digits + '.png'))
  31. def generate_captcha(name, img_num, img_width, img_height, max_digits, process_num=16):
  32. """
  33. generate captcha images
  34. Args:
  35. name(str): name of folder, under which captcha images are saved in
  36. img_num(int): number of generated captcha images
  37. img_width(int): width of generated captcha images
  38. img_height(int): height of generated captcha images
  39. max_digits(int): max number of digits in each captcha images. For each captcha images, number of digits is in
  40. range [1,max_digits]
  41. process_num(int): number of process to generate captcha images, default is 16
  42. """
  43. cur_script_path = os.path.dirname(os.path.realpath(__file__))
  44. path = os.path.join(cur_script_path, "data", name)
  45. print("Generating dataset [{}] under {}...".format(name, path))
  46. if os.path.exists(path):
  47. os.system("rm -rf {}".format(path))
  48. os.system("mkdir -p {}".format(path))
  49. img_num_per_thread = m.ceil(img_num / process_num)
  50. processes = []
  51. for i in range(process_num):
  52. start = i * img_num_per_thread
  53. end = start + img_num_per_thread if i != (process_num - 1) else img_num
  54. p = Process(target=_generate_captcha_per_process,
  55. args=(path, img_num, start, end, img_width, img_height, max_digits))
  56. p.start()
  57. processes.append(p)
  58. for p in processes:
  59. p.join()
  60. print("Generating dataset [{}] finished, total number is {}!".format(name, img_num))
  61. if __name__ == '__main__':
  62. generate_captcha("test", img_num=10000, img_width=160, img_height=64, max_digits=4)
  63. generate_captcha("train", img_num=50000, img_width=160, img_height=64, max_digits=4)