|
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071 |
- # Copyright 2020 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """Generate train and test dataset"""
- import os
- import math as m
- import random
- from multiprocessing import Process
- from captcha.image import ImageCaptcha
-
-
def _generate_captcha_per_process(path, total, start, end, img_width, img_height, max_digits):
    """
    Worker routine: render the captcha images with indices in [start, end).

    Each image shows a random digit string of length 1..max_digits and is
    written to ``path`` as ``<zero-padded index>-<digits>.png``. The index is
    left-padded with zeros to the number of decimal digits in ``total``.

    Args:
        path(str): directory the images are written into
        total(int): overall number of images, only used to size the index padding
        start(int): first image index handled by this worker (inclusive)
        end(int): last image index handled by this worker (exclusive)
        img_width(int): width passed to ImageCaptcha
        img_height(int): height passed to ImageCaptcha
        max_digits(int): upper bound (inclusive) for the number of digits per image
    """
    renderer = ImageCaptcha(width=img_width, height=img_height)
    pad_width = len(str(total))
    prefix_template = '{:0>' + str(pad_width) + '}-'
    for index in range(start, end):
        # Draw the length first, then each digit, one random call per digit.
        length = random.randint(1, max_digits)
        label = ''.join(str(random.randint(0, 9)) for _ in range(length))
        file_name = prefix_template.format(index) + label + '.png'
        renderer.write(label, os.path.join(path, file_name))
-
-
def generate_captcha(name, img_num, img_width, img_height, max_digits, process_num=16):
    """
    generate captcha images

    The images are written to ``<directory of this script>/data/<name>``. Any
    existing content at that location is removed first, then the work is
    split into contiguous index ranges handled by separate processes.

    Args:
        name(str): name of the folder under which the captcha images are saved
        img_num(int): number of generated captcha images
        img_width(int): width of generated captcha images
        img_height(int): height of generated captcha images
        max_digits(int): max number of digits in each captcha image. For each captcha image, the number of
            digits is in range [1,max_digits]
        process_num(int): number of processes used to generate captcha images, default is 16

    Raises:
        ValueError: if process_num is smaller than 1
    """
    # Local import: only this function needs shutil.
    import shutil

    if process_num < 1:
        raise ValueError("process_num must be a positive integer, got {}".format(process_num))

    cur_script_path = os.path.dirname(os.path.realpath(__file__))
    path = os.path.join(cur_script_path, "data", name)
    print("Generating dataset [{}] under {}...".format(name, path))

    # Reset the output location without going through a shell, so paths with
    # spaces or shell metacharacters are handled and failures raise.
    if os.path.islink(path) or os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

    img_num_per_process = m.ceil(img_num / process_num)

    processes = []
    for i in range(process_num):
        start = i * img_num_per_process
        if start >= img_num:
            # All images are already assigned; do not spawn idle workers.
            break
        # Clamp every chunk (not only the last one) so that exactly img_num
        # images are produced even when img_num is not a multiple of process_num.
        end = min(start + img_num_per_process, img_num)
        p = Process(target=_generate_captcha_per_process,
                    args=(path, img_num, start, end, img_width, img_height, max_digits))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print("Generating dataset [{}] finished, total number is {}!".format(name, img_num))
-
-
if __name__ == '__main__':
    # Build the test split first, then the train split, with identical image geometry.
    for split_name, split_size in (("test", 10000), ("train", 50000)):
        generate_captcha(split_name, img_num=split_size, img_width=160, img_height=64, max_digits=4)
|