|
- """
- Preprocess script.
- """
-
- import glob
- import os
-
- from modelscope.preprocessors.space.args import parse_args
- from modelscope.preprocessors.space.fields.intent_field import \
- IntentBPETextField
-
- FILE_NAME = 'train.json'  # marker file: intent_preprocess globs for it and treats each containing directory as a dataset dir
-
-
def intent_preprocess(path, cfg):
    """Build and cache tokenized examples (and score matrices) per dataset.

    Every directory below ``cfg.Dataset.data_dir`` that contains
    ``FILE_NAME`` is collected, passed through
    ``IntentBPETextField.filter_data_path`` and then processed once for each
    of the ``train``/``valid``/``test`` modes:

    1. If ``<mode>.json`` exists and ``<mode>.<tokenizer_type>.jsonl`` does
       not, examples are built from the JSON file and saved to the ``.jsonl``
       file. Nothing is saved when the builder returns an empty result.
    2. If the ``.jsonl`` file exists, ``<mode>.Score.npy`` does not,
       ``cfg.Dataset.dynamic_score`` is falsy and the directory path contains
       ``'AnPreDial'``, the examples are loaded back and a score matrix is
       built and saved to the ``.npy`` file.

    Args:
        path: Forwarded unchanged to ``IntentBPETextField``.
        cfg: Configuration object, also forwarded to ``IntentBPETextField``.
            This function reads ``cfg.Dataset.trigger_role``, ``data_dir``,
            ``dynamic_score`` and ``num_process``.

    Returns:
        None. The results are written through ``save_examples``.
    """
    # NOTE(review): nesting was reconstructed from a source whose indentation
    # was stripped; the ``continue`` is assumed to belong to the
    # "no examples were built" branch -- confirm against the upstream file.
    field = IntentBPETextField(path, cfg)
    dataset_cfg = cfg.Dataset

    # Multi-turn examples are only used when the system role is the trigger.
    if dataset_cfg.trigger_role == 'system':
        make_examples = field.build_examples_multi_turn
    else:
        make_examples = field.build_examples_single_turn
    serial_scorer = field.build_score_matrix
    parallel_scorer = field.build_score_matrix_multiprocessing

    # Locate every dataset directory by its marker file, in sorted order.
    marker_files = glob.glob(
        dataset_cfg.data_dir + '/**/' + FILE_NAME, recursive=True)
    marker_files.sort()
    dataset_dirs = [os.path.dirname(marker) for marker in marker_files]
    dataset_dirs = field.filter_data_path(data_paths=dataset_dirs)

    for mode in ('train', 'valid', 'test'):
        for dataset_dir in dataset_dirs:
            raw_file = os.path.join(dataset_dir, f'{mode}.json')
            tokenized_file = os.path.join(
                dataset_dir, f'{mode}.{field.tokenizer_type}.jsonl')
            score_file = os.path.join(dataset_dir, f'{mode}.Score.npy')

            # Step 1: tokenize the raw file unless a cached copy exists.
            if os.path.exists(raw_file) and not os.path.exists(tokenized_file):
                examples = make_examples(raw_file, data_type=mode)
                if not examples:
                    continue
                field.save_examples(examples, tokenized_file)

            # Step 2: build the score matrix; the guards are checked in the
            # same order as the original short-circuit condition.
            if not os.path.exists(tokenized_file):
                continue
            if os.path.exists(score_file):
                continue
            if dataset_cfg.dynamic_score or 'AnPreDial' not in dataset_dir:
                continue

            examples = field.load_examples(tokenized_file)
            if dataset_cfg.num_process >= 2:
                scorer = parallel_scorer
            else:
                scorer = serial_scorer
            field.save_examples(scorer(examples), score_file)
|