|
|
@@ -45,20 +45,20 @@ getdir "${data_dir}" |
|
|
# echo "The input files: "${file_list[@]} |
|
|
# echo "The input files: "${file_list[@]} |
|
|
# echo "The output files: "${output_filename[@]} |
|
|
# echo "The output files: "${output_filename[@]} |
|
|
|
|
|
|
|
|
if [ ! -d "../../../third_party/to_mindrecord/zhwiki" ]; then |
|
|
|
|
|
echo "The patch base dir ../../../third_party/to_mindrecord/zhwiki is not exist." |
|
|
|
|
|
|
|
|
if [ ! -d "../../../../third_party/to_mindrecord/zhwiki" ]; then |
|
|
|
|
|
echo "The patch base dir ../../../../third_party/to_mindrecord/zhwiki is not exist." |
|
|
exit 1 |
|
|
exit 1 |
|
|
fi |
|
|
fi |
|
|
|
|
|
|
|
|
if [ ! -f "../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then |
|
|
|
|
|
echo "The patch file ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist." |
|
|
|
|
|
|
|
|
if [ ! -f "../../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch" ]; then |
|
|
|
|
|
echo "The patch file ../../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch is not exist." |
|
|
exit 1 |
|
|
exit 1 |
|
|
fi |
|
|
fi |
|
|
|
|
|
|
|
|
# patch for create_pretraining_data.py |
|
|
# patch for create_pretraining_data.py |
|
|
patch -p0 -d ../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch |
|
|
|
|
|
|
|
|
patch -p0 -d ../../../../third_party/to_mindrecord/zhwiki/ -o create_pretraining_data_patched.py < ../../../../third_party/patch/to_mindrecord/zhwiki/create_pretraining_data.patch |
|
|
if [ $? -ne 0 ]; then |
|
|
if [ $? -ne 0 ]; then |
|
|
echo "Patch ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed" |
|
|
|
|
|
|
|
|
echo "Patch ../../../../third_party/to_mindrecord/zhwiki/create_pretraining_data.py failed" |
|
|
exit 1 |
|
|
exit 1 |
|
|
fi |
|
|
fi |
|
|
|
|
|
|
|
|
@@ -73,11 +73,11 @@ file_list_len=`expr ${#file_list[*]} - 1` |
|
|
for index in $(seq 0 $file_list_len); do |
|
|
for index in $(seq 0 $file_list_len); do |
|
|
echo "Begin preprocess input file: ${file_list[$index]}" |
|
|
echo "Begin preprocess input file: ${file_list[$index]}" |
|
|
echo "Begin output file: ${output_filename[$index]}" |
|
|
echo "Begin output file: ${output_filename[$index]}" |
|
|
python ../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \ |
|
|
|
|
|
|
|
|
python ../../../../third_party/to_mindrecord/zhwiki/create_pretraining_data_patched.py \ |
|
|
--input_file=${file_list[$index]} \ |
|
|
--input_file=${file_list[$index]} \ |
|
|
--output_file=output/${output_filename[$index]} \ |
|
|
--output_file=output/${output_filename[$index]} \ |
|
|
--partition_number=1 \ |
|
|
--partition_number=1 \ |
|
|
--vocab_file=../../../third_party/to_mindrecord/zhwiki/vocab.txt \ |
|
|
|
|
|
|
|
|
--vocab_file=../../../../third_party/to_mindrecord/zhwiki/vocab.txt \ |
|
|
--do_lower_case=True \ |
|
|
--do_lower_case=True \ |
|
|
--max_seq_length=128 \ |
|
|
--max_seq_length=128 \ |
|
|
--max_predictions_per_seq=20 \ |
|
|
--max_predictions_per_seq=20 \ |