From 22007f209e2c6b7189c4ef6c23cf433c0f93375f Mon Sep 17 00:00:00 2001
From: liuzx <liuzx@pcl.ac.cn>
Date: Wed, 24 Jan 2024 09:19:36 +0800
Subject: [PATCH 1/5] update READMEs: lowercase project name in code_path; init path vars in train_multi_card.py

---
 gcu_mnist_example/README.md           | 5 ++---
 gpgpu_mnist_example/README.md         | 4 ++--
 gpu_mnist_example/README.md           | 9 ++++-----
 npu_mnist_example/README.md           | 4 ++--
 npu_mnist_example/train_multi_card.py | 7 +++++--
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/gcu_mnist_example/README.md b/gcu_mnist_example/README.md
index 6812796..3c9cb80 100644
--- a/gcu_mnist_example/README.md
+++ b/gcu_mnist_example/README.md
@@ -1,4 +1,3 @@
-
 # 如何在启智平台上进行模型调试和训练—GCU_手写数字识别示例
 
 ## 一 ，数据集及预训练模型准备
@@ -39,9 +38,9 @@ c2net_context = prepare()
 ##### 2，获取代码路径
 
 ```
-code_path = c2net_context.code_path +"/" +"项目名"
+code_path = c2net_context.code_path +"/" +"项目名".lower()
 在本示例中代码路径为：
-code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
+code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
 ```
 
 ##### 3，获取数据集路径
diff --git a/gpgpu_mnist_example/README.md b/gpgpu_mnist_example/README.md
index 271f8e1..5713e2f 100644
--- a/gpgpu_mnist_example/README.md
+++ b/gpgpu_mnist_example/README.md
@@ -38,9 +38,9 @@ c2net_context = prepare()
 ##### 2，获取代码路径
 
 ```
-code_path = c2net_context.code_path +"/" +"项目名"
+code_path = c2net_context.code_path +"/" + "项目名".lower()
 在本示例中代码路径为：
-code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
+code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
 ```
 
 ##### 3，获取数据集路径
diff --git a/gpu_mnist_example/README.md b/gpu_mnist_example/README.md
index cf1982e..579a3db 100644
--- a/gpu_mnist_example/README.md
+++ b/gpu_mnist_example/README.md
@@ -1,4 +1,3 @@
-
 # 如何在启智平台上进行模型调试和训练—GPU_手写数字识别示例
 
 ## 一 ，数据集及预训练模型准备
@@ -12,9 +11,9 @@
 
   > MnistDataset_torch.zip
   >
-  >   ├── test
+  > ├── test
   >
-  >   └── train
+  > └── train
   >
 
 ##### 2，预训练模型说明：
@@ -39,9 +38,9 @@ c2net_context = prepare()
 ##### 2，获取代码路径
 
 ```
-code_path = c2net_context.code_path +"/" +"项目名"
+code_path = c2net_context.code_path +"/" +"项目名".lower()
 在本示例中代码路径为：
-code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
+code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
 ```
 
 ##### 3，获取数据集路径
diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md
index 1ecb27f..439424f 100644
--- a/npu_mnist_example/README.md
+++ b/npu_mnist_example/README.md
@@ -40,9 +40,9 @@ c2net_context = prepare()
 ##### 2，获取代码路径
 
 ```
-code_path = c2net_context.code_path +"/" +"项目名"
+code_path = c2net_context.code_path +"/" + "项目名".lower()
 在本示例中代码路径为：
-code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example"
+code_path = c2net_context.code_path + "/" + "Openl_Cloudbrain_Example".lower()
 ```
 
 ##### 3，获取数据集路径
diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py
index ffa6491..2741550 100644
--- a/npu_mnist_example/train_multi_card.py
+++ b/npu_mnist_example/train_multi_card.py
@@ -50,10 +50,13 @@ parser.add_argument('--epoch_size',
 if __name__ == "__main__":
     ###请在代码中加入args, unknown = parser.parse_known_args()，可忽略掉--ckpt_url参数报错等参数问题
     args, unknown = parser.parse_known_args()
-  
+    MnistDataset_mindspore_path = ''
+    Mindspore_MNIST_Example_Model_path = ''
+    output_path = ''
+
     device_num = int(os.getenv('RANK_SIZE'))
     #使用多卡时        
-        # set device_id and init for multi-card training
+    # set device_id and init for multi-card training
     context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
     context.reset_auto_parallel_context()
     context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)

From 4fc0d713a8cc791a30e68f1ef262c7b2fc9fd48b Mon Sep 17 00:00:00 2001
From: liuzx <liuzx@pcl.ac.cn>
Date: Thu, 25 Jan 2024 09:07:21 +0800
Subject: [PATCH 2/5] train_multi_card: call prepare() on every rank and disable pretrained checkpoint load

---
 npu_mnist_example/train_multi_card.py | 38 ++++++++-------------------
 1 file changed, 11 insertions(+), 27 deletions(-)

diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py
index 2741550..d53a7e8 100644
--- a/npu_mnist_example/train_multi_card.py
+++ b/npu_mnist_example/train_multi_card.py
@@ -19,6 +19,7 @@
 import os
 import argparse
 from config import mnist_cfg as cfg
+from dataset import create_dataset
 from dataset_distributed import create_dataset_parallel
 from lenet import LeNet5
 import mindspore.nn as nn
@@ -29,7 +30,6 @@ from mindspore.train import Model
 from mindspore.context import ParallelMode
 from mindspore.communication.management import init, get_rank
 import time
-#导入openi包
 from c2net.context import prepare, upload_output
 
 
@@ -50,10 +50,7 @@ parser.add_argument('--epoch_size',
 if __name__ == "__main__":
     ###请在代码中加入args, unknown = parser.parse_known_args()，可忽略掉--ckpt_url参数报错等参数问题
     args, unknown = parser.parse_known_args()
-    MnistDataset_mindspore_path = ''
-    Mindspore_MNIST_Example_Model_path = ''
-    output_path = ''
-
+  
     device_num = int(os.getenv('RANK_SIZE'))
     #使用多卡时        
     # set device_id and init for multi-card training
@@ -63,32 +60,19 @@ if __name__ == "__main__":
     init()
     #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
     local_rank=int(os.getenv('RANK_ID'))
-    if local_rank%8==0:
-        #初始化导入数据集和预训练模型到容器内
-        c2net_context = prepare()
-        #获取数据集路径
-        MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
-        #获取预训练模型路径
-        Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
-        output_path = c2net_context.output_path
-        #Set a cache file to determine whether the data has been copied to obs. 
-        #If this file exists during multi-card training, there is no need to copy the dataset multiple times.
-        f = open("/cache/download_input.txt", 'w')    
-        f.close()
-        try:
-            if os.path.exists("/cache/download_input.txt"):
-                print("download_input succeed")
-        except Exception as e:
-            print("download_input failed")
-    while not os.path.exists("/cache/download_input.txt"):
-        time.sleep(1)               
-    ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"),  cfg.batch_size)
-
+    #初始化导入数据集和预训练模型到容器内
+    c2net_context = prepare()
+    #获取数据集路径
+    MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
+    #获取预训练模型路径
+    Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
+    output_path = c2net_context.output_path
+    ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"),  cfg.batch_size)       
     network = LeNet5(cfg.num_classes)
     net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
     net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
     time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
-    load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
+    #load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
     if args.device_target != "Ascend":
         model = Model(network,
                       net_loss,

From cc4be88033ce4f91d59d3dcd49a01fa0a4f9503d Mon Sep 17 00:00:00 2001
From: liuzx <liuzx@pcl.ac.cn>
Date: Wed, 31 Jan 2024 17:12:05 +0800
Subject: [PATCH 3/5] clean up example docstrings, add train_epoch_upload.py, drop unused imports and args

---
 gcu_mnist_example/train.py              |  18 ++--
 gpgpu_mnist_example/inference.py        |   7 +-
 gpu_mnist_example/inference.py          |   7 +-
 gpu_mnist_example/train.py              |   6 +-
 npu_mnist_example/README.md             |   1 +
 npu_mnist_example/read_imagenet.py      |   3 -
 npu_mnist_example/train.py              |   4 +
 npu_mnist_example/train_epoch_upload.py | 115 ++++++++++++++++++++++++
 npu_mnist_example/train_multi_card.py   |   8 +-
 9 files changed, 144 insertions(+), 25 deletions(-)
 create mode 100644 npu_mnist_example/train_epoch_upload.py

diff --git a/gcu_mnist_example/train.py b/gcu_mnist_example/train.py
index aa385bd..dd461bc 100644
--- a/gcu_mnist_example/train.py
+++ b/gcu_mnist_example/train.py
@@ -9,19 +9,11 @@ If there are Chinese comments in the code，please add at the beginning：
 数据集结构是：
  MnistDataset_torch.zip
   ├── test
-  │   ├── MNIST/processed/test.pt
-  │   └── MNIST/processed/training.pt
-  │   ├── MNIST/raw/train-images-idx3-ubyte
-  │   └── MNIST/raw/train-labels-idx1-ubyte
-  │   ├── MNIST/raw/t10k-images-idx3-ubyte
-  │   └── MNIST/raw/t10k-labels-idx1-ubyte
-  ├── train
-  │   ├── MNIST/processed/test.pt
-  │   └── MNIST/processed/training.pt
-  │   ├── MNIST/raw/train-images-idx3-ubyte
-  │   └── MNIST/raw/train-labels-idx1-ubyte
-  │   ├── MNIST/raw/t10k-images-idx3-ubyte
-  │   └── MNIST/raw/t10k-labels-idx1-ubyte       
+  └── train  
+
+预训练模型文件夹结构是：
+Torch_MNIST_Example_Model
+├── mnist_epoch1_0.76.pkl  
 
 '''
 
diff --git a/gpgpu_mnist_example/inference.py b/gpgpu_mnist_example/inference.py
index ee99215..cb2d6c4 100644
--- a/gpgpu_mnist_example/inference.py
+++ b/gpgpu_mnist_example/inference.py
@@ -5,11 +5,16 @@ If there are Chinese comments in the code，please add at the beginning：
 #!/usr/bin/python
 #coding=utf-8   
 
-1，The dataset structure of the single-dataset in this example
+示例选用的数据集是MnistDataset_torch.zip
+数据集结构是：
  MnistDataset_torch.zip
   ├── test
   └── train  
 
+预训练模型文件夹结构是：
+Torch_MNIST_Example_Model
+├── mnist_epoch1_0.76.pkl  
+
 '''
 from model import Model
 import numpy as np
diff --git a/gpu_mnist_example/inference.py b/gpu_mnist_example/inference.py
index 9ddaf72..08457d8 100644
--- a/gpu_mnist_example/inference.py
+++ b/gpu_mnist_example/inference.py
@@ -5,11 +5,16 @@ If there are Chinese comments in the code，please add at the beginning：
 #!/usr/bin/python
 #coding=utf-8   
 
-1，The dataset structure of the single-dataset in this example
+示例选用的数据集是MnistDataset_torch.zip
+数据集结构是：
  MnistDataset_torch.zip
   ├── test
   └── train  
 
+预训练模型文件夹结构是：
+Torch_MNIST_Example_Model
+├── mnist_epoch1_0.76.pkl  
+
 '''
 from model import Model
 import numpy as np
diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py
index 9319ff1..9db7665 100644
--- a/gpu_mnist_example/train.py
+++ b/gpu_mnist_example/train.py
@@ -5,11 +5,15 @@ If there are Chinese comments in the code，please add at the beginning：
 #!/usr/bin/python
 #coding=utf-8   
 
-1，The dataset structure of the single-dataset in this example
+数据集结构是：
  MnistDataset_torch.zip
   ├── test
   └── train  
 
+预训练模型文件夹结构是：
+Torch_MNIST_Example_Model
+├── mnist_epoch1_0.76.pkl
+
 '''
 
 
diff --git a/npu_mnist_example/README.md b/npu_mnist_example/README.md
index 439424f..408d883 100644
--- a/npu_mnist_example/README.md
+++ b/npu_mnist_example/README.md
@@ -85,6 +85,7 @@ upload_output()
 
 - 训练任务单卡示例请参考示例中[train.py](./train.py)的代码注释
 - 训练任务多卡示例请参考示例中[train_multi_card.py](./train_multi_card.py)的代码注释
+- 训练任务在每个epoch结束后就上传文件，可参考[train_epoch_upload.py](./train_epoch_upload.py)的代码注释
 - 推理任务示例请参考示例中[inference.py](./inference.py)的代码注释
 - 启智集群训练任务已经将ImageNet-1k数据集挂载到训练镜像，具体使用方法请参考示例中[read_imagenet.py](./read_imagenet.py)的代码注释
 - 继续训练示例参考示例中[train_continue.py](./train_continue.py)的代码注释
diff --git a/npu_mnist_example/read_imagenet.py b/npu_mnist_example/read_imagenet.py
index eba4f24..923418c 100644
--- a/npu_mnist_example/read_imagenet.py
+++ b/npu_mnist_example/read_imagenet.py
@@ -39,9 +39,6 @@ import mindspore.dataset.vision.c_transforms as transforms
 from c2net.context import upload_output
 
 parser = argparse.ArgumentParser(description='Read big dataset ImageNet Example')
-parser.add_argument('--train_url',
-                    help='output folder to save/load',
-                    default= '/cache/output/')
 
 if __name__ == "__main__":
     args, unknown = parser.parse_known_args()
diff --git a/npu_mnist_example/train.py b/npu_mnist_example/train.py
index 1fd1e4c..74adb65 100644
--- a/npu_mnist_example/train.py
+++ b/npu_mnist_example/train.py
@@ -11,6 +11,10 @@
       ├── train-images-idx3-ubyte
       └── train-labels-idx1-ubyte 
       
+模型文件夹结构是：
+Mindspore_MNIST_Example_Model
+├── checkpoint_lenet-1_1875.ckpt
+      
 使用注意事项：
 1、在代码中加入args, unknown = parser.parse_known_args()，可忽略掉--ckpt_url参数报错等参数问题
 2、用户需要调用c2net的python sdk包
diff --git a/npu_mnist_example/train_epoch_upload.py b/npu_mnist_example/train_epoch_upload.py
new file mode 100644
index 0000000..fcbfc00
--- /dev/null
+++ b/npu_mnist_example/train_epoch_upload.py
@@ -0,0 +1,115 @@
+
+
+"""
+示例选用的数据集是MnistDataset_mindspore.zip
+数据集结构是：
+ MnistDataset_mindspore.zip
+  ├── test
+  │   ├── t10k-images-idx3-ubyte
+  │   └── t10k-labels-idx1-ubyte
+  └── train
+      ├── train-images-idx3-ubyte
+      └── train-labels-idx1-ubyte 
+
+模型文件夹结构是：
+Mindspore_MNIST_Example_Model
+├── checkpoint_lenet-1_1875.ckpt
+      
+使用注意事项：
+1、在代码中加入args, unknown = parser.parse_known_args()，可忽略掉--ckpt_url参数报错等参数问题
+2、用户需要调用c2net的python sdk包
+"""
+
+import os
+import argparse
+from config import mnist_cfg as cfg
+from dataset import create_dataset
+from lenet import LeNet5
+import mindspore.nn as nn
+from mindspore import context
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore import load_checkpoint, load_param_into_net
+from mindspore.train import Model
+from mindspore.train.callback import Callback
+#导入c2net包
+from c2net.context import prepare, upload_output
+
+class EnvToOpenIEpochEnd(Callback):
+    """
+    MindSpore callback that calls c2net's upload_output() at the end of every epoch.
+    """
+    def epoch_end(self,run_context):
+        upload_output() 
+
+
+parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
+# Command-line options for this example: the target device and the number of training epochs.
+parser.add_argument(
+    '--device_target',
+    type=str,
+    default="Ascend",
+    choices=['Ascend', 'CPU'],
+    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
+
+parser.add_argument('--epoch_size',
+                    type=int,
+                    default=5,
+                    help='Training epochs.')
+
+if __name__ == "__main__":
+    # Parse with parse_known_args() so unrecognised arguments (e.g. --ckpt_url) do not raise an error
+    args, unknown = parser.parse_known_args()
+    # Initialise c2net: import the dataset and the pretrained model into the container
+    c2net_context = prepare()
+    # Dataset path
+    MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
+    # Pretrained model path
+    Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
+    # Output path
+    output_path = c2net_context.output_path
+    # Run in graph mode on the device selected by --device_target
+    context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
+    # Build the training dataset from the "train" folder of the dataset
+    ds_train = create_dataset(os.path.join(MnistDataset_mindspore_path, "train"),  cfg.batch_size)
+    network = LeNet5(cfg.num_classes)
+    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
+    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
+    load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
+    if args.device_target != "Ascend":
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy"})
+    else:
+        model = Model(network,
+                      net_loss,
+                      net_opt,
+                      metrics={"accuracy"},
+                      amp_level="O2")
+    # Checkpoint frequency and retention come from the project config (config.mnist_cfg)
+    config_ck = CheckpointConfig(
+        save_checkpoint_steps=cfg.save_checkpoint_steps,
+        keep_checkpoint_max=cfg.keep_checkpoint_max)
+    # Save checkpoints under c2net_context.output_path
+    outputDirectory = output_path + "/"
+    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
+                                directory=outputDirectory,
+                                config=config_ck)
+    print("============== Starting Training ==============")
+    epoch_size = cfg['epoch_size']
+    if args.epoch_size:
+        epoch_size = args.epoch_size
+        print('epoch_size is: ', epoch_size)
+
+    # set callback functions
+    callback = [time_cb, LossMonitor()]
+    local_rank = int(os.getenv('RANK_ID', '0'))  # RANK_ID may be unset (e.g. a CPU run); fall back to rank 0
+    # Optional: upload the training results to the OpenI (Qizhi) platform after every epoch. This uses a lot of memory, so only use it for tasks that really need manual uploads
+    uploadOutput = EnvToOpenIEpochEnd()
+    callback.append(uploadOutput)
+    # for data parallel, only save checkpoint on rank 0
+    if local_rank == 0:
+        callback.append(ckpoint_cb)
+
+    model.train(epoch_size,ds_train,callbacks=callback)
diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py
index d53a7e8..8e1a8bc 100644
--- a/npu_mnist_example/train_multi_card.py
+++ b/npu_mnist_example/train_multi_card.py
@@ -10,7 +10,7 @@
   └── train
       ├── train-images-idx3-ubyte
       └── train-labels-idx1-ubyte 
-      
+
 使用注意事项：
 1、在代码中加入args, unknown = parser.parse_known_args()，可忽略掉--ckpt_url参数报错等参数问题
 2、用户需要调用c2net的python sdk包
@@ -19,13 +19,11 @@
 import os
 import argparse
 from config import mnist_cfg as cfg
-from dataset import create_dataset
 from dataset_distributed import create_dataset_parallel
 from lenet import LeNet5
 import mindspore.nn as nn
 from mindspore import context
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
-from mindspore import load_checkpoint, load_param_into_net
 from mindspore.train import Model
 from mindspore.context import ParallelMode
 from mindspore.communication.management import init, get_rank
@@ -64,15 +62,13 @@ if __name__ == "__main__":
     c2net_context = prepare()
     #获取数据集路径
     MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"
-    #获取预训练模型路径
-    Mindspore_MNIST_Example_Model_path = c2net_context.pretrain_model_path+"/"+"Mindspore_MNIST_Example_Model"
+
     output_path = c2net_context.output_path
     ds_train = create_dataset_parallel(os.path.join(MnistDataset_mindspore_path, "train"),  cfg.batch_size)       
     network = LeNet5(cfg.num_classes)
     net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
     net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
     time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
-    #load_param_into_net(network, load_checkpoint(os.path.join(Mindspore_MNIST_Example_Model_path, "checkpoint_lenet-1_1875.ckpt")))
     if args.device_target != "Ascend":
         model = Model(network,
                       net_loss,

From 5ec299870393eea06fe7e16a4af4f34408eca74a Mon Sep 17 00:00:00 2001
From: liuzx <liuzx@pcl.ac.cn>
Date: Wed, 31 Jan 2024 17:14:31 +0800
Subject: [PATCH 4/5] gpu train.py: run the final epoch (range end was off by one)

---
 gpu_mnist_example/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu_mnist_example/train.py b/gpu_mnist_example/train.py
index 9db7665..e71eaf2 100644
--- a/gpu_mnist_example/train.py
+++ b/gpu_mnist_example/train.py
@@ -109,7 +109,7 @@ if __name__ == '__main__':
         start_epoch = 0
         print('无保存模型，将从头开始训练！')
  
-    for epoch in range(start_epoch+1, epochs):
+    for epoch in range(start_epoch+1, epochs+1):
         train(model, train_loader, epoch)
         test(model, test_loader, test_dataset)
         # 将模型保存到c2net_context.output_path

From 1b41008660e86ffd7545b13087343a60bb75e9bb Mon Sep 17 00:00:00 2001
From: liuzx <liuzx@pcl.ac.cn>
Date: Thu, 1 Feb 2024 10:03:58 +0800
Subject: [PATCH 5/5] update train_multi_card example

---
 npu_mnist_example/train_multi_card.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/npu_mnist_example/train_multi_card.py b/npu_mnist_example/train_multi_card.py
index 8e1a8bc..ffa7ea4 100644
--- a/npu_mnist_example/train_multi_card.py
+++ b/npu_mnist_example/train_multi_card.py
@@ -58,8 +58,19 @@ if __name__ == "__main__":
     init()
     #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
     local_rank=int(os.getenv('RANK_ID'))
-    #初始化导入数据集和预训练模型到容器内
-    c2net_context = prepare()
+    #初始化导入数据集和预训练模型到容器内,并行任务先让0卡拷贝数据,并用一个缓存文件标记0卡已prepare完成
+    if local_rank == 0:
+        c2net_context = prepare()
+        f = open("/cache/prepare_completed.txt", 'w')    
+        f.close()
+        try:
+            if os.path.exists("/cache/prepare_completed.txt"):
+                print("prepare completed!")
+        except Exception as e:
+            print("prepare failed")
+    while not os.path.exists("/cache/prepare_completed.txt"):
+        time.sleep(1)
+    c2net_context = prepare()        
     #获取数据集路径
     MnistDataset_mindspore_path = c2net_context.dataset_path+"/"+"MnistDataset_mindspore"