#!/usr/bin/python
# coding=utf-8
"""Data-parallel MNIST training example (PyTorch, DataParallel over 2 GPUs).

Expected dataset layout:
    MnistDataset_torch.zip
    ├── test
    └── train

Expected pretrained-model layout:
    Torch_MNIST_Example_Model
    └── mnist_epoch1.pkl
"""

from model import Model
import numpy as np
import torch
import torch.nn as nn
from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import argparse
import os
import subprocess
import sys

# Install the platform SDK at runtime (best effort, mirroring the original
# behaviour).  An argument list with the current interpreter avoids shell
# string interpolation and makes sure pip targets the right Python.
subprocess.run([sys.executable, "-m", "pip", "install", "c2net"], check=False)
# c2net stages datasets and pretrained models into the container.
from c2net.context import prepare

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--epoch_size', type=int, default=10, help='how much epoch to train')
parser.add_argument('--batch_size', type=int, default=256, help='how much batch_size in epoch')

# Number of DataLoader worker processes.
WORKERS = 0

# DataParallel below is hard-wired to device_ids=[0, 1], so fail fast when
# fewer than two GPUs are visible.
if torch.cuda.device_count() < 2:
    raise RuntimeError("需要至少2块GPU,但当前只有 {} 块".format(torch.cuda.device_count()))
else:
    print('当前有 {} 块GPU'.format(torch.cuda.device_count()))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model, then wrap it for data parallelism across GPUs 0 and 1.
model = Model().to(device)

print('开始进行并行训练!')
model = nn.DataParallel(model, device_ids=[0, 1])

optimizer = SGD(model.parameters(), lr=1e-1)
cost = CrossEntropyLoss()


def train(model, train_loader, epoch):
    """Run one training epoch and print the mean loss over its batches."""
    model.train()
    train_loss = 0.0
    for i, (x, y) in enumerate(train_loader):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = cost(y_hat, y)
        loss.backward()
        optimizer.step()
        # .item() detaches the scalar; accumulating the loss *tensor* would
        # keep every batch's autograd graph alive for the whole epoch.
        train_loss += loss.item()
    loss_mean = train_loss / (i + 1)
    print('Train Epoch: {}\t Loss: {:.6f}'.format(epoch, loss_mean))


def test(model, test_loader, test_data):
    """Evaluate the model; print average loss and accuracy on *test_data*."""
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for i, (x, y) in enumerate(test_loader):
            x = x.to(device)
            y = y.to(device)
            y_hat = model(x)
            test_loss += cost(y_hat, y).item()
            pred = y_hat.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()
    test_loss /= (i + 1)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_data), 100. * correct / len(test_data)))


if __name__ == '__main__':
    args, unknown = parser.parse_known_args()

    # Stage the dataset and pretrained model into the container.
    c2net_context = prepare()
    MnistDataset_torch_path = os.path.join(c2net_context.dataset_path, "MnistDataset_torch")
    Torch_MNIST_Example_Model_path = os.path.join(c2net_context.pretrain_model_path, "Torch_MNIST_Example_Model")

    print('cuda is available:{}'.format(torch.cuda.is_available()))
    batch_size = args.batch_size
    epochs = args.epoch_size
    train_dataset = mnist.MNIST(root=os.path.join(MnistDataset_torch_path, "train"),
                                train=True, transform=ToTensor(), download=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=WORKERS)

    # Resume from the checkpoint when present.  The checkpoint was saved from
    # the DataParallel-wrapped model, so it is loaded into the wrapped model
    # as well (state-dict keys carry the 'module.' prefix on both sides).
    ckpt_path = os.path.join(Torch_MNIST_Example_Model_path, "mnist_epoch1.pkl")
    if os.path.exists(ckpt_path):
        checkpoint = torch.load(ckpt_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('加载 epoch {} 权重成功!'.format(start_epoch))
    else:
        start_epoch = 0
        print('无保存模型,将从头开始训练!')

    for epoch in range(start_epoch + 1, epochs + 1):
        train(model, train_loader, epoch)
        # Save a resumable checkpoint after every epoch.
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
        torch.save(state, '{}/mnist_epoch{}.pkl'.format(c2net_context.output_path, epoch))