You can run "watch -n 0.1 nvidia-smi" to monitor GPU status. I used three 12 GB GPUs for these experiments.

The experiments use a simple, hastily written network trained on a classification task. The task itself does not matter and we don't care about accuracy; the point is to show, with a minimal network, how to train on GPUs. The code runs as-is, so feel free to try it yourself.

Part 0 is the CPU training code, Part 1 uses a single GPU, and Part 2 covers single-node multi-GPU training.

Contents

0. CPU code

1. Single node, single GPU

2. Single node, multiple GPUs

2.1 DataParallel (DP) (not recommended)

2.2 DistributedDataParallel (DDP)


0. CPU code

# Sample workflow: prepare data, load data, build the model, set the loss function,
# set the optimizer, train, then validate and aggregate/show the results
import torch
import torchvision
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn

# Build the network
class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.model1 = Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.Conv2d(64, 512, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(512, 1024, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(1024, 2048, 5, 1, 2),
            nn.Conv2d(2048, 4096, 5, 1, 2),
            nn.MaxPool2d(2))
        self.model2 = Sequential(
            nn.Flatten(),
            nn.Linear(4096*4*4, 8000),
            nn.Linear(8000, 64),
            nn.Linear(64, 10))

    def forward(self, x):
        x = self.model1(x)
        x = self.model2(x)
        return x

if __name__ == "__main__":
    """Prepare the dataset"""
    # Training data
    train_data = torchvision.datasets.CIFAR10(root="./DataSet", train=True,
                                              transform=torchvision.transforms.ToTensor(),
                                              download=True)
    train_data_size = len(train_data)
    print("Size of the training set: {}".format(train_data_size))  # 50000
    # Load the dataset with DataLoader
    train_dataloader = DataLoader(train_data, batch_size=64)

    # Create the model
    myModule = MyModule()
    # Loss function
    loss_fn = nn.CrossEntropyLoss()
    # Optimizer
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    # Training step counter
    train_step = 0
    # Test step counter
    test_step = 0
    # Number of epochs
    epoch = 10

    # Start training
    for i in range(epoch):
        print("------ Epoch {} ------".format(i + 1))
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            outputs = myModule(imgs)
            # Compute the loss
            loss = loss_fn(outputs, targets)
            # Let the optimizer update the parameters
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step = train_step + 1  # incremented once per batch
            if train_step % 10 == 0:
                print("Step: {}, Loss: {}".format(train_step, loss.item()))  # .item() converts the tensor to a number
        # Save the model after every epoch
        torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
        print("Model saved")
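The header comment mentions a final validation step, but the loop above only trains. Here is a minimal sketch of an evaluation pass on the CIFAR10 test split that could sit at the end of the main block; the variable names (test_data, total_loss, total_correct) are my own and not from the original post.

# Evaluation sketch, assumed to run after the training loop above
test_data = torchvision.datasets.CIFAR10(root="./DataSet", train=False,
                                         transform=torchvision.transforms.ToTensor(),
                                         download=True)
test_dataloader = DataLoader(test_data, batch_size=64)

myModule.eval()
total_loss, total_correct = 0.0, 0
with torch.no_grad():
    for imgs, targets in test_dataloader:
        outputs = myModule(imgs)
        total_loss += loss_fn(outputs, targets).item()
        total_correct += (outputs.argmax(dim=1) == targets).sum().item()
print("Test loss: {}, accuracy: {}".format(total_loss, total_correct / len(test_data)))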

1. Single node, single GPU

Functions used:

# 1. Check whether a GPU is available
torch.cuda.is_available()
# 2. Use GPU 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# 3. Copy the model/data to the GPU
model.cuda()         # modules move in place, no reassignment needed
data = data.cuda()   # tensors must be reassigned
# 4. Saving and loading the model
torch.save(...)
torch.load(file_name, map_location=torch.device("cuda"))  # or torch.device("cpu")
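As a quick illustration of point 4, here is a minimal sketch of loading a checkpoint saved by the training code below. It assumes the MyModule class from this post and the file name "myModule_0.pth" produced by the first epoch's save.

import torch

# map_location decides where the tensors land when deserialized
state_dict = torch.load("myModule_0.pth", map_location=torch.device("cpu"))

model = MyModule()             # same architecture as used for training
model.load_state_dict(state_dict)
model.cuda()                   # move to the GPU afterwards if needed
model.eval()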

The lines changed relative to the previous (CPU) version are marked with comments.

# Sample workflow: prepare data, load data, build the model, set the loss function,
# set the optimizer, train
import torch
import torchvision
import os
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn

class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.model1 = Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.Conv2d(64, 512, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(512, 1024, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(1024, 2048, 5, 1, 2),
            nn.Conv2d(2048, 4096, 5, 1, 2),
            nn.MaxPool2d(2))
        self.model2 = Sequential(
            nn.Flatten(),
            nn.Linear(4096*4*4, 5000),
            nn.Linear(5000, 64),
            nn.Linear(64, 10))

    def forward(self, x):
        x = self.model1(x)
        x = self.model2(x)
        return x

if __name__ == "__main__":
    # Check whether a GPU is available
    if torch.cuda.is_available():
        print("Use one GPU")
        # GPU to use (safer to set this before torch initializes CUDA, as in the DDP example below)
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    train_data = torchvision.datasets.CIFAR10(root="./DataSet", train=True,
                                              transform=torchvision.transforms.ToTensor(),
                                              download=True)
    train_data_size = len(train_data)
    print("Size of the training set: {}".format(train_data_size))  # 50000
    train_dataloader = DataLoader(train_data, batch_size=64)

    myModule = MyModule()
    # Copy the model to the GPU
    myModule.cuda()
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10

    for i in range(epoch):
        print("------ Epoch {} ------".format(i + 1))
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Copy the data to the GPU
            imgs = imgs.cuda()
            targets = targets.cuda()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step = train_step + 1
            if train_step % 1 == 0:
                print("Step: {}, Loss: {}".format(train_step, loss.item()))
        torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
        print("Model saved")

2. Single node, multiple GPUs

There are two options here. The first is torch.nn.DataParallel, which needs very few code changes but is painfully slow. The second is nn.parallel.DistributedDataParallel, which requires larger changes but runs one process per GPU and is far more efficient.

A small pitfall: the CUDA device indices used below follow the order given in os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2", not the physical GPU numbering.
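A quick sketch of that remapping; the indices "2,0" are just an example, and the output depends on your machine.

import os
# Must be set before CUDA is initialized, ideally before importing torch
os.environ["CUDA_VISIBLE_DEVICES"] = "2,0"

import torch
# cuda:0 now refers to physical GPU 2, and cuda:1 to physical GPU 0
print(torch.cuda.device_count())       # 2
print(torch.cuda.get_device_name(0))   # name of physical GPU 2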

2.1 DataParallel (DP) (not recommended)

Short verdict: the change really is trivial, and it really is slow.

# Put the model on multiple GPUs -- only one line needs to change
model = nn.DataParallel(model.cuda(), device_ids=[0, 1, 2])

Code:

import torch
import torchvision
import os
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch import nn

class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.model1 = Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.Conv2d(64, 512, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(512, 1024, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(1024, 2048, 5, 1, 2),
            nn.Conv2d(2048, 4096, 5, 1, 2),
            nn.MaxPool2d(2))
        self.model2 = Sequential(
            nn.Flatten(),
            nn.Linear(4096*4*4, 5000),
            nn.Linear(5000, 64),
            nn.Linear(64, 10))

    def forward(self, x):
        x = self.model1(x)
        x = self.model2(x)
        return x

if __name__ == "__main__":
    # Check whether GPUs are available
    if torch.cuda.is_available():
        print(torch.cuda.device_count())
        print("Use GPU")
        # GPUs to use
        os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    train_data = torchvision.datasets.CIFAR10(root="./DataSet", train=True,
                                              transform=torchvision.transforms.ToTensor(),
                                              download=True)
    train_data_size = len(train_data)
    print("Size of the training set: {}".format(train_data_size))  # 50000
    train_dataloader = DataLoader(train_data, batch_size=64)

    myModule = MyModule()
    # Copy the model to the GPUs and wrap it with DataParallel
    myModule = nn.DataParallel(myModule.cuda(), device_ids=[0, 1, 2])
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10

    for i in range(epoch):
        print("------ Epoch {} ------".format(i + 1))
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Copy the data to the GPU
            imgs = imgs.cuda()
            targets = targets.cuda()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            train_step = train_step + 1
            if train_step % 1 == 0:
                print("Step: {}, Loss: {}".format(train_step, loss.item()))
        # Note: the keys of this state_dict carry a "module." prefix because of the wrapper
        torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
        print("Model saved")
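Because the model is wrapped in DataParallel, the saved state_dict keys are prefixed with "module.". A small sketch of saving the underlying model instead, so it can later be loaded into a plain MyModule; the file name "myModule_dp.pth" is just an example, not from the original post.

# Save the underlying model rather than the DataParallel wrapper
torch.save(myModule.module.state_dict(), "myModule_dp.pth")

# Later, load it into an unwrapped MyModule
model = MyModule()
model.load_state_dict(torch.load("myModule_dp.pth", map_location="cpu"))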

2.2 DistributedDataParallel (DDP)

The biggest difference with this approach is that each process is automatically assigned a GPU via args.local_rank.

Note that the model file only needs to be saved when args.local_rank == 0.

Note that the launch command is special:

python -m torch.distributed.launch --nproc_per_node=n_gpus test.py

# Launch command: n_gpus is the number of GPUs; torch.distributed.launch assigns local ranks from 0 to n_gpus-1
python -m torch.distributed.launch --nproc_per_node=n_gpus test.py
# In the script, catch '--local_rank' with an argparse parser

# Initialization: the arguments are the GPU communication backend, the number of processes, and this process's rank
torch.distributed.init_process_group("nccl", world_size=n_gpus, rank=args.local_rank)
torch.cuda.set_device(args.local_rank)

# Wrap the model
model = DistributedDataParallel(model.cuda(args.local_rank), device_ids=[args.local_rank])

# Dataset handling: split the dataset across processes
train_sampler = DistributedSampler(train_dataset)
# For extra shuffling randomness, call train_sampler.set_epoch(epoch) before each epoch
# Pass the sampler to the DataLoader; note that sampler and shuffle are mutually exclusive
train_dataloader = DataLoader(..., sampler=train_sampler)

# Copy the data to this process's GPU
data = data.cuda(args.local_rank)

Code:

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # set before importing torch so the mapping takes effect
import torch
import torchvision
from torch.nn import Sequential
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler  # new import
from torch import nn
import argparse
import time

class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.model1 = Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.Conv2d(64, 512, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(512, 1024, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(1024, 2048, 5, 1, 2),
            nn.Conv2d(2048, 4096, 5, 1, 2),
            nn.MaxPool2d(2))
        self.model2 = Sequential(
            nn.Flatten(),
            nn.Linear(4096*4*4, 5000),
            nn.Linear(5000, 64),
            nn.Linear(64, 10))

    def forward(self, x):
        x = self.model1(x)
        x = self.model2(x)
        return x

if __name__ == "__main__":
    # Parse args.local_rank (filled in by torch.distributed.launch)
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", help="local device id on current node", type=int)
    args = parser.parse_args()

    if torch.cuda.is_available():
        print(torch.cuda.device_count())
        print("Use GPU")
    else:
        print("can't use GPU")
        raise Exception("can't use GPU")

    # Initialization
    n_gpus = 2
    torch.distributed.init_process_group("nccl", world_size=n_gpus, rank=args.local_rank)
    torch.cuda.set_device(args.local_rank)  # set this process's default device

    train_data = torchvision.datasets.CIFAR10(root="./DataSet", train=True,
                                              transform=torchvision.transforms.ToTensor(),
                                              download=True)
    train_data_size = len(train_data)
    print("Size of the training set: {}".format(train_data_size))  # 50000
    # Split the dataset across processes, then load it
    train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, batch_size=64, sampler=train_sampler)  # batch_size is per process

    myModule = MyModule()
    # Put the model on args.local_rank and wrap it with DDP
    myModule = nn.parallel.DistributedDataParallel(myModule.cuda(args.local_rank), device_ids=[args.local_rank])
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.01
    optimize = torch.optim.SGD(myModule.parameters(), lr=learning_rate)
    train_step = 0
    test_step = 0
    epoch = 10

    for i in range(epoch):
        print("------ Epoch {} ------".format(i + 1))
        train_sampler.set_epoch(i)  # pass the current epoch index so each card reshuffles differently every epoch
        myModule.train()
        for data in train_dataloader:
            imgs, targets = data
            # Copy the data to args.local_rank
            imgs = imgs.cuda(args.local_rank)
            targets = targets.cuda(args.local_rank)
            starttime = time.time()
            outputs = myModule(imgs)
            loss = loss_fn(outputs, targets)
            optimize.zero_grad()
            loss.backward()
            optimize.step()
            endtime = time.time()
            train_step = train_step + 1
            if train_step % 1 == 0:
                print("Step: {}, Loss: {}, time: {}".format(train_step, loss.item(), endtime - starttime))
        # Save only when args.local_rank == 0
        if args.local_rank == 0:
            torch.save(myModule.state_dict(), "myModule_{}.pth".format(i))
            print("Model saved")
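Since the checkpoint is saved from the DDP-wrapped model, its state_dict keys also carry a "module." prefix. A minimal sketch of loading the last checkpoint ("myModule_9.pth", written by epoch 10 above) for plain single-device inference; the key-renaming line is my own addition.

import torch

state_dict = torch.load("myModule_9.pth", map_location="cpu")
# Strip the "module." prefix added by the DistributedDataParallel wrapper
state_dict = {k.replace("module.", "", 1): v for k, v in state_dict.items()}

model = MyModule()
model.load_state_dict(state_dict)
model.eval()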