To implement multi-GPU distributed training in PyTorch, you can use the torch.nn.parallel.DistributedDataParallel class. Here is a simple example:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Initialize the process group
world_size = 2           # number of GPUs to use
rank = 0                 # rank of the current process (starting from 0)
master_ip = 'localhost'  # IP address of the master node
master_port = '12345'    # port of the master node
torch.distributed.init_process_group(backend='nccl',
                                     init_method=f'tcp://{master_ip}:{master_port}',
                                     world_size=world_size, rank=rank)

# Define the model
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# Create the model on this process's GPU and wrap it with DistributedDataParallel
device = torch.device(f'cuda:{rank}')
model = Net().to(device)
model = DDP(model, device_ids=[rank])

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

# Prepare the dataset; DistributedSampler splits it across processes
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, sampler=train_sampler)

# Training loop
for epoch in range(10):
    train_sampler.set_epoch(epoch)  # reshuffle the per-process shards each epoch
    running_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch}: Loss: {running_loss / len(train_loader)}')

# Clean up the process group
torch.distributed.destroy_process_group()

Note: in practice this example has to run as multiple processes, one per GPU. To do that, launch the script from the command line with python -m torch.distributed.launch or torchrun. For example:
python -m torch.distributed.launch --nproc_per_node=2 your_script.py

or

torchrun --nproc_per_node=2 your_script.py

This starts two processes, each running on one GPU. Make sure your system has enough GPU resources.
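When the script is started with torchrun, each process receives environment variables such as RANK, LOCAL_RANK, and WORLD_SIZE, so the rank and world size in the example above do not have to be hardcoded. Below is a minimal sketch of how the initialization could read them from the environment instead; it assumes torchrun's standard variables and one GPU per process.

import os
import torch

# torchrun sets RANK, LOCAL_RANK and WORLD_SIZE (plus MASTER_ADDR/MASTER_PORT) for each process
rank = int(os.environ['RANK'])              # global rank of this process
local_rank = int(os.environ['LOCAL_RANK'])  # GPU index on this machine
world_size = int(os.environ['WORLD_SIZE'])  # total number of processes

# init_method='env://' tells PyTorch to read the master address and port from the environment
torch.distributed.init_process_group(backend='nccl', init_method='env://',
                                     world_size=world_size, rank=rank)

torch.cuda.set_device(local_rank)
device = torch.device(f'cuda:{local_rank}')
# ...build the model, wrap it with DDP(model, device_ids=[local_rank]), and train as above...

With this pattern the same script works unchanged for any number of GPUs, because torchrun fills in the environment for every process it launches.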