使用PyTorch测试CUDA设备的性能（单卡或多卡并行）

以下CUDA设备泛指NVIDIA显卡或启用ROCm的AMD显卡

测试环境：
- Distributor ID: Ubuntu
- Description: Ubuntu 22.04.3 LTS
- Release: 22.04
- Codename: jammy

1.首先，简单使用torch.ones测试CUDA设备

import torch
import time

def cuda_benchmark(device_id, N=1000000, iterations=10000):
    """Time repeated in-place additions on a single CUDA device.

    Args:
        device_id: Index of the CUDA device to benchmark.
        N: Number of elements in the benchmark tensor.
        iterations: Number of ``data += 1`` operations executed inside the
            timed region.

    Returns:
        Elapsed wall-clock time of the timed region, in seconds.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; cannot run the benchmark")

    # Select the device that all following CUDA work runs on.
    torch.cuda.set_device(device_id)
    device = torch.device("cuda", torch.cuda.current_device())

    # Allocate the input directly on the GPU instead of building it on the
    # host and copying it over.
    data = torch.ones(N, device=device)

    # Warm up on a scratch tensor so one-time start-up cost is not charged to
    # the timed loop, and so that `data` itself stays untouched.
    scratch = torch.zeros(1, device=device)
    scratch += 1
    # Kernel launches are asynchronous: drain pending work before timing.
    torch.cuda.synchronize(device)

    start_time = time.perf_counter()
    for _ in range(iterations):
        data += 1
    torch.cuda.synchronize(device)  # wait until every queued kernel finished
    end_time = time.perf_counter()
    elapsed = end_time - start_time

    # Only the first ten values are reported, so only those are copied back
    # to host memory.
    result = data[:10].cpu().numpy()

    # Report the benchmark result and the elapsed time.
    print(f"Benchmark结果：{result}")
    print(f"执行时间：{elapsed} 秒")
    return elapsed


if __name__ == '__main__':
    # Benchmark the first GPU. The body is indented with spaces only; mixing
    # tabs and spaces here makes Python reject the file.
    device_id = 0
    cuda_benchmark(device_id, 10000000)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

2.使用自定义的CUDABenchmarkModel（配合nn.DataParallel）测试CUDA设备

import torch
import torch.nn as nn
import time

class CUDABenchmarkModel(nn.Module):
    """Minimal benchmark network: a single 10-in / 10-out linear layer on CUDA."""

    def __init__(self):
        super().__init__()
        linear_layer = nn.Linear(10, 10)
        # The layer is moved to the current CUDA device at construction time.
        self.fc = linear_layer.cuda()

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        projected = self.fc(x)
        return projected

def cuda_benchmark(device_ids, N=10000000, iterations=10000):
    """Time repeated forward passes of ``CUDABenchmarkModel`` under DataParallel.

    Args:
        device_ids: CUDA device indices to spread each batch across. ``None``
            or an empty sequence selects every visible CUDA device.
        N: Number of rows in the ``(N, 10)`` input batch.
        iterations: Number of forward passes executed inside the timed region.

    Returns:
        Elapsed wall-clock time of the timed region, in seconds.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; cannot run the benchmark")

    if device_ids:
        device_ids = list(device_ids)
    else:
        device_ids = list(range(torch.cuda.device_count()))

    def sync_all():
        # Every participating device has its own work queue; wait for all.
        for dev in device_ids:
            torch.cuda.synchronize(dev)

    # DataParallel requires the module's parameters to live on device_ids[0],
    # so the model and the input are created with that device made current.
    with torch.cuda.device(device_ids[0]):
        model = nn.DataParallel(CUDABenchmarkModel(), device_ids=device_ids)
        data = torch.ones(N, 10, device="cuda")

        # No backward pass is ever run, so autograd recording is switched off.
        with torch.no_grad():
            # One untimed pass so one-time start-up cost is not measured.
            model(data)
            sync_all()

            start_time = time.perf_counter()
            for _ in range(iterations):
                model(data)
            sync_all()  # wait until all queued CUDA work has completed
            end_time = time.perf_counter()

    elapsed = end_time - start_time
    # Report the elapsed time.
    print(f"执行时间：{elapsed} 秒")
    return elapsed

if __name__ == '__main__':
    # Benchmark three GPUs at once. The body is indented with spaces only;
    # mixing tabs and spaces here makes Python reject the file.
    device_ids = [0, 1, 2]
    cuda_benchmark(device_ids=device_ids)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

3.使用NCCL后端以多进程的方式测试CUDA设备

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import time

def cuda_benchmark(device_id, N=10000000, iterations=10000):
    """Time repeated in-place additions on a single CUDA device.

    Args:
        device_id: Index of the CUDA device to benchmark.
        N: Number of elements in the benchmark tensor.
        iterations: Number of ``data += 1`` operations executed inside the
            timed region.

    Returns:
        Elapsed wall-clock time of the timed region, in seconds.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available; cannot run the benchmark")

    # Select the device that all following CUDA work runs on.
    torch.cuda.set_device(device_id)
    device = torch.device("cuda", torch.cuda.current_device())

    # NOTE: multi_processor_count is the number of multiprocessors on the
    # device, not the number of individual cores; the message text is kept
    # as it was.
    print(f"该GPU的核心数量为：{torch.cuda.get_device_properties(device_id).multi_processor_count}")

    # Allocate the input directly on the GPU instead of building it on the
    # host and copying it over.
    data = torch.ones(N, device=device)

    # Warm up on a scratch tensor so one-time start-up cost is not charged to
    # the timed loop, and so that `data` itself stays untouched.
    scratch = torch.zeros(1, device=device)
    scratch += 1
    # Kernel launches are asynchronous: drain pending work before timing.
    torch.cuda.synchronize(device)

    start_time = time.perf_counter()
    for _ in range(iterations):
        data += 1
    torch.cuda.synchronize(device)  # wait until every queued kernel finished
    end_time = time.perf_counter()
    elapsed = end_time - start_time

    # Only the first ten values are reported, so only those are copied back
    # to host memory.
    result = data[:10].cpu().numpy()

    # Report the benchmark result and the elapsed time.
    print(f"Benchmark结果：{result}")
    print(f"执行时间：{elapsed} 秒")
    return elapsed

def main(num):
    """Start ``num`` worker processes that each execute ``run``.

    ``num`` is used both as the process count and as the extra argument
    forwarded to every worker.
    """
    worker_args = (num,)
    mp.spawn(run, args=worker_args, nprocs=num)

def run(rank, world_size):
    """Entry point of each spawned worker process.

    Args:
        rank: Index of this process; also used as the index of the GPU this
            process benchmarks.
        world_size: Total number of processes in the group.
    """
    # Bind this process to its own GPU before the NCCL process group is
    # created, so that communicator set-up uses the intended device.
    torch.cuda.set_device(rank)

    # Join the process group shared by all workers.
    dist.init_process_group(
        "nccl",
        init_method="tcp://127.0.0.1:23456",
        rank=rank,
        world_size=world_size,
    )
    try:
        # One process per GPU: each worker runs the benchmark on device `rank`.
        cuda_benchmark(rank)
    finally:
        # Always release the process group, even if the benchmark raised.
        dist.destroy_process_group()

if __name__ == '__main__':
    # Launch three processes, one per GPU. The body is indented with spaces
    # only; mixing tabs and spaces here makes Python reject the file.
    device_numbers = 3
    main(device_numbers)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

相关阅读:
《PostgreSQL备份与恢复：步骤与最佳实践》
Cloud Foundry 4：应用程序的生命周期
大模型相关资料整理
决策树分类器（保姆级教学）定义+特性+原理及公式+鸢尾花分类经典问题示例（完整Python代码带详细注释、保姆级分部代码解释及结果说明、决策树可视化及解释）
2022年数维杯国际赛C题如何利用大脑结构诊断阿尔茨海默氏病
sqoop ETL工具
H - XYZZY(spfa最大路径,判断正环)
四川云汇优想：短视频矩阵运营方案
快排三种递归及其优化，非递归和三路划分
C嘎嘎 - 类和对象

原文地址：https://blog.csdn.net/qq_42183962/article/details/134450693