NCCL Source Code Walkthrough (3): Intra-Node Topology Analysis



    Author | KIDGINBROOK

    Updated by | 潘丽晨

    The previous section showed how every node establishes its bootstrap network connections; next we look at topology analysis.

    GPU server architectures vary widely: one machine may have multiple NICs and multiple GPUs, and the links between devices differ from system to system. NCCL therefore analyzes the device interconnect topology inside the machine so that performance stays as good as possible on every topology.

    Continuing from last time, let's look at initTransportsRank.

     
     
    static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
      // We use 3 AllGathers
      // 1. { peerInfo, comm }
      // 2. ConnectTransport[nranks], ConnectValue[nranks]
      // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
      int rank = comm->rank;
      int nranks = comm->nRanks;
      uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
      TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
      NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
      // AllGather1 - begin
      struct {
        struct ncclPeerInfo peerInfo;
        struct ncclComm* comm;
      } *allGather1Data;
      NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
      allGather1Data[rank].comm = comm;
      struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
      NCCLCHECK(fillInfo(comm, myInfo, commHash));
      ...
    }

    This allocates nranks entries of allGather1Data and then fills in the current rank's peerInfo via fillInfo. ncclPeerInfo carries a rank's basic information, such as its rank number and which process on which machine it lives in.
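
    To make it concrete why this per-rank information is gathered, here is a minimal, hypothetical sketch (not NCCL code; the array is filled by hand instead of by the bootstrap allgather): once every rank holds all peers' ncclPeerInfo, it can, for example, compare hostHash values to tell intra-node peers from inter-node peers.

    // Hypothetical illustration, not NCCL source.
    #include <stdio.h>
    #include <stdint.h>

    // Trimmed-down stand-in for ncclPeerInfo, just enough for the demo.
    struct PeerInfoDemo {
      int rank;
      uint64_t hostHash;   // getHostHash() + commHash in the real code
    };

    int main(void) {
      // Pretend the allgather already ran: ranks 0-1 on host A, ranks 2-3 on host B.
      struct PeerInfoDemo peers[4] = { {0, 0xAAAA}, {1, 0xAAAA}, {2, 0xBBBB}, {3, 0xBBBB} };
      int myRank = 0;
      for (int r = 0; r < 4; r++) {
        int sameHost = (peers[r].hostHash == peers[myRank].hostHash);
        printf("peer %d: %s\n", r,
               sameHost ? "same host (P2P/SHM transports are candidates)"
                        : "remote host (network transport)");
      }
      return 0;
    }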

     
     
    struct ncclPeerInfo {
      int rank;
      int cudaDev;
      int gdrSupport;
      uint64_t hostHash;
      uint64_t pidHash;
      dev_t shmDev;
      int64_t busId;
    };

    static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
      info->rank = comm->rank;
      CUDACHECK(cudaGetDevice(&info->cudaDev));
      info->hostHash=getHostHash()+commHash;
      info->pidHash=getPidHash()+commHash;
      // Get the device MAJOR:MINOR of /dev/shm so we can use that
      // information to decide whether we can use SHM for inter-process
      // communication in a container environment
      struct stat statbuf;
      SYSCHECK(stat("/dev/shm", &statbuf), "stat");
      info->shmDev = statbuf.st_dev;
      info->busId = comm->busId;
      NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
      return ncclSuccess;
    }

    fillInfo records the current GPU's rank, its PCIe busId, and the device number of /dev/shm in ncclPeerInfo, then calls ncclGpuGdrSupport to check whether GDR is supported. The /dev/shm device number is saved so that NCCL can later tell whether two ranks really share the same shared-memory filesystem, which matters in container environments.
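
    As a quick illustration, here is a minimal standalone sketch of that /dev/shm probe (same stat() call as fillInfo; the major:minor printout is only for readability):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>   // major(), minor()

    int main(void) {
      struct stat statbuf;
      if (stat("/dev/shm", &statbuf) != 0) { perror("stat /dev/shm"); return 1; }
      // This is the value fillInfo stores in info->shmDev. Ranks that report
      // different devices here (e.g. containers with separate /dev/shm mounts)
      // do not actually share the same shared-memory filesystem.
      printf("/dev/shm device: %u:%u (st_dev=0x%lx)\n",
             major(statbuf.st_dev), minor(statbuf.st_dev),
             (unsigned long)statbuf.st_dev);
      return 0;
    }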

    Before RDMA communication can take place, a memory region has to be registered so that the NIC knows the virtual-to-physical address mapping. If every transfer first had to copy data from device memory into host memory, efficiency would suffer. InfiniBand therefore provides a peer memory interface that lets the IB NIC access other PCIe address spaces, and NVIDIA implements its own driver on top of it so that RDMA can register device memory directly. Communication then avoids the host/device copies, and the IB NIC can DMA device memory directly; this is GDR (GPUDirect RDMA).
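
    To make the difference concrete, here is a hedged sketch of the registration step (not NCCL code; it needs libibverbs, the CUDA runtime, an IB NIC, and the nv_peermem module for the device-memory registration to actually succeed). Without GDR, data has to be staged through a registered host bounce buffer; with GDR, the cudaMalloc'd pointer itself is handed to ibv_reg_mr so the NIC can DMA device memory directly.

    // Sketch only: compile along the lines of
    //   gcc gdr_probe.c -libverbs -lcudart   (CUDA include/lib paths may need to be added)
    #include <stdio.h>
    #include <stdlib.h>
    #include <infiniband/verbs.h>
    #include <cuda_runtime.h>

    #define BUF_SIZE (1 << 20)

    int main(void) {
      int n = 0;
      struct ibv_device** devs = ibv_get_device_list(&n);
      if (devs == NULL || n == 0) { fprintf(stderr, "no IB device found\n"); return 1; }
      struct ibv_context* ctx = ibv_open_device(devs[0]);
      struct ibv_pd* pd = ibv_alloc_pd(ctx);

      // Path 1: no GDR. Register a host bounce buffer; every transfer would
      // first need a cudaMemcpy between device memory and this buffer.
      void* hostBuf = malloc(BUF_SIZE);
      struct ibv_mr* hostMr = ibv_reg_mr(pd, hostBuf, BUF_SIZE, IBV_ACCESS_LOCAL_WRITE);
      printf("host memory registration: %s\n", hostMr ? "ok" : "failed");

      // Path 2: GDR. Hand the device pointer straight to ibv_reg_mr. With
      // nv_peermem loaded, the NIC gets a mapping of the GPU memory and can
      // DMA it directly, skipping the host copy.
      void* gpuBuf = NULL;
      cudaMalloc(&gpuBuf, BUF_SIZE);
      struct ibv_mr* gpuMr = ibv_reg_mr(pd, gpuBuf, BUF_SIZE, IBV_ACCESS_LOCAL_WRITE);
      printf("device memory registration (GDR): %s\n", gpuMr ? "ok" : "failed");

      if (gpuMr) ibv_dereg_mr(gpuMr);
      if (hostMr) ibv_dereg_mr(hostMr);
      cudaFree(gpuBuf);
      free(hostBuf);
      ibv_dealloc_pd(pd);
      ibv_close_device(ctx);
      ibv_free_device_list(devs);
      return 0;
    }

    This is essentially what ncclGpuGdrSupport below probes for, except that it goes through the ncclNet plugin interface instead of calling ibverbs directly.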

     
     
    static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
      int netDevs;
      NCCLCHECK(ncclNetDevices(&netDevs));
      *gdrSupport = 0;
      for (int dev=0; dev<netDevs; dev++) {
        // Find a net device which is GDR-capable
        ncclNetProperties_t props;
        NCCLCHECK(ncclNet->getProperties(dev, &props));
        if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;

        // Allocate memory on the GPU and try to register it on the NIC.
        void *lComm = NULL, *sComm = NULL, *rComm = NULL;
        ncclNetHandle_t handle;
        void* gpuPtr = NULL;
        void* mHandle = NULL;
        NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
        NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
        NCCLCHECK(ncclNetAccept(lComm, &rComm));
        CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
        ncclDebugNoWarn = NCCL_NET;
        if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
          NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
          NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
          NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
          *gdrSupport = 1;
        }
        ncclDebugNoWarn = 0;
        CUDACHECK(cudaFree(gpuPtr));
        NCCLCHECK(ncclNetCloseRecv(rComm));
        NCCLCHECK(ncclNetCloseSend(sComm));
        NCCLCHECK(ncclNetCloseListen(lComm));
        break;
      }
      return ncclSuccess;
    }

    This iterates over every NIC and queries its properties; as we saw in the first article, ncclNet here is ncclNetIb.

     
     
    ncclResult_t ncclIbGdrSupport(int ibDev) {
      static int moduleLoaded = -1;
      if (moduleLoaded == -1) {
        moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
      }
      if (moduleLoaded == 0) return ncclSystemError;
      return ncclSuccess;
    }

    ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
      props->name = ncclIbDevs[dev].devName;
      props->pciPath = ncclIbDevs[dev].pciPath;
      props->guid = ncclIbDevs[dev].guid;
      props->ptrSupport = NCCL_PTR_HOST;
      if (ncclIbGdrSupport(dev) != ncclSuccess) {
        INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
      } else {
        props->ptrSupport |= NCCL_PTR_CUDA;
      }
      props->speed = ncclIbDevs[dev].speed;
      props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
      props->maxComms = ncclIbDevs[dev].maxQp;
      return ncclSuccess;
    }

    This mainly fills in the NIC's name, PCIe path, guid, and other properties, and then checks for /sys/kernel/mm/memory_peers/nv_mem/version to determine whether nv_peermem, NVIDIA's peer memory driver, is installed. If it is, props->ptrSupport |= NCCL_PTR_CUDA is set, indicating that device memory can be registered.
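
    The same probe can be reproduced outside NCCL; here is a minimal sketch (it checks only the sysfs path used in the source above, so a negative result only means this particular module is not loaded):

    #include <stdio.h>
    #include <unistd.h>

    // Standalone version of the check in ncclIbGdrSupport: when the nv_peermem
    // module is loaded it registers itself under /sys/kernel/mm/memory_peers/.
    int main(void) {
      const char* path = "/sys/kernel/mm/memory_peers/nv_mem/version";
      if (access(path, F_OK) == 0)
        printf("nv_peermem loaded: the IB plugin can advertise NCCL_PTR_CUDA\n");
      else
        printf("nv_peermem not found: only NCCL_PTR_HOST is advertised\n");
      return 0;
    }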

    Original post: https://blog.csdn.net/OneFlow_Official/article/details/130418182