NCCL Source Code Analysis ④: Building the Graph



    Author | KIDGINBROOK
    Updated | 潘丽晨

    Last time we analyzed how NCCL performs topology detection on the machine's PCI system, producing its result in XML format. Next, NCCL builds a graph from that XML so that it can perform path search later.
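    For reference, the XML from that step looks roughly like the fragment below. It is heavily abbreviated and all attribute values here are illustrative, not taken from a real machine; the attribute names, however, are exactly the ones the code below reads:

    <system version="1">
      <cpu numaid="0" affinity="0000ffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
        <pci busid="0000:11:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
          <pci busid="0000:13:00.0" class="0x030200" link_speed="8 GT/s" link_width="16">
            <gpu dev="0" sm="70" rank="0" gdr="1">
              <nvlink target="0000:17:00.0" count="2" tclass="0x030200"/>
            </gpu>
          </pci>
        </pci>
      </cpu>
    </system>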

    At the end of ncclTopoGetSystem, ncclTopoGetSystemFromXml is executed to convert the XML representation into a graph.

     
     
    ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
      NCCLCHECK(ncclCalloc(topoSystem, 1));
      struct ncclXmlNode* topNode;
      NCCLCHECK(xmlFindTag(xml, "system", &topNode));
      for (int s=0; s<topNode->nSubs; s++) {
        struct ncclXmlNode* node = topNode->subs[s];
        if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
      }
      NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
      NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
      NCCLCHECK(ncclTopoSortSystem(*topoSystem));
      return ncclSuccess;
    }

    This fetches the root node "system" from the XML, then iterates over its "cpu" children and builds the graph for each one via ncclTopoAddCpu; each "cpu" here actually corresponds to a NUMA node.
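    Before diving into ncclTopoAddCpu, it helps to know the structures being populated. Below is a simplified sketch of the graph data structures, based on src/graph/topo.h of the NCCL version discussed here, with some fields omitted; treat it as a sketch, not the exact header:

    // Simplified sketch of the graph structures; see src/graph/topo.h for the real thing.
    struct ncclTopoLink {
      int type;                     // LINK_LOC / LINK_NVL / LINK_PCI / LINK_QPI / LINK_NET
      float width;                  // bandwidth in GB/s
      struct ncclTopoNode* remNode; // the node this edge points to
    };

    struct ncclTopoNode {
      int type;    // GPU / PCI / NVS / CPU / NIC / NET
      int64_t id;  // busId for PCI devices, numaId for CPUs, dev for NETs
      union {      // type-specific payload
        struct { int dev; int rank; int cudaCompCap; int gdrSupport; } gpu;
        struct { int arch; int vendor; int model; cpu_set_t affinity; } cpu;
        struct { uint64_t asic; int port; float width; int gdrSupport; int collSupport; int maxChannels; } net;
      };
      int nlinks;
      struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // edges, kept sorted by width, descending
    };

    struct ncclTopoNodeSet {
      int count;
      struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
    };

    struct ncclTopoSystem {
      struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; // one array of nodes per type
      float maxWidth;
    };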

     
     
    ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
      int numaId;
      NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
      struct ncclTopoNode* cpu;
      NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
      const char* str;
      NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
      if (str != NULL) {
        NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
      }
      NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
      NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
      if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
        NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
        NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
        if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
          int familyId, modelId;
          NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
          NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
          cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
        }
      }
      for (int s=0; s<xmlCpu->nSubs; s++) {
        struct ncclXmlNode* node = xmlCpu->subs[s];
        if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
        if (strcmp(node->name, "nic") == 0) {
          struct ncclTopoNode* nic = NULL;
          NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
          if (nic == NULL) {
            NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
            NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
            NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
          }
          NCCLCHECK(ncclTopoAddNic(node, system, nic));
        }
      }
      return ncclSuccess;
    }

    ncclTopoAddCpu creates a CPU node whose id is the numaid, sets the CPU affinity (i.e., the cores belonging to this NUMA node), and records information such as the CPU vendor.
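    Nodes are stored in the per-type arrays and looked up by id. Here is a simplified sketch of the two helpers used throughout this file (error handling and a few GPU field initializations trimmed; a sketch based on the same source file, not a verbatim copy):

    // Sketch: find an existing node of the given type by id; *node stays NULL if absent.
    ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
      for (int i=0; i<system->nodes[type].count; i++) {
        if (system->nodes[type].nodes[i].id == id) {
          *node = system->nodes[type].nodes+i;
          return ncclSuccess;
        }
      }
      return ncclSuccess;
    }

    // Sketch: append a new node of the given type/id to the per-type array.
    ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
      struct ncclTopoNode* n = system->nodes[type].nodes + system->nodes[type].count++;
      n->type = type;
      n->id = id;
      if (type == GPU) {
        // GPUs start with a local link to themselves.
        n->nlinks = 1;
        n->links[0].type = LINK_LOC;
        n->links[0].remNode = n;
        n->links[0].width = LOC_WIDTH;
      }
      *node = n;
      return ncclSuccess;
    }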

    It then iterates over the cpu node's children and dispatches on their type; for a pci child, it calls ncclTopoAddPci.

     
     
    ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
      const char* str;
      int type;
      NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
      NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
      int64_t busId;
      NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
      NCCLCHECK(busIdToInt64(str, &busId));
      struct ncclTopoNode* node = NULL;
      if (type == GPU) {
        struct ncclXmlNode* xmlGpu;
        NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
        if (xmlGpu == NULL) return ncclSuccess;
        int index;
        NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
        if (index == -1) return ncclSuccess;
        NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
        NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
      }
      if (type == NIC) {
        struct ncclXmlNode* xmlNic;
        NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
        if (xmlNic == NULL) return ncclSuccess;
        // Ignore sub device ID and merge multi-port NICs into one PCI device.
        busId &= 0xfffffffffffffff0;
        struct ncclTopoNode* nicNode = NULL;
        NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
        if (nicNode == NULL) {
          NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
          node = nicNode; // Connect it to parent later on
        }
        NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
      } else if (type == PCI) {
        NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
        for (int s=0; s<xmlPci->nSubs; s++) {
          struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
          NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
        }
      }
      if (node) {
        int width, speed;
        NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
        NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
        // Manage cases where speed was not indicated in /sys
        if (width == 0) width = 16;
        NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
        NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
        NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
      }
      return ncclSuccess;
    }

    ncclTopoAddPci first reads the PCI node's type and busId, then branches on the type. If the type is PCI (a bridge), it creates a PCI node and calls ncclTopoAddPci recursively until it reaches a NIC or GPU XML node.
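    A note on the edge bandwidth computed at the end: width*speed/80.0 converts to GB/s. kvDictPciGen stores the per-lane speed in units of 100 Mb/s (in the version discussed here, "8 GT/s", i.e., PCIe Gen3, maps to 60; treat the exact constant as version-dependent), so a Gen3 x16 link gets 16*60/80.0 = 12 GB/s, the value NCCL conventionally assumes for Gen3 x16.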

    If it encounters a NIC instead, it creates a NIC node and calls ncclTopoAddNic, which iterates over the xml net nodes under the xml nic, creates a NET node for each one with id = dev, and fills in attributes such as speed, port, and gdr.

     
     
    ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
      int dev;
      NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
      struct ncclTopoNode* net;
      NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
      const char* str;
      NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
      if (str) sscanf(str, "0x%lx", &net->net.asic);
      else net->net.asic = dev;
      ncclDebugNoWarn = NCCL_GRAPH;
      int mbps;
      if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
      if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
      net->net.width = mbps / 8000.0;
      if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
      if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
      if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
      if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
      ncclDebugNoWarn = 0;
      NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
      NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
      return ncclSuccess;
    }

    ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
      for (int s=0; s<xmlNic->nSubs; s++) {
        struct ncclXmlNode* xmlNet = xmlNic->subs[s];
        if (strcmp(xmlNet->name, "net") != 0) continue;
        int index;
        NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
        if (index == -1) continue;
        NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
      }
      return ncclSuccess;
    }

    ncclTopoConnectNodes then creates the forward and reverse edges between the NET node and the NIC node, setting the edge type and accumulating the bandwidth on the edge (net.width is mbps / 8000.0, so a 100 Gb/s NIC gets 100000 / 8000 = 12.5 GB/s); each node's edge list is kept sorted by bandwidth in descending order.

     
     
    ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
      // Aggregate links into higher width for NVLink
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode == remNode && link->type == type) break;
      }
      if (link->remNode == NULL) node->nlinks++;
      link->type = type;
      link->remNode = remNode;
      link->width += width;
      // Sort links in BW descending order
      struct ncclTopoLink linkSave;
      memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
      while (link != node->links) {
        if ((link-1)->width >= linkSave.width) break;
        memcpy(link, link-1, sizeof(struct ncclTopoLink));
        link--;
      }
      memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
      return ncclSuccess;
    }
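    Two details are worth noting: calling the function again for the same (remNode, type) pair does not add a second edge but adds the widths together (this is how multiple NVLinks between the same pair of GPUs are aggregated into one wide edge), and the insertion sort keeps the edge list in descending width order. A minimal standalone demo of that logic, using hypothetical, trimmed-down types rather than NCCL's real headers:

    #include <stdio.h>

    // Hypothetical stand-ins for NCCL's types, just to demonstrate the logic.
    #define MAX_LINKS 8
    struct Node;
    struct Link { int type; float width; struct Node* remNode; };
    struct Node { const char* name; int nlinks; struct Link links[MAX_LINKS+1]; };

    // Same accumulate-then-insertion-sort logic as ncclTopoConnectNodes.
    void connectNodes(struct Node* node, struct Node* remNode, int type, float width) {
      struct Link* link;
      for (link = node->links; link->remNode; link++)
        if (link->remNode == remNode && link->type == type) break;
      if (link->remNode == NULL) node->nlinks++;
      link->type = type;
      link->remNode = remNode;
      link->width += width;            // duplicate edges aggregate bandwidth
      struct Link save = *link;
      while (link != node->links && (link-1)->width < save.width) {
        *link = *(link-1);             // shift narrower links one slot right
        link--;
      }
      *link = save;                    // edge list stays sorted by width, descending
    }

    int main() {
      struct Node gpu0 = { "gpu0", 0, {{0}} }, gpu1 = { "gpu1", 0, {{0}} }, cpu = { "cpu", 0, {{0}} };
      connectNodes(&gpu0, &cpu,  2 /* "PCI" */, 12.0);
      connectNodes(&gpu0, &gpu1, 1 /* "NVL" */, 21.0);  // first NVLink
      connectNodes(&gpu0, &gpu1, 1 /* "NVL" */, 21.0);  // second NVLink: same edge, width becomes 42
      for (int l=0; l<gpu0.nlinks; l++)
        printf("gpu0 -> %s type=%d width=%g\n", gpu0.links[l].remNode->name, gpu0.links[l].type, gpu0.links[l].width);
      return 0;  // prints the NVL edge (width 42) before the PCI edge (width 12)
    }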

    With that, the NIC is fully added. Back in ncclTopoAddPci, if the device is a GPU, a GPU node is created and its rank, dev, gdr, and other attributes are filled in. Finally, ncclTopoConnectNodes creates bidirectional edges between the newly created node and its parent.
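    For reference, ncclTopoAddGpu itself is short; roughly (a sketch following the same source file):

    // Sketch of ncclTopoAddGpu: copy the GPU attributes from the XML node.
    ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
      NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
      NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
      NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
      NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
      // NVLinks are not handled here; they are added in a later pass over the XML tree.
      return ncclSuccess;
    }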

    This completes the graph under each NUMA node. Next, NVLink and QPI edges are added to connect these subgraphs; let's look at NVLink first.

     
     
    ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
      if (strcmp(node->name, "nvlink") == 0) {
        struct ncclTopoNode* gpu = NULL;
        int64_t pBusId;
        NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
        NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
        if (gpu == NULL) {
          WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
          return ncclInternalError;
        }
        int count;
        NCCLCHECK(xmlGetAttrInt(node, "count", &count));
        const char* targetClass;
        NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
        int targetType;
        NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
        struct ncclTopoNode* remote = NULL;
        if (targetType == GPU) {
          // NVL P2P connection to another GPU
          const char* target;
          NCCLCHECK(xmlGetAttrStr(node, "target", &target));
          int64_t busId;
          NCCLCHECK(busIdToInt64(target, &busId));
          NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
        } else if (targetType == CPU) {
          // NVL connection to the local CPU
          NCCLCHECK(findLocalCpu(gpu, &remote));
        } else {
          if (system->nodes[NVS].count == 0) {
            NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
          } else {
            remote = system->nodes[NVS].nodes;
          }
        }
        if (remote) {
          int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
          NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
          if (remote->type != GPU) {
            NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
          }
        }
      } else {
        const char* busId;
        NCCLCHECK(xmlGetAttr(node, "busid", &busId));
        for (int s=0; s<node->nSubs; s++) {
          NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
        }
      }
      return ncclSuccess;
    }

    ncclTopoAddNvLinks recurses down from the root node until it hits an nvlink XML node, then fetches that node's parent, i.e., the GPU node, and uses tclass to determine the PCI device type of the peer. If the peer is a GPU or CPU, the corresponding node is looked up directly; if it is an NVSwitch, an NVS node is created first (or reused if one already exists). Edges between the current GPU and the peer are then created in both directions (for a GPU peer only the gpu-to-remote edge is added here; the reverse edge is created when the peer GPU's own nvlink entries are processed). Afterwards, ncclTopoConnectCpus connects every pair of CPU nodes.
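    For completeness, sketches of the two helpers referenced above (LINK_QPI and QPI_WIDTH as used by this NCCL version; treat the constants as version-dependent):

    // Sketch of findLocalCpu: follow PCI links until a CPU node is reached.
    static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
      *cpu = NULL;
      if (node->type == CPU) { *cpu = node; return ncclSuccess; }
      for (int l=0; l<node->nlinks; l++) {
        if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
        if (*cpu != NULL) return ncclSuccess;
      }
      return ncclSuccess;
    }

    // Sketch of ncclTopoConnectCpus: add a QPI edge between every ordered pair of CPU nodes.
    ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
      for (int n=0; n<system->nodes[CPU].count; n++)
        for (int p=0; p<system->nodes[CPU].count; p++)
          if (n != p)
            NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, QPI_WIDTH));
      return ncclSuccess;
    }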

    Finally, to make the subsequent channel search easier, ncclTopoSort recursively orders each PCI node's edges as: NVLink first, then downstream PCI, then upstream PCI, then QPI. Since edges were already sorted by bandwidth as they were added, NVLink is guaranteed to come first and QPI last, so only the PCI edges in the middle need adjusting: the sort simply moves the upstream link to the end.

     
     
    static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
      // Shift all links to have upLink as last link
      if (upNode) {
        int l=0;
        while (node->links[l].remNode != upNode) l++;
        struct ncclTopoLink upLink;
        memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
        while (node->links[l+1].remNode) {
          memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
          l++;
        }
        memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
      }
      // Recursively sort the PCI tree
      for (int l=0; l<node->nlinks; l++) {
        struct ncclTopoLink* link = node->links+l;
        if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
      }
      return ncclSuccess;
    }
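    ncclTopoSortSystem just kicks off this recursion from each CPU node, each of which roots a PCI tree (sketch based on the same source file):

    // Sketch of ncclTopoSortSystem: sort the PCI tree hanging off each CPU (NUMA) node.
    ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
      for (int n=0; n<system->nodes[CPU].count; n++)
        NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
      return ncclSuccess;
    }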

    That completes the whole graph construction. To summarize: the XML produced by topology detection is not convenient for path search, so this installment converted it into a graph of the PCI system.

    Try OneFlow: https://github.com/Oneflow-Inc/oneflow/

    Original article: https://blog.csdn.net/OneFlow_Official/article/details/131428044