• NCCL源码解析⑥:Channel搜索


    b47d799c4f110d502bd2060dcecc864e.png

    作者|KIDGINBROOK
    更新|潘丽晨

    上节讲到已经计算出GPU和NIC节点到其他任意节点的最优路径了,本节看下NCCL中channel的搜索过程。

    NCCL中channel的概念表示一个通信路径,为了更好地利用带宽和网卡,以及同一块数据可以通过多个channel并发通信,另外后续可以看到一个channel对应了一个GPU SM,所以基于这些原因,NCCL会使用多channel,搜索的过程就是搜索出来一组channel。

    如上节所述,单机的情况下会在ncclTopoTrimSystem函数里删除网卡,因此我们先看下单机八卡这种简化的情况,最后再看下多机引入网卡之后的情况。

    1. static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
    2. float maxWidth = 0.0;
    3. for (int i=0; inodes[type].count; i++) {
    4. struct ncclTopoLinkList* path = gpu->paths[type]+i;
    5. float width = path->width;
    6. if (path->count == 0) continue;
    7. maxWidth = std::max(maxWidth, width);
    8. }
    9. return maxWidth;
    10. }
    11. ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
    12. system->maxWidth = 0.0;
    13. int inter = system->nodes[NET].count;
    14. if (inter == 0 && system->nodes[GPU].count == 1) {
    15. system->maxWidth = LOC_WIDTH;
    16. return ncclSuccess;
    17. }
    18. for (int g=0; gnodes[GPU].count; g++) {
    19. struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
    20. system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
    21. }
    22. return ncclSuccess;
    23. }

    ncclTopoSearchInit就是初始化system->maxWidth,如果是单机单卡的情况,那么maxWidth设置为LOC_WIDTH,否则就遍历每个GPU节点,查看到其他所有GPU节点或者网卡最大带宽。

    1. struct ncclTopoGraph ringGraph;
    2. ringGraph.id = 0;
    3. ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
    4. ringGraph.crossNic = ncclParamCrossNic();
    5. ringGraph.collNet = 0;
    6. ringGraph.minChannels = 1;
    7. ringGraph.maxChannels = MAXCHANNELS/2;
    8. NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
    9. NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));

    nccl执行集合通信时支持ring,tree和collnet三种算法,这里我们以ring来举例,后续专门介绍ring和tree。

    1. struct ncclTopoGraph {
    2. // Input / output
    3. int id; // ring : 0, tree : 1, collnet : 2
    4. int pattern;
    5. int crossNic;
    6. int collNet;
    7. int minChannels;
    8. int maxChannels;
    9. // Output
    10. int nChannels; // 搜索到的channel数量
    11. float speedIntra; // 节点内单个channel带宽
    12. float speedInter; // 节点间单个channel带宽
    13. int typeIntra; // 节点内channel的路径类型
    14. int typeInter; // 节点间channel的路径类型
    15. int sameChannels; // channel是否一样
    16. int nHops;
    17. int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; // 节点内每个channel路径
    18. int inter[MAXCHANNELS*2]; // 节点间每个channel路径
    19. };

    ncclTopoGraph记录了搜索到的结果,具体含义见注释。

    然后看下ncclTopoCompute,这里就是实际搜索channel的过程,目标是搜索出来尽可能多,带宽尽可能大的一系列channel,本质就是暴力搜索,先设置一系列的条件搜答案,如果搜不出来则降低条件继续搜。

    由于此时没有NET节点,所以crossNic为0,然后初始化graph,首先设置最高的条件,限制节点内部只能使用不超过PATH_NVL路径,节点间只能使用不超过PATH_PIX的路径,然后通过system-maxWidth设置speedIntra和speedInter,接着执行ncclTopoSearchRec搜索出一个答案存储到tmpGraph中。

    如果此时就是最优的结果,channel数等于maxChannel,并且speedInter也等于maxWidth,则直接退出,否则就开始逐步降低条件,比如将sameChannel设置为0,允许channel之间不一样;调大typeIntra和typeInter;允许crossNic;调小speedInter和speedIntra。

    1. ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
    2. int ngpus = system->nodes[GPU].count;
    3. int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
    4. graph->speedIntra = graph->speedInter = 0;
    5. if (graph->crossNic == 2) graph->crossNic = 0;
    6. graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
    7. graph->typeInter = PATH_PIX;
    8. graph->nChannels = 0;
    9. graph->sameChannels = 1;
    10. if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
    11. struct ncclTopoGraph tmpGraph;
    12. memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
    13. // First try crossnic, then decrease speed and finally increase speedIntra.
    14. tmpGraph.pattern = graph->pattern;
    15. int pass = 1;
    16. int speedIndex = 0;
    17. while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
    18. tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
    19. int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
    20. search:
    21. int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
    22. tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
    23. tmpGraph.nChannels = 0;
    24. globalTimeout -= time;
    25. NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
    26. // Optimal solution, stop here
    27. if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
    28. if (pass == 1) {
    29. // First pass, we don't have a solution yet ; try other options
    30. // Try having different channels
    31. if (tmpGraph.sameChannels == 1) {
    32. tmpGraph.sameChannels = 0;
    33. goto search;
    34. }
    35. tmpGraph.sameChannels = 1;
    36. if (time != -1) globalTimeout += time;
    37. else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
    38. if (globalTimeout < 0) goto done;
    39. int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
    40. if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
    41. tmpGraph.typeIntra += 1;
    42. goto search;
    43. }
    44. tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
    45. if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
    46. tmpGraph.typeInter += 1;
    47. goto search;
    48. }
    49. tmpGraph.typeInter = PATH_PIX;
    50. // Try a simpler tree
    51. if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
    52. tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
    53. goto search;
    54. }
    55. if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
    56. tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
    57. goto search;
    58. }
    59. tmpGraph.pattern = graph->pattern;
    60. if (crossNic && tmpGraph.crossNic == 0) {
    61. // Try again with crossNic if permitted
    62. tmpGraph.crossNic = crossNic;
    63. goto search;
    64. }
    65. tmpGraph.crossNic = 0;
    66. // Decrease speed until we find a solution
    67. if ((speedIndex < NSPEEDS-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
    68. tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
    69. goto search;
    70. }
    71. speedIndex = 0;
    72. while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
    73. tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
    74. }
    75. done:
    76. // We have a solution. Start from that solution and move to pass 2.
    77. if (pass == 1) {
    78. time = -1;
    79. memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    80. speedIndex = 0;
    81. while (speedArray[speedIndex] > graph->speedInter && speedIndex < NSPEEDS-1) speedIndex++;
    82. tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
    83. tmpGraph.minChannels = graph->nChannels;
    84. pass = 2;
    85. }
    86. // 3. See if we can increase speedIntra for trees (2 nodes or collnet)
    87. if (pass == 2) {
    88. if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
    89. tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 &&
    90. speedIndex > 0) {
    91. tmpGraph.speedIntra = speedArray[--speedIndex];
    92. goto search;
    93. }
    94. time = -1;
    95. memcpy(&tmpGraph, graph, sizeof(tmpGraph));
    96. }
    97. if (graph->nChannels == 0 && graph->collNet == 0) {
    98. WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
    99. for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank;
    100. graph->inter[0] = graph->inter[1] = 0;
    101. graph->speedIntra = graph->speedInter = 0.1;
    102. graph->typeIntra = graph->typeInter = PATH_SYS;
    103. graph->nChannels = 1;
    104. }
    105. if (graph->speedIntra >= 25.0) {
    106. int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
    107. memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
    108. memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
    109. graph->speedIntra /= DIVUP(dupChannels, graph->nChannels);
    110. graph->speedInter /= DIVUP(dupChannels, graph->nChannels);
    111. graph->nChannels = dupChannels;
    112. }
    113. return ncclSuccess;
    114. }

    然后开始搜索channel,对于ringGraph来说其实就是搜索出来一系列的环,每个rank对应这个环的一个节点,记录了环的prev和next,这里是一个回溯的过程,执行一次ncclTopoSearchRec就会得到一个环,执行一次ncclTopoSearchTryGpu看选择出来的下一个点能不能到达,执行一次ncclTopoSearchRecGpu用来找下一个GPU,接下来具体看下。

    1. ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
    2. int backToNet, backToFirstRank;
    3. NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
    4. if (system->nodes[NET].count) {
    5. // Start from NET
    6. ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
    7. } else {
    8. // Intra-node only.
    9. if (graph->nChannels == 0) {
    10. // Try PCI order first
    11. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
    12. } else {
    13. // Also try to replay previous channel
    14. int g;
    15. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
    16. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
    17. }
    18. if (graph->sameChannels == 0 || graph->nChannels == 0) {
    19. // Finally, try all other possibilities unless we are forced to use the same channels
    20. for (int g=0; g<system->nodes[GPU].count; g++) {
    21. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
    22. }
    23. }
    24. }
    25. return ncclSuccess;
    26. }

    通过ncclTopoSearchParams设置backToNet和backToFirstRank参数,单机八卡ringGraph场景下这俩参数会分别设置为-1和7,此时nchannel为0,执行ncclTopoSearchTryGpu,强制为pci顺序,就是devid的顺序,从dev0开始。

    1. ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
    2. if (system->nodes[NET].count) {
    3. if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
    4. else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
    5. else *backToNet = 1;
    6. if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
    7. else *backToFirstRank = -1;
    8. } else {
    9. *backToNet = -1;
    10. if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
    11. else *backToFirstRank = -1;
    12. }
    13. return ncclSuccess;
    14. }

    然后执行ncclTopoSearchTryGpu,这里会判断下一个点能不能到达,因为type为-1,ncclTopoFollowPath会设置gpu为0号卡,直接执行ncclTopoSearchRecGpu,从0号卡开始搜,step为0。

    1. ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
    2. const uint64_t flag = 1ULL<<(graph->nChannels);
    3. struct ncclTopoNode* gpu;
    4. NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
    5. if (gpu) {
    6. gpu->used ^= flag;
    7. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
    8. gpu->used ^= flag;
    9. NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
    10. }
    11. return ncclSuccess;
    12. }
    1. ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
    2. if ((*time) <= 0) return ncclSuccess;
    3. (*time)--;
    4. int ngpus = system->nodes[GPU].count;
    5. if (step == ngpus) {
    6. ...
    7. }
    8. graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
    9. int g = gpu - system->nodes[GPU].nodes;
    10. if (step == backToNet) {
    11. ...
    12. } else if (step < system->nodes[GPU].count-1) {
    13. // Go to next GPU
    14. int next[NCCL_TOPO_MAX_NODES];
    15. int count;
    16. if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
    17. next[0] = step+1;
    18. count = 1;
    19. } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
    20. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
    21. count = 1;
    22. } else { // Normal search
    23. NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
    24. }
    25. for (int i=0; i
    26. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
    27. }
    28. } else if (step == backToFirstRank) {
    29. ...
    30. } else {
    31. // Next path
    32. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
    33. }
    34. return ncclSuccess;
    35. }

    然后看下ncclTopoSearchRecGpu,这里会选择下一个节点,先将0号卡节点写入到graph->intra的对应位置;由于当前step是0,因此会在xx行选择下一个GPU,next数组表示候选的GPU节点,由于forcedOrder == FORCED_ORDER_PCI,所以候选只有一个,即1号卡,然后对所有候选执行ncclTopoSearchTryGpu判断这一步是否可行并继续选择下一个节点。

    然后回到ncclTopoSearchRec开始尝试判断是否可达1号卡,看下ncclTopoFollowPath,这个函数就是判断能否从type1的index1节点到达type2的index2节点,这里可以看到之前在选起点的时候type1为-1,因此直接将node设置为type2的index2就返回;这次我们要判断gpu0到gpu1是否可达,获取index1到index2的路径path,如果index1和index2的类型都是GPU那么speed就设置为graph->speedIntra,即搜索之前设置的条件,mult是函数的入参,表示需要在path上加还是减去speed,向下搜环的时候需要在path上减去speed,当回溯回去的时候需要将speed加回去,然后判断path的type是否大于之前设置的type,即graph->typeIntra,大于的话说明不可达,然后通过followPath将path上的边全都减去speed,如果有边剩下的带宽不够speed,那么通过rewind加回去,此时路径不可达;如果足够的话,则设置node为index2。

    1. static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
    2. // First handle easy cases
    3. *node = system->nodes[type2].nodes+index2;
    4. if (type1 == -1) return ncclSuccess;
    5. struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
    6. struct ncclTopoLinkList* path = node1->paths[type2]+index2;
    7. if (path->count == 0 ) return ncclSuccess;
    8. // Now check link type
    9. *node = NULL;
    10. int intra = type1 == GPU && type2 == GPU;
    11. float speed = intra ? graph->speedIntra : graph->speedInter;
    12. int type = intra ? graph->typeIntra : graph->typeInter;
    13. if (mult == 1 && (path->type > type)) return ncclSuccess;
    14. speed *= mult;
    15. // Check there is enough bandwidth on paths.
    16. int step = 0;
    17. NCCLCHECK(followPath(path, node1, path->count, speed, &step));
    18. if (step < path->count) goto rewind;
    19. // Enough bandwidth : return destination node.
    20. graph->nHops += mult*path->count;
    21. *node = system->nodes[type2].nodes+index2;
    22. return ncclSuccess;
    23. rewind:
    24. // Not enough bandwidth : rewind and exit.
    25. NCCLCHECK(followPath(path, node1, step, -speed, &step));
    26. return ncclSuccess;
    27. }

    接着递归执行ncclTopoSearchRecGpu,重复上述过程,直到gpu7,这个时候graph->intra中的第一个环是[0,1,2,3,4,5,6,7],此时step为backToFirstRank,然后通过获取第一个gpu,即gpu0,然后继续执行ncclTopoFollowPath判断7到0是否可达,如果可达的话继续递归执行ncclTopoSearchRecGpu,此时step == ngpus,即搜索到了一个环,那会将现有的graph去更新最优的saveGraph,判断标准主要是看总的带宽,即环的数量乘以speedIntra;如果搜到的环的数量已经达到maxChannel了,则结束本次搜索,否则继续递归执行ncclTopoSearchRec搜索下一个环。

    1. ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
    2. if ((*time) <= 0) return ncclSuccess;
    3. (*time)--;
    4. int ngpus = system->nodes[GPU].count;
    5. if (step == ngpus) {
    6. // Determine whether we found a better solution or not
    7. int copy = 0;
    8. graph->nChannels++;
    9. NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, ©));
    10. if (copy) {
    11. memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
    12. if (graph->nChannels == graph->maxChannels) *time = -1;
    13. }
    14. if (graph->nChannels < graph->maxChannels) {
    15. NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
    16. }
    17. graph->nChannels--;
    18. return ncclSuccess;
    19. }
    20. graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
    21. int g = gpu - system->nodes[GPU].nodes;
    22. if (step == backToNet) {
    23. ...
    24. } else if (step < system->nodes[GPU].count-1) {
    25. ...
    26. } else if (step == backToFirstRank) {
    27. // Find first GPU and loop back to it
    28. int p;
    29. NCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels*ngpus], &p));
    30. struct ncclTopoNode* firstGpu;
    31. NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
    32. if (firstGpu) {
    33. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time));
    34. NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
    35. }
    36. } else {
    37. // Next path
    38. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
    39. }
    40. return ncclSuccess;
    41. }

    假设现在开始搜索下一个环,回到ncclTopoSearchRec,接下来会尝试复制刚刚的环,ncclTopoReplayGetGpu会获取上一个环的第step + 1个gpu,这里其实就是gpu0,然后继续执行ncclTopoSearchTryGpu,这里设置forcedOrder为FORCED_ORDER_REPLAY。

    1. ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
    2. {
    3. // Also try to replay previous channel
    4. int g;
    5. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
    6. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
    7. }
    8. }
    9. ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
    10. *g = -1;
    11. if (graph->nChannels == 0) return ncclInternalError;
    12. int ngpus = system->nodes[GPU].count;
    13. int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
    14. for (int i=0; iif (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
    15. *g = i;
    16. return ncclSuccess;
    17. }
    18. if (*g == -1) return ncclInternalError;
    19. return ncclSuccess;
    20. }

    然后FORCED_ORDER_REPLAY会在寻找下一个节点时通过ncclTopoReplayGetGpu获取上一个环对应step的gpu,因此就是一直在复制上一个环。

    1. ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
    2. ...
    3. else if (step < system->nodes[GPU].count-1) {
    4. // Go to next GPU
    5. int next[NCCL_TOPO_MAX_NODES];
    6. int count;
    7. if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
    8. next[0] = step+1;
    9. count = 1;
    10. } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
    11. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
    12. count = 1;
    13. } else { // Normal search
    14. NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
    15. }
    16. for (int i=0; i
    17. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
    18. }
    19. }
    20. ...
    21. }

    到这里就完成了第一次搜索,如前文所述,如果搜索出来的结果没有达到条件,就开始逐步降低条件继续搜索,接下来的过程比较类似,就不再赘述了。

    然后看下多机场景下,比如两机十六卡场景,这个时候有网卡,所以ncclTopoSearchParams设置参数为backToFirstRank = -1,backToNet = 7,ncclTopoSearchRec直接执行ncclTopoSearchRecNet。

    1. ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
    2. int backToNet, backToFirstRank;
    3. NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
    4. if (system->nodes[NET].count) {
    5. // Start from NET
    6. ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
    7. }
    8. ...
    9. }

    ncclTopoSearchRecNet会搜索出来一个答案,这里会遍历每个网卡,尝试用每个网卡作为起点搜索环,首先是网卡0,将0写入到inter中第一个channel中,然后将网卡0的带宽减去speedInter,maxChannel减去1,然后后边过程和上述很像,会通过ncclTopoSearchTryGpu搜索出一个环。

    1. ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
    2. const int speed = graph->speedInter;
    3. for (int n=0; nnodes[NET].count; n++) {
    4. struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    5. struct ncclTopoNode* gpu;
    6. if (graph->collNet && net->net.collSupport == 0) continue;
    7. if (net->net.width < speed) continue;
    8. if (net->net.maxChannels == 0) continue;
    9. graph->inter[graph->nChannels*2] = net->id;
    10. for (int i=0; inodes[NET].count; i++) {
    11. if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
    12. (system->nodes[NET].nodes[i].net.port == net->net.port)) {
    13. system->nodes[NET].nodes[i].net.width -= speed;
    14. }
    15. }
    16. net->net.maxChannels--;
    17. // First try to replay the last channel
    18. if (graph->nChannels > 0) {
    19. int g;
    20. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
    21. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
    22. }
    23. if (graph->nChannels == 0 || graph->sameChannels == 0) {
    24. if (graph->nChannels == 0) {
    25. // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
    26. int t = 1 << 10;
    27. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
    28. if (t == -1) *time = -1;
    29. }
    30. ...
    31. }
    32. return ncclSuccess;
    33. }

    ncclTopoSearchTryGpu还是会调用ncclTopoSearchRecGpu,当没有遍历完所有GPU节点时,仍然通过递归执行ncclTopoSearchRecGpu来填充graph->intra,最后遍历所有GPU之后step等于7,即backToNet,这里首先拿出来起始网卡,即网卡0,如果搜索参数支持crossNic的话就选一个合法的网卡即可,如果不支持的话就判断网卡0是否合法,合法的话将网卡0填充到graph->inter,一个环就搜索完成了。这里有一个小的疑惑点,在将出口网卡选择好后,并没有将该网卡的带宽减去speed。

    1. ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
    2. if ((*time) <= 0) return ncclSuccess;
    3. (*time)--;
    4. int ngpus = system->nodes[GPU].count;
    5. if (step == ngpus) {
    6. // Determine whether we found a better solution or not
    7. int copy = 0;
    8. graph->nChannels++;
    9. NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, ©));
    10. if (copy) {
    11. memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
    12. if (graph->nChannels == graph->maxChannels) *time = -1;
    13. }
    14. if (graph->nChannels < graph->maxChannels) {
    15. NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
    16. }
    17. graph->nChannels--;
    18. return ncclSuccess;
    19. }
    20. graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
    21. int g = gpu - system->nodes[GPU].nodes;
    22. if (step == backToNet) {
    23. // first get back to NIC
    24. if (system->nodes[NET].count) {
    25. int startNetIndex;
    26. NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
    27. struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
    28. for (int n=0; n<system->nodes[NET].count; n++) {
    29. struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    30. if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
    31. if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
    32. NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
    33. if (net) {
    34. graph->inter[graph->nChannels*2+1] = net->id;
    35. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
    36. NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
    37. }
    38. }
    39. }
    40. } else if (step < system->nodes[GPU].count-1) {
    41. // Go to next GPU
    42. int next[NCCL_TOPO_MAX_NODES];
    43. int count;
    44. if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
    45. next[0] = step+1;
    46. count = 1;
    47. } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
    48. NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
    49. count = 1;
    50. } else { // Normal search
    51. NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
    52. }
    53. for (int i=0; i
    54. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
    55. }
    56. } else if (step == backToFirstRank) {
    57. ...
    58. } else {
    59. // Next path
    60. NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
    61. }
    62. return ncclSuccess;
    63. }

    回到ncclTopoSearchRecNet,接下来会尝试复制刚刚搜索出来的环,当搜索出一个答案后,回到第一次ncclTopoSearchRecNet,接下来会尝试从离网卡0最近的GPU开始搜索,而不是从GPU0开始,假设为GPUn,这里会先判断GPUn到PCIe switch的双向带宽是否还有空闲,如果有空闲的话才从GPUn开始搜索。但是和这里的注释表述不太相符,注释的意思是说不会将一个GPU既用来发送,又用来接收(说这种情况会影响带宽,这一点比较疑惑)。

    1. ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
    2. const int speed = graph->speedInter;
    3. for (int n=0; n<system->nodes[NET].count; n++) {
    4. ...
    5. // Then try the most local GPUs
    6. float maxWidth = 0;
    7. int minHops = 0xfffffff;
    8. struct ncclTopoLinkList* paths = net->paths[GPU];
    9. for (int g=0; g<system->nodes[GPU].count; g++) {
    10. if (paths[g].width > maxWidth) {
    11. maxWidth = paths[g].width;
    12. minHops = paths[g].count;
    13. } else if (paths[g].width == maxWidth && paths[g].count < minHops) {
    14. minHops = paths[g].count;
    15. }
    16. }
    17. if (maxWidth >= speed) {
    18. // In the first loop, avoid using GPUs in both directions between channels (one channel
    19. // sending from that GPU and one channel receiving to that GPU), since that usually leads
    20. // to lower BW.
    21. for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
    22. for (int g=0; g<system->nodes[GPU].count; g++) {
    23. if (paths[g].width == maxWidth && paths[g].count == minHops) {
    24. gpu = system->nodes[GPU].nodes+g;
    25. int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
    26. if (tryGpuBidir == gpuUsed) {
    27. NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
    28. }
    29. }
    30. }
    31. }
    32. }
    33. }
    34. net->net.maxChannels++;
    35. for (int i=0; i<system->nodes[NET].count; i++) {
    36. if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
    37. (system->nodes[NET].nodes[i].net.port == net->net.port)) {
    38. system->nodes[NET].nodes[i].net.width += speed;
    39. }
    40. }
    41. }
    42. return ncclSuccess;
    43. }

    到这里就完成了channel的搜索。总结一下,本节就是基于机器拓扑,搜索出一组channel用于数据的通信,并记录到ncclTopoGraph。

    (本文经授权后由OneFlow发布。原文:https://blog.csdn.net/KIDGIN7439/article/details/128074716)

    其他人都在看

    试用OneFlow: github.com/Oneflow-Inc/oneflow/icon-default.png?t=N7T8http://github.com/Oneflow-Inc/oneflow/

  • 相关阅读:
    高中生自学Python,这里给大家一些建议
    注意力机制的一种卷积替代方式
    Java代码实现不同应用系统中数据同步程序
    InnoDB数据页结构示例
    免实名域名是什么意思?
    Nginx入门(简介、反向代理、负载均衡)
    C语言和C++的区别
    Abbexa小鼠Asprosin ELISA试剂盒,体外定量测量好帮手!
    性能透明提升 50%,SMC + ERDMA 云上超大规模高性能网络协议栈
    Air780E连接点灯科技-LuatOS
  • 原文地址:https://blog.csdn.net/OneFlow_Official/article/details/132374290