NCCL Source Code Analysis 5: Merging Channels Across Machines
Let's return to the code and prepare for the second AllGather. First, allGather3Data is allocated to aggregate channel information across ranks; the structures involved are defined as follows:
  struct allGatherInfo *allGather3Data;

  struct allGatherInfo {
    struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
    struct ncclTopoRanks topoRanks;
    int cpuArch;
    int cpuVendor;
  };
The allGatherInfo structure holds the graph and channel information for every algorithm.
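Each rank fills its own slot of this array before the exchange. Below is a sketch of the graphInfo structure and the loop that populates it, reconstructed from the fields that are read back after the all-gather later in this section (approximate, not verbatim NCCL source):

  // Reconstruction based on the fields consumed after the all-gather.
  struct graphInfo {
    int pattern;       // e.g. the tree pattern, which depends on SM arch
    int nChannels;
    int sameChannels;
    float bwIntra;     // intra-node bandwidth
    float bwInter;     // inter-node bandwidth
    int typeIntra;     // intra-node link type
    int typeInter;     // inter-node link type
    int crossNic;
  };

  for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
    allGather3Data[rank].graphInfo[a].pattern = graphs[a]->pattern;
    allGather3Data[rank].graphInfo[a].nChannels = graphs[a]->nChannels;
    allGather3Data[rank].graphInfo[a].sameChannels = graphs[a]->sameChannels;
    allGather3Data[rank].graphInfo[a].bwIntra = graphs[a]->bwIntra;
    allGather3Data[rank].graphInfo[a].bwInter = graphs[a]->bwInter;
    allGather3Data[rank].graphInfo[a].typeIntra = graphs[a]->typeIntra;
    allGather3Data[rank].graphInfo[a].typeInter = graphs[a]->typeInter;
    allGather3Data[rank].graphInfo[a].crossNic = graphs[a]->crossNic;
  }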
The for loop above initializes allGather3Data[rank].graphInfo, while ncclTopoPreset initializes allGather3Data[rank].topoRanks. First, look at the ncclTopoRanks structure definition:
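A sketch of its ring-related fields, as they appear in NCCL's graph code (tree and NVLS fields elided here; the exact field set varies across NCCL versions):

  struct ncclTopoRanks {
    int ringRecv[MAXCHANNELS];  // head rank of this node's partial ring, per channel
    int ringSend[MAXCHANNELS];  // tail rank of this node's partial ring, per channel
    int ringPrev[MAXCHANNELS];  // rank before us in the partial ring (-1 at the head)
    int ringNext[MAXCHANNELS];  // rank after us in the partial ring (-1 at the tail)
    // ... tree/NVLS fields elided ...
  };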
It carries the following information:

- ringRecv[c]: the head rank of the partial channel c that this rank belongs to, i.e., the first rank of this machine's segment;
- ringSend[c]: the tail rank of the partial channel c, i.e., the last rank of this machine's segment;
- ringPrev[c]: the rank preceding this rank within partial channel c;
- ringNext[c]: the rank following this rank within partial channel c.

The first rank's prev and the last rank's next are -1. ncclTopoPreset also initializes comm->channels; we look only at the Ring-algorithm part and omit the rest (see the sketch below). Note that at the end the rings found by the search are duplicated, so the channel count doubles here; the explanation given in official issues is that this adds parallelism so the bandwidth can be fully utilized.
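The sketch below shows only the Ring logic of ncclTopoPreset, simplified from NCCL's graph/connect.cc (tree/NVLS handling and error checking omitted; details vary by version):

  // Sketch: Ring-only portion of ncclTopoPreset (simplified, not verbatim).
  ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs,
                              struct ncclTopoRanks* topoRanks) {
    int rank = comm->rank;
    int localRanks = comm->topo->nodes[GPU].count;  // GPUs on this machine
    int nChannels = comm->nChannels;

    for (int c = 0; c < nChannels; c++) {
      struct ncclChannel* channel = comm->channels + c;
      channel->ring.prev = channel->ring.next = -1;

      // graphs[NCCL_ALGO_RING]->intra holds, per channel, the intra-node ring
      // as an ordered list of localRanks ranks.
      int* ringIntra = graphs[NCCL_ALGO_RING]->intra + c * localRanks;
      for (int i = 0; i < localRanks; i++) {
        if (ringIntra[i] == rank) {
          topoRanks->ringRecv[c] = ringIntra[0];               // head of the partial ring
          topoRanks->ringSend[c] = ringIntra[localRanks - 1];  // tail of the partial ring
          channel->ring.prev = (i == 0) ? -1 : ringIntra[i - 1];
          channel->ring.next = (i == localRanks - 1) ? -1 : ringIntra[i + 1];
        }
      }
      topoRanks->ringPrev[c] = channel->ring.prev;
      topoRanks->ringNext[c] = channel->ring.next;
    }

    // Duplicate the channels found so far: entries [nChannels, 2*nChannels)
    // become copies of [0, nChannels), doubling the channel count.
    memcpy(comm->channels + nChannels, comm->channels, nChannels * sizeof(struct ncclChannel));
    return ncclSuccess;
  }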
Back in initTransportsRank, bootstrapAllGather then exchanges the graph information among all processes, and the following data structures are computed:

- nodesFirstRank: the first rank on each machine (node);
- nodesTreePatterns: the tree-topology pattern of each node;
- rankToNode: the node each rank belongs to;
- localRankToRank: the mapping from local_rank to global rank;
- localRank: this process's local_rank;
- localRanks: the number of local ranks on this node.

If the processes run on CPUs of different models or vendors, a warning is printed to aid debugging, because in past implementations such mixed configurations could deadlock collective operations.
  NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);

  // Determine nNodes, firstRanks, ...
  NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&comm->rankToNode, comm->nRanks), ret, fail);
  for (int r=0; r<nranks; r++) {
    int node;
    int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
    for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
    if (node == comm->nNodes) {
      comm->nNodes++;
      nodesFirstRank[node] = firstRank;
      // Record tree pattern of each node as they can be different depending on sm arch
      nodesTreePatterns[node] = allGather3Data[r].graphInfo[NCCL_ALGO_TREE].pattern;
    }
    comm->rankToNode[r] = node;

    if (comm->cpuArch != allGather3Data[r].cpuArch &&
        comm->cpuArch != NCCL_TOPO_CPU_ARCH_MIXED) {
      comm->cpuArch = NCCL_TOPO_CPU_ARCH_MIXED;
    }
    if (comm->cpuVendor != allGather3Data[r].cpuVendor &&
        comm->cpuVendor != NCCL_TOPO_CPU_VENDOR_MIXED) {
      comm->cpuVendor = NCCL_TOPO_CPU_VENDOR_MIXED;
    }
  }

  // Alert the user to the presence of mixed CPUs. In the past this has caused
  // locks in some collective routines. This may help debug issues in the future.
  if (rank==0) {
    if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_MIXED) {
      INFO(NCCL_GRAPH, "CPUs with mixed architecture were detected.");
    }
    if (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) {
      INFO(NCCL_GRAPH, "CPUs with mixed vendors were detected.");
    }
  }

  // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
  NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks, comm->nNodes), ret, fail);
  NCCLCHECKGOTO(ncclCalloc(&comm->rankToLocalRank, comm->nRanks), ret, fail);
  for (int r=0; r<comm->nRanks; r++) {
    int node = comm->rankToNode[r];
    comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
    comm->nodeRanks[node].localRanks++;
  }
  // Allocate ranks arrays for each node
  for (int n=0; n<comm->nNodes; n++) {
    NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail);
    comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
    comm->nodeRanks[n].localRanks = 0;
  }
  // And fill the ranks arrays
  for (int r=0; r<comm->nRanks; r++) {
    int node = comm->rankToNode[r];
    comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
  }
  comm->node = comm->rankToNode[rank];
  comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
  comm->localRank = comm->rankToLocalRank[rank];
  comm->localRanks = comm->nodeRanks[comm->node].localRanks;

  TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
        rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
  if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
    WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
         rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
         comm->localRank, comm->localRanks, comm->localRankToRank[0]);
    ret = ncclInternalError;
    goto fail;
  }

  INFO(NCCL_INIT, "comm %p rank %d nRanks %d nNodes %d localRanks %d localRank %d MNNVL %d",
       comm, rank, comm->nRanks, comm->nNodes, comm->localRanks, comm->localRank, comm->MNNVL);
Next, the tuning configuration is aligned across all ranks: each rank takes the minimum of the channel counts and bandwidths and the maximum of the path types reported by all ranks, so every rank settles on the same, most conservative values.
  nChannelsOrig = comm->nChannels;
  NCCLCHECKGOTO(ncclCalloc(&allTopoRanks, comm->nRanks), ret, fail);
  for (int i=0; i<nranks; i++) {
    allTopoRanks[i] = &allGather3Data[i].topoRanks;
    // Make sure we align all ranks so that the tuning is consistent across ranks
    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
      graphs[a]->nChannels = std::min(allGather3Data[i].graphInfo[a].nChannels, graphs[a]->nChannels);
      graphs[a]->sameChannels = std::min(allGather3Data[i].graphInfo[a].sameChannels, graphs[a]->sameChannels);
      graphs[a]->bwIntra = std::min(allGather3Data[i].graphInfo[a].bwIntra, graphs[a]->bwIntra);
      graphs[a]->bwInter = std::min(allGather3Data[i].graphInfo[a].bwInter, graphs[a]->bwInter);
      graphs[a]->typeIntra = std::max(allGather3Data[i].graphInfo[a].typeIntra, graphs[a]->typeIntra);
      graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter);
      graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic);
    }
  }
Then, because the alignment may have reduced the channel count, the duplicated channels created during Preset() must be moved into place. For example, if Preset() duplicated 4 channels into slots 4..7 and the alignment cut nChannels to 2, the copies of channels 0..1 (currently at slots 4..5) are moved to slots 2..3.
  if (comm->nChannels < nChannelsOrig) {
    // We started duplicating channels during Preset(), so we need to move the
    // duplicated channels since we have removed some.
    for (int i=0; i<comm->nChannels; i++)
      memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
  }
Next, determine whether CollNet is supported.
  // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
  if (comm->collNetSupport == 1) {
    int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
    if (comm->nNodes < collNetNodeThreshold) {
      INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
      comm->collNetSupport = 0;
    }
    comm->collNetRegSupport = true;
    for (int n=0; n<comm->nNodes; n++) {
      if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
        WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
        comm->collNetSupport = 0;
        break;
      }
      if (comm->nodeRanks[n].localRanks > 1) {
        // As long as there is more than 1 rank on any node, we need to disable collnet reg
        comm->collNetRegSupport = false;
      }
    }
  }
With that done, ncclTopoPostset merges all the Ring, Tree, and other topologies. For the Ring topology, for example, the per-machine sub-rings are connected one after another to form the complete ring. This concludes the second AllGather.
  NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
  NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail);
  // AllGather3 - end
As an example, take two machines with four GPUs each, ranks 0 through 7. The ranks on each machine first build that machine's partial ring, and ncclTopoPostset then joins the two partial rings head to tail into the complete ring.

ncclTopoPostset flattens the prev, next, send, and recv information of all channels into arrays, e.g., recv[0] holds the rank that rank 0 receives from in the first ring. It then computes the prev of each machine's first rank and the next of its last rank, as sketched below.
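The ring-connection step can be sketched after connectRings in NCCL's graph/connect.cc; firstRanks below is the nodesFirstRank array computed earlier, and each flattened array is indexed as array[c*nRanks + r], the value for rank r in channel c. This is a simplified model, not verbatim source:

  // For every channel, stitch the per-node partial rings into one global ring:
  // each node's head receives from the previous node's tail, and each node's
  // tail sends to the next node's head.
  static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend,
                                   int* ringPrev, int* ringNext, int* firstRanks) {
    int nChannels = comm->nChannels;
    int nNodes = comm->nNodes;
    for (int c = 0; c < nChannels; c++) {
      int* recv = ringRecv + c * comm->nRanks;
      int* send = ringSend + c * comm->nRanks;
      int* prev = ringPrev + c * comm->nRanks;
      int* next = ringNext + c * comm->nRanks;
      for (int n = 0; n < nNodes; n++) {
        int recvRank = recv[firstRanks[n]];                             // head of node n
        int prevSendRank = send[firstRanks[(n - 1 + nNodes) % nNodes]]; // tail of node n-1
        prev[recvRank] = prevSendRank;
        int sendRank = send[firstRanks[n]];                             // tail of node n
        int nextRecvRank = recv[firstRanks[(n + 1) % nNodes]];          // head of node n+1
        next[sendRank] = nextRecvRank;
      }
    }
    return ncclSuccess;
  }

With the two-machine example above, channel 0's partial rings 0→1→2→3 and 4→5→6→7 are stitched so that next[3]=4, prev[4]=3, next[7]=0, and prev[0]=7, closing the global ring.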