Actual source code: sfnvshmem.cu
1: #include <petsc/private/cudavecimpl.h>
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <mpi.h>
4: #include <nvshmem.h>
5: #include <nvshmemx.h>
7: PetscErrorCode PetscNvshmemInitializeCheck(void)
8: {
9: PetscErrorCode ierr;
12: if (!PetscNvshmemInitialized) { /* Note NVSHMEM does not provide a routine to check whether it is initialized */
13: nvshmemx_init_attr_t attr;
14: attr.mpi_comm = &PETSC_COMM_WORLD;
15: PetscCUDAInitializeCheck();
16: nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM,&attr);
17: PetscNvshmemInitialized = PETSC_TRUE;
18: PetscBeganNvshmem = PETSC_TRUE;
19: }
20: return(0);
21: }
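/* Editor's illustrative sketch (not part of the original file): a minimal standalone program using
   the same MPI-based bootstrap as PetscNvshmemInitializeCheck() above, i.e. initializing NVSHMEM on
   top of an existing MPI communicator via NVSHMEMX_INIT_WITH_MPI_COMM. Wrapped in '#if 0' so the
   listing is unaffected. */
#if 0
#include <mpi.h>
#include <nvshmem.h>
#include <nvshmemx.h>

int main(int argc,char **argv)
{
  MPI_Comm             comm = MPI_COMM_WORLD;
  nvshmemx_init_attr_t attr;

  MPI_Init(&argc,&argv);
  attr.mpi_comm = &comm;                                 /* NVSHMEM keeps a pointer to the communicator */
  nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM,&attr); /* one NVSHMEM PE per MPI rank */
  /* ... nvshmem_malloc()/put/get/collectives ... */
  nvshmem_finalize();
  MPI_Finalize();
  return 0;
}
#endif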
23: PetscErrorCode PetscNvshmemMalloc(size_t size, void** ptr)
24: {
28: PetscNvshmemInitializeCheck();
29: *ptr = nvshmem_malloc(size);
30: if (!*ptr) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"nvshmem_malloc() failed to allocate %zu bytes",size);
31: return(0);
32: }
34: PetscErrorCode PetscNvshmemCalloc(size_t size, void**ptr)
35: {
39: PetscNvshmemInitializeCheck();
40: *ptr = nvshmem_calloc(size,1);
41: if (!*ptr) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"nvshmem_calloc() failed to allocate %zu bytes",size);
42: return(0);
43: }
45: PetscErrorCode PetscNvshmemFree_Private(void* ptr)
46: {
48: nvshmem_free(ptr);
49: return(0);
50: }
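/* Editor's usage sketch (not part of the original file) for the wrappers above. nvshmem_malloc/calloc
   are collective and return addresses in the symmetric heap, so every PE should request the same size;
   the resulting buffer can then serve as a remote put/get target. The variable names below are
   illustrative; PetscNvshmemFree() is the freeing macro used later in this file. */
#if 0
{
  PetscReal *work = NULL;
  PetscNvshmemCalloc(128*sizeof(PetscReal),(void**)&work); /* collective; same size on every PE */
  /* ... use work as the target of nvshmem_putmem()/nvshmem_getmem() from other PEs ... */
  PetscNvshmemFree(work);                                  /* also collective */
}
#endif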
52: PetscErrorCode PetscNvshmemFinalize(void)
53: {
55: nvshmem_finalize();
56: return(0);
57: }
59: /* Free nvshmem related fields in the SF */
60: PetscErrorCode PetscSFReset_Basic_NVSHMEM(PetscSF sf)
61: {
62: PetscErrorCode ierr;
63: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
66: PetscFree2(bas->leafsigdisp,bas->leafbufdisp);
67: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->leafbufdisp_d);
68: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->leafsigdisp_d);
69: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->iranks_d);
70: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,bas->ioffset_d);
72: PetscFree2(sf->rootsigdisp,sf->rootbufdisp);
73: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->rootbufdisp_d);
74: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->rootsigdisp_d);
75: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->ranks_d);
76: PetscSFFree(sf,PETSC_MEMTYPE_CUDA,sf->roffset_d);
77: return(0);
78: }
80: /* Set up NVSHMEM-related fields for an SF of type SFBASIC (only after PetscSFSetUp_Basic() has already set up the fields this depends on) */
81: static PetscErrorCode PetscSFSetUp_Basic_NVSHMEM(PetscSF sf)
82: {
84: cudaError_t cerr;
85: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
86: PetscInt i,nRemoteRootRanks,nRemoteLeafRanks;
87: PetscMPIInt tag;
88: MPI_Comm comm;
89: MPI_Request *rootreqs,*leafreqs;
90: PetscInt tmp,stmp[4],rtmp[4]; /* tmps for send/recv buffers */
93: PetscObjectGetComm((PetscObject)sf,&comm);
94: PetscObjectGetNewTag((PetscObject)sf,&tag);
96: nRemoteRootRanks = sf->nranks-sf->ndranks;
97: nRemoteLeafRanks = bas->niranks-bas->ndiranks;
98: sf->nRemoteRootRanks = nRemoteRootRanks;
99: bas->nRemoteLeafRanks = nRemoteLeafRanks;
101: PetscMalloc2(nRemoteLeafRanks,&rootreqs,nRemoteRootRanks,&leafreqs);
103: stmp[0] = nRemoteRootRanks;
104: stmp[1] = sf->leafbuflen[PETSCSF_REMOTE];
105: stmp[2] = nRemoteLeafRanks;
106: stmp[3] = bas->rootbuflen[PETSCSF_REMOTE];
108: MPIU_Allreduce(stmp,rtmp,4,MPIU_INT,MPI_MAX,comm);
110: sf->nRemoteRootRanksMax = rtmp[0];
111: sf->leafbuflen_rmax = rtmp[1];
112: bas->nRemoteLeafRanksMax = rtmp[2];
113: bas->rootbuflen_rmax = rtmp[3];
115: /* Four rounds of MPI communication in total are needed to set up the nvshmem fields */
117: /* Root ranks to leaf ranks: send info about rootsigdisp[] and rootbufdisp[] */
118: PetscMalloc2(nRemoteRootRanks,&sf->rootsigdisp,nRemoteRootRanks,&sf->rootbufdisp);
119: for (i=0; i<nRemoteRootRanks; i++) {MPI_Irecv(&sf->rootsigdisp[i],1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm,&leafreqs[i]);} /* Leaves recv */
120: for (i=0; i<nRemoteLeafRanks; i++) {MPI_Send(&i,1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm);} /* Roots send. Note i changes, so we use MPI_Send. */
121: MPI_Waitall(nRemoteRootRanks,leafreqs,MPI_STATUSES_IGNORE);
123: for (i=0; i<nRemoteRootRanks; i++) {MPI_Irecv(&sf->rootbufdisp[i],1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm,&leafreqs[i]);} /* Leaves recv */
124: for (i=0; i<nRemoteLeafRanks; i++) {
125: tmp = bas->ioffset[i+bas->ndiranks] - bas->ioffset[bas->ndiranks];
126: MPI_Send(&tmp,1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm); /* Roots send. Note tmp changes, so we use MPI_Send. */
127: }
128: MPI_Waitall(nRemoteRootRanks,leafreqs,MPI_STATUSES_IGNORE);
130: cerr = cudaMalloc((void**)&sf->rootbufdisp_d,nRemoteRootRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
131: cerr = cudaMalloc((void**)&sf->rootsigdisp_d,nRemoteRootRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
132: cerr = cudaMalloc((void**)&sf->ranks_d,nRemoteRootRanks*sizeof(PetscMPIInt));CHKERRCUDA(cerr);
133: cerr = cudaMalloc((void**)&sf->roffset_d,(nRemoteRootRanks+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
135: cerr = cudaMemcpyAsync(sf->rootbufdisp_d,sf->rootbufdisp,nRemoteRootRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
136: cerr = cudaMemcpyAsync(sf->rootsigdisp_d,sf->rootsigdisp,nRemoteRootRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
137: cerr = cudaMemcpyAsync(sf->ranks_d,sf->ranks+sf->ndranks,nRemoteRootRanks*sizeof(PetscMPIInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
138: cerr = cudaMemcpyAsync(sf->roffset_d,sf->roffset+sf->ndranks,(nRemoteRootRanks+1)*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
140: /* Leaf ranks to root ranks: send info about leafsigdisp[] and leafbufdisp[] */
141: PetscMalloc2(nRemoteLeafRanks,&bas->leafsigdisp,nRemoteLeafRanks,&bas->leafbufdisp);
142: for (i=0; i<nRemoteLeafRanks; i++) {MPI_Irecv(&bas->leafsigdisp[i],1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm,&rootreqs[i]);}
143: for (i=0; i<nRemoteRootRanks; i++) {MPI_Send(&i,1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm);}
144: MPI_Waitall(nRemoteLeafRanks,rootreqs,MPI_STATUSES_IGNORE);
146: for (i=0; i<nRemoteLeafRanks; i++) {MPI_Irecv(&bas->leafbufdisp[i],1,MPIU_INT,bas->iranks[i+bas->ndiranks],tag,comm,&rootreqs[i]);}
147: for (i=0; i<nRemoteRootRanks; i++) {
148: tmp = sf->roffset[i+sf->ndranks] - sf->roffset[sf->ndranks];
149: MPI_Send(&tmp,1,MPIU_INT,sf->ranks[i+sf->ndranks],tag,comm);
150: }
151: MPI_Waitall(nRemoteLeafRanks,rootreqs,MPI_STATUSES_IGNORE);
153: cerr = cudaMalloc((void**)&bas->leafbufdisp_d,nRemoteLeafRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
154: cerr = cudaMalloc((void**)&bas->leafsigdisp_d,nRemoteLeafRanks*sizeof(PetscInt));CHKERRCUDA(cerr);
155: cerr = cudaMalloc((void**)&bas->iranks_d,nRemoteLeafRanks*sizeof(PetscMPIInt));CHKERRCUDA(cerr);
156: cerr = cudaMalloc((void**)&bas->ioffset_d,(nRemoteLeafRanks+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
158: cerr = cudaMemcpyAsync(bas->leafbufdisp_d,bas->leafbufdisp,nRemoteLeafRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
159: cerr = cudaMemcpyAsync(bas->leafsigdisp_d,bas->leafsigdisp,nRemoteLeafRanks*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
160: cerr = cudaMemcpyAsync(bas->iranks_d,bas->iranks+bas->ndiranks,nRemoteLeafRanks*sizeof(PetscMPIInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
161: cerr = cudaMemcpyAsync(bas->ioffset_d,bas->ioffset+bas->ndiranks,(nRemoteLeafRanks+1)*sizeof(PetscInt),cudaMemcpyHostToDevice,PetscDefaultCudaStream);CHKERRCUDA(cerr);
163: PetscFree2(rootreqs,leafreqs);
164: return(0);
165: }
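/* Editor's sketch (not part of the original file) of how the displacements exchanged above are meant
   to be consumed on the leaf side of a bcast with the get protocol: my portion of the i-th remote
   root's symmetric root buffer starts at byte offset rootbufdisp[i]*unitbytes on PE ranks[i+ndranks].
   Here rootbuf, leafbuf and unitbytes stand in for the corresponding PetscSFLink fields. */
#if 0
for (PetscInt i=0; i<sf->nRemoteRootRanks; i++) {
  size_t nbytes = (sf->roffset[i+sf->ndranks+1]-sf->roffset[i+sf->ndranks])*unitbytes;
  nvshmem_getmem_nbi(leafbuf + (sf->roffset[i+sf->ndranks]-sf->roffset[sf->ndranks])*unitbytes, /* my local leaf buffer     */
                     rootbuf + sf->rootbufdisp[i]*unitbytes,                                    /* remote symmetric rootbuf */
                     nbytes,sf->ranks[i+sf->ndranks]);
}
nvshmem_quiet(); /* complete the outstanding non-blocking gets */
#endif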
167: PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF sf,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,PetscBool *use_nvshmem)
168: {
169: PetscErrorCode ierr;
170: MPI_Comm comm;
171: PetscBool isBasic;
172: PetscMPIInt result = MPI_UNEQUAL;
175: PetscObjectGetComm((PetscObject)sf,&comm);
176: /* Check if the sf is eligible for NVSHMEM, if we have not checked yet.
177: Note the check result <use_nvshmem> must be the same over comm, since an SFLink must be collectively either NVSHMEM or MPI.
178: */
180: if (sf->use_nvshmem && !sf->checked_nvshmem_eligibility) {
181: /* Only use NVSHMEM for SFBASIC on PETSC_COMM_WORLD */
182: PetscObjectTypeCompare((PetscObject)sf,PETSCSFBASIC,&isBasic);
183: if (isBasic) {MPI_Comm_compare(PETSC_COMM_WORLD,comm,&result);}
184: if (!isBasic || (result != MPI_IDENT && result != MPI_CONGRUENT)) sf->use_nvshmem = PETSC_FALSE; /* If not eligible, clear the flag so that we don't try again */
186: /* Do a further check: if both rootdata and leafdata are NULL on some rank, we might think they are PETSC_MEMTYPE_CUDA (or HOST)
187: and then use NVSHMEM. But if root/leaf mtypes on other ranks are PETSC_MEMTYPE_HOST (or DEVICE), this would lead to an
188: inconsistent return value <use_nvshmem>. To be safe, we simply disable nvshmem on these rare SFs.
189: */
190: if (sf->use_nvshmem) {
191: PetscInt hasNullRank = (!rootdata && !leafdata) ? 1 : 0;
192: MPI_Allreduce(MPI_IN_PLACE,&hasNullRank,1,MPIU_INT,MPI_LOR,comm);
193: if (hasNullRank) sf->use_nvshmem = PETSC_FALSE;
194: }
195: sf->checked_nvshmem_eligibility = PETSC_TRUE; /* If eligible, don't do above check again */
196: }
198: /* Check if rootmtype and leafmtype collectively are PETSC_MEMTYPE_CUDA */
199: if (sf->use_nvshmem) {
200: PetscInt oneCuda = (!rootdata || PetscMemTypeCUDA(rootmtype)) && (!leafdata || PetscMemTypeCUDA(leafmtype)) ? 1 : 0; /* Do I use cuda for both root&leafmtype? */
201: PetscInt allCuda = oneCuda; /* Assume the same for all ranks. But if not, in opt mode, return value <use_nvshmem> won't be collective! */
202: #if defined(PETSC_USE_DEBUG) /* Check in debug mode. Note MPI_Allreduce is expensive, so only in debug mode */
203: MPI_Allreduce(&oneCuda,&allCuda,1,MPIU_INT,MPI_LAND,comm);
204: if (allCuda != oneCuda) SETERRQ(comm,PETSC_ERR_SUP,"root/leaf mtypes are inconsistent among ranks, which may lead to SF nvshmem failure in opt mode. Add -use_nvshmem 0 to disable it.");
205: #endif
206: if (allCuda) {
207: PetscNvshmemInitializeCheck();
208: if (!sf->setup_nvshmem) { /* Set up nvshmem related fields on this SF on-demand */
209: PetscSFSetUp_Basic_NVSHMEM(sf);
210: sf->setup_nvshmem = PETSC_TRUE;
211: }
212: *use_nvshmem = PETSC_TRUE;
213: } else {
214: *use_nvshmem = PETSC_FALSE;
215: }
216: } else {
217: *use_nvshmem = PETSC_FALSE;
218: }
219: return(0);
220: }
222: /* Build dependence between <stream> and <remoteCommStream> at the entry of NVSHMEM communication */
223: static PetscErrorCode PetscSFLinkBuildDependenceBegin(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
224: {
225: cudaError_t cerr;
226: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
227: PetscInt buflen = (direction == PETSCSF_ROOT2LEAF)? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE];
230: if (buflen) {
231: cerr = cudaEventRecord(link->dataReady,link->stream);CHKERRCUDA(cerr);
232: cerr = cudaStreamWaitEvent(link->remoteCommStream,link->dataReady,0);CHKERRCUDA(cerr);
233: }
234: return(0);
235: }
237: /* Build dependence between <stream> and <remoteCommStream> at the exit of NVSHMEM communication */
238: static PetscErrorCode PetscSFLinkBuildDependenceEnd(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
239: {
240: cudaError_t cerr;
241: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
242: PetscInt buflen = (direction == PETSCSF_ROOT2LEAF)? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE];
245: /* If unpacking to a non-null device buffer, build the endRemoteComm dependence */
246: if (buflen) {
247: cerr = cudaEventRecord(link->endRemoteComm,link->remoteCommStream);CHKERRCUDA(cerr);
248: cerr = cudaStreamWaitEvent(link->stream,link->endRemoteComm,0);CHKERRCUDA(cerr);
249: }
250: return(0);
251: }
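/* Editor's sketch (not part of the original file) of the event-based ordering used by the two helpers
   above: everything already enqueued on streamA becomes a prerequisite of future work on streamB,
   without blocking the host. streamA, streamB and ev are illustrative names. */
#if 0
{
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev,cudaEventDisableTiming); /* timing disabled: cheaper, only used for ordering */
  cudaEventRecord(ev,streamA);                          /* capture the current tail of streamA              */
  cudaStreamWaitEvent(streamB,ev,0);                    /* work launched on streamB after this waits for it */
  cudaEventDestroy(ev);
}
#endif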
253: /* Send/Put signals to remote ranks
255: Input parameters:
256: + n - Number of remote ranks
257: . sig - Signal address in symmetric heap
258: . sigdisp - To i-th rank, use its signal at offset sigdisp[i]
259: . ranks - remote ranks
260: - newval - Set signals to this value
261: */
262: __global__ static void NvshmemSendSignals(PetscInt n,uint64_t *sig,PetscInt *sigdisp,PetscMPIInt *ranks,uint64_t newval)
263: {
264: int i = blockIdx.x*blockDim.x + threadIdx.x;
266: /* Each thread puts one remote signal */
267: if (i < n) nvshmemx_uint64_signal(sig+sigdisp[i],newval,ranks[i]);
268: }
270: /* Wait until local signals equal the expected value, then set them to a new value
272: Input parameters:
273: + n - Number of signals
274: . sig - Local signal address
275: . expval - expected value
276: - newval - Set signals to this new value
277: */
278: __global__ static void NvshmemWaitSignals(PetscInt n,uint64_t *sig,uint64_t expval,uint64_t newval)
279: {
280: #if 0
281: /* Akhil Langer@NVIDIA said using 1 thread and nvshmem_uint64_wait_until_all is better */
282: int i = blockIdx.x*blockDim.x + threadIdx.x;
283: if (i < n) {
284: nvshmem_signal_wait_until(sig+i,NVSHMEM_CMP_EQ,expval);
285: sig[i] = newval;
286: }
287: #else
288: nvshmem_uint64_wait_until_all(sig,n,NULL/*no mask*/,NVSHMEM_CMP_EQ,expval);
289: for (int i=0; i<n; i++) sig[i] = newval;
290: #endif
291: }
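/* Editor's usage sketch (not part of the original file) showing how the two signal kernels above pair
   up: the producer raises one flag on each of the n consumer PEs, and each consumer blocks until all
   of its n local flags reach the expected value. n, sig, sigdisp_d, ranks_d and stream are
   illustrative placeholders. */
#if 0
/* producer: set one remote flag per consumer PE to 1 */
NvshmemSendSignals<<<(n+255)/256,256,0,stream>>>(n,sig,sigdisp_d,ranks_d,1);
/* consumer: wait until my n local flags are all 1, then reset them to 0 */
NvshmemWaitSignals<<<1,1,0,stream>>>(n,sig,1,0);
#endif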
293: /* ===========================================================================================================
295: A set of routines to support receiver-initiated communication using the get method
297: The getting protocol is:
299: Sender has a send buf (sbuf) and a signal variable (ssig); Receiver has a recv buf (rbuf) and a signal variable (rsig);
300: All signal variables have an initial value 0.
302:    Sender:                                        |  Receiver:
303:      1. Wait until ssig is 0, then set it to 1    |
304:      2. Pack data into stand-alone sbuf           |
305:      3. Put 1 to receiver's rsig                  |   1. Wait until rsig is 1, then set it to 0
306:                                                   |   2. Get data from remote sbuf to local rbuf
307:                                                   |   3. Put 0 to sender's ssig
308:                                                   |   4. Unpack data from local rbuf
309: ===========================================================================================================*/
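/* Editor's condensed sketch (not part of the original file) of one receiver-side round of the get
   protocol above, written as straight-line device code for a single sender. rbuf is my local recv
   buffer, sbuf the sender's symmetric send buffer, rsig my local recv signal, ssig the address of the
   sender's send signal on PE pe; all names and the single-rank shape are illustrative. */
#if 0
__global__ static void GetProtocolReceiverSketch(char *rbuf,const char *sbuf,uint64_t *rsig,uint64_t *ssig,size_t nbytes,int pe)
{
  nvshmem_uint64_wait_until(rsig,NVSHMEM_CMP_EQ,1); /* 1. wait until the sender grants access       */
  *rsig = 0;                                        /*    reset my recv signal for the next round   */
  nvshmem_getmem(rbuf,sbuf,nbytes,pe);              /* 2. pull the data from the sender's sbuf      */
  nvshmemx_uint64_signal(ssig,0,pe);                /* 3. clear the sender's ssig: its sbuf is free */
  /* 4. unpack rbuf */
}
#endif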
310: /* PrePack operation -- since the sender will overwrite the send buffer that receivers might still be getting data from,
311: the sender waits for signals (from the receivers) indicating they have finished getting the data
312: */
313: PetscErrorCode PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
314: {
315: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
316: uint64_t *sig;
317: PetscInt n;
320: if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */
321: sig = link->rootSendSig; /* leaf ranks set my rootSendSig */
322: n = bas->nRemoteLeafRanks;
323: } else { /* LEAF2ROOT */
324: sig = link->leafSendSig;
325: n = sf->nRemoteRootRanks;
326: }
328: if (n) {
329: NvshmemWaitSignals<<<1,1,0,link->remoteCommStream>>>(n,sig,0,1); /* wait the signals to be 0, then set them to 1 */
330: cudaError_t cerr = cudaGetLastError();CHKERRCUDA(cerr);
331: }
332: return(0);
333: }
335: /* n thread blocks; each handles one remote rank */
336: __global__ static void GetDataFromRemotelyAccessible(PetscInt nsrcranks,PetscMPIInt *srcranks,const char *src,PetscInt *srcdisp,char *dst,PetscInt *dstdisp,PetscInt unitbytes)
337: {
338: int bid = blockIdx.x;
339: PetscMPIInt pe = srcranks[bid];
341: if (!nvshmem_ptr(src,pe)) {
342: PetscInt nelems = (dstdisp[bid+1]-dstdisp[bid])*unitbytes;
343: nvshmem_getmem_nbi(dst+(dstdisp[bid]-dstdisp[0])*unitbytes,src+srcdisp[bid]*unitbytes,nelems,pe);
344: }
345: }
347: /* Start communication -- Get data in the given direction */
348: PetscErrorCode PetscSFLinkGetDataBegin_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
349: {
350: PetscErrorCode ierr;
351: cudaError_t cerr;
352: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
354: PetscInt nsrcranks,ndstranks,nLocallyAccessible = 0;
356: char *src,*dst;
357: PetscInt *srcdisp_h,*dstdisp_h;
358: PetscInt *srcdisp_d,*dstdisp_d;
359: PetscMPIInt *srcranks_h;
360: PetscMPIInt *srcranks_d,*dstranks_d;
361: uint64_t *dstsig;
362: PetscInt *dstsigdisp_d;
365: PetscSFLinkBuildDependenceBegin(sf,link,direction);
366: if (direction == PETSCSF_ROOT2LEAF) { /* src is root, dst is leaf; we will move data from src to dst */
367: nsrcranks = sf->nRemoteRootRanks;
368: src = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* root buf is the send buf; it is in symmetric heap */
370: srcdisp_h = sf->rootbufdisp; /* for my i-th remote root rank, I will access its buf at offset rootbufdisp[i] */
371: srcdisp_d = sf->rootbufdisp_d;
372: srcranks_h = sf->ranks+sf->ndranks; /* my (remote) root ranks */
373: srcranks_d = sf->ranks_d;
375: ndstranks = bas->nRemoteLeafRanks;
376: dst = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* recv buf is the local leaf buf, also in symmetric heap */
378: dstdisp_h = sf->roffset+sf->ndranks; /* offsets of the local leaf buf. Note dstdisp[0] is not necessarily 0 */
379: dstdisp_d = sf->roffset_d;
380: dstranks_d = bas->iranks_d; /* my (remote) leaf ranks */
382: dstsig = link->leafRecvSig;
383: dstsigdisp_d = bas->leafsigdisp_d;
384: } else { /* src is leaf, dst is root; we will move data from src to dst */
385: nsrcranks = bas->nRemoteLeafRanks;
386: src = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* leaf buf is the send buf */
388: srcdisp_h = bas->leafbufdisp; /* for my i-th remote leaf rank, I will access its leaf buf at offset leafbufdisp[i] */
389: srcdisp_d = bas->leafbufdisp_d;
390: srcranks_h = bas->iranks+bas->ndiranks; /* my (remote) leaf ranks */
391: srcranks_d = bas->iranks_d;
393: ndstranks = sf->nRemoteRootRanks;
394: dst = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* the local root buf is the recv buf */
396: dstdisp_h = bas->ioffset+bas->ndiranks; /* offsets of the local root buf. Note dstdisp[0] is not necessarily 0 */
397: dstdisp_d = bas->ioffset_d;
398: dstranks_d = sf->ranks_d; /* my (remote) root ranks */
400: dstsig = link->rootRecvSig;
401: dstsigdisp_d = sf->rootsigdisp_d;
402: }
404: /* After Pack operation -- src tells dst ranks that they are allowed to get data */
405: if (ndstranks) {
406: NvshmemSendSignals<<<(ndstranks+255)/256,256,0,link->remoteCommStream>>>(ndstranks,dstsig,dstsigdisp_d,dstranks_d,1); /* set signals to 1 */
407: cerr = cudaGetLastError();CHKERRCUDA(cerr);
408: }
410: /* dst waits for signals (permissions) from src ranks to start getting data */
411: if (nsrcranks) {
412: NvshmemWaitSignals<<<1,1,0,link->remoteCommStream>>>(nsrcranks,dstsig,1,0); /* wait the signals to be 1, then set them to 0 */
413: cerr = cudaGetLastError();CHKERRCUDA(cerr);
414: }
416: /* dst gets data from src ranks using non-blocking nvshmem_gets, which are finished in PetscSFLinkGetDataEnd_NVSHMEM() */
418: /* Count number of locally accessible src ranks, which should be a small number */
419: for (int i=0; i<nsrcranks; i++) {if (nvshmem_ptr(src,srcranks_h[i])) nLocallyAccessible++;}
421: /* Get data from remotely accessible PEs */
422: if (nLocallyAccessible < nsrcranks) {
423: GetDataFromRemotelyAccessible<<<nsrcranks,1,0,link->remoteCommStream>>>(nsrcranks,srcranks_d,src,srcdisp_d,dst,dstdisp_d,link->unitbytes);
424: cerr = cudaGetLastError();CHKERRCUDA(cerr);
425: }
427: /* Get data from locally accessible PEs */
428: if (nLocallyAccessible) {
429: for (int i=0; i<nsrcranks; i++) {
430: int pe = srcranks_h[i];
431: if (nvshmem_ptr(src,pe)) {
432: size_t nelems = (dstdisp_h[i+1]-dstdisp_h[i])*link->unitbytes;
433: nvshmemx_getmem_nbi_on_stream(dst+(dstdisp_h[i]-dstdisp_h[0])*link->unitbytes,src+srcdisp_h[i]*link->unitbytes,nelems,pe,link->remoteCommStream);
434: }
435: }
436: }
437: return(0);
438: }
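/* Editor's side note (not part of the original file): nvshmem_ptr(sym_addr,pe) returns a non-NULL
   local pointer when PE <pe>'s symmetric memory is directly load/store accessible from this GPU
   (e.g. same node over NVLink/PCIe) and NULL otherwise; that is the test used above to split the
   locally vs remotely accessible paths. src and pe below are illustrative. */
#if 0
char *p = (char*)nvshmem_ptr(src,pe); /* src must be a symmetric address */
if (p) {
  /* direct-access path: p can be read/written by kernels or cudaMemcpyAsync on this GPU */
} else {
  /* network path: fall back to nvshmem_getmem()/nvshmem_putmem() */
}
#endif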
440: /* Finish the communication (can be done before Unpack)
441: The receiver tells its senders that they may reuse their send buffers (since the receiver has already gotten the data from them)
442: */
443: PetscErrorCode PetscSFLinkGetDataEnd_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
444: {
445: PetscErrorCode ierr;
446: cudaError_t cerr;
447: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
448: uint64_t *srcsig;
449: PetscInt nsrcranks,*srcsigdisp;
450: PetscMPIInt *srcranks;
453: if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */
454: nsrcranks = sf->nRemoteRootRanks;
455: srcsig = link->rootSendSig; /* I want to set their root signal */
456: srcsigdisp = sf->rootsigdisp_d; /* offset of each root signal */
457: srcranks = sf->ranks_d; /* ranks of the n root ranks */
458: } else { /* LEAF2ROOT, root ranks are getting data */
459: nsrcranks = bas->nRemoteLeafRanks;
460: srcsig = link->leafSendSig;
461: srcsigdisp = bas->leafsigdisp_d;
462: srcranks = bas->iranks_d;
463: }
465: if (nsrcranks) {
466: nvshmemx_quiet_on_stream(link->remoteCommStream); /* Finish the nonblocking get, so that we can unpack afterwards */
467: cerr = cudaGetLastError();CHKERRCUDA(cerr);
468: NvshmemSendSignals<<<(nsrcranks+511)/512,512,0,link->remoteCommStream>>>(nsrcranks,srcsig,srcsigdisp,srcranks,0); /* set signals to 0 */
469: cerr = cudaGetLastError();CHKERRCUDA(cerr);
470: }
471: PetscSFLinkBuildDependenceEnd(sf,link,direction);
472: return(0);
473: }
475: /* ===========================================================================================================
477: A set of routines to support sender-initiated communication using the put-based method (the default)
479: The putting protocol is:
481: Sender has a send buf (sbuf) and a send signal var (ssig); Receiver has a stand-alone recv buf (rbuf)
482: and a recv signal var (rsig); All signal variables have an initial value 0. rbuf is allocated by SF and
483: is in nvshmem space.
485:    Sender:                                        |  Receiver:
486:                                                   |
487:      1. Pack data into sbuf                       |
488:      2. Wait until ssig is 0, then set it to 1    |
489:      3. Put data to remote stand-alone rbuf       |
490:      4. Fence // make sure 5 happens after 3      |
491:      5. Put 1 to receiver's rsig                  |   1. Wait until rsig is 1, then set it to 0
492:                                                   |   2. Unpack data from local rbuf
493:                                                   |   3. Put 0 to sender's ssig
494: ===========================================================================================================*/
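/* Editor's condensed sketch (not part of the original file) of one sender-side round of the put
   protocol above, written as straight-line device code for a single receiver. sbuf is my packed
   symmetric send buffer, rbuf the address of the receiver's stand-alone recv buffer on PE pe, ssig my
   local send signal, rsig the address of the receiver's recv signal; names are illustrative. */
#if 0
__global__ static void PutProtocolSenderSketch(const char *sbuf,char *rbuf,uint64_t *ssig,uint64_t *rsig,size_t nbytes,int pe)
{
  nvshmem_uint64_wait_until(ssig,NVSHMEM_CMP_EQ,0); /* 2. wait until the receiver has freed my last put */
  *ssig = 1;
  nvshmem_putmem_nbi(rbuf,sbuf,nbytes,pe);          /* 3. push the packed data                          */
  nvshmem_fence();                                  /* 4. order the data put before the signal put      */
  nvshmemx_uint64_signal(rsig,1,pe);                /* 5. tell the receiver its rbuf is ready           */
}
#endif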
496: /* n thread blocks; each handles one remote rank */
497: __global__ static void WaitAndPutDataToRemotelyAccessible(PetscInt ndstranks,PetscMPIInt *dstranks,char *dst,PetscInt *dstdisp,const char *src,PetscInt *srcdisp,uint64_t *srcsig,PetscInt unitbytes)
498: {
499: int bid = blockIdx.x;
500: PetscMPIInt pe = dstranks[bid];
502: if (!nvshmem_ptr(dst,pe)) {
503: PetscInt nelems = (srcdisp[bid+1]-srcdisp[bid])*unitbytes;
504: nvshmem_uint64_wait_until(srcsig+bid,NVSHMEM_CMP_EQ,0); /* Wait until the sig = 0 */
505: srcsig[bid] = 1;
506: nvshmem_putmem_nbi(dst+dstdisp[bid]*unitbytes,src+(srcdisp[bid]-srcdisp[0])*unitbytes,nelems,pe);
507: }
508: }
510: /* A one-thread kernel that handles all locally accessible destination ranks */
511: __global__ static void WaitSignalsFromLocallyAccessible(PetscInt ndstranks,PetscMPIInt *dstranks,uint64_t *srcsig,const char *dst)
512: {
513: for (int i=0; i<ndstranks; i++) {
514: int pe = dstranks[i];
515: if (nvshmem_ptr(dst,pe)) {
516: nvshmem_uint64_wait_until(srcsig+i,NVSHMEM_CMP_EQ,0); /* Wait until the sig = 0 */
517: srcsig[i] = 1;
518: }
519: }
520: }
522: /* Put data in the given direction */
523: PetscErrorCode PetscSFLinkPutDataBegin_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
524: {
525: PetscErrorCode ierr;
526: cudaError_t cerr;
527: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
528: PetscInt ndstranks,nLocallyAccessible = 0;
529: char *src,*dst;
530: PetscInt *srcdisp_h,*dstdisp_h;
531: PetscInt *srcdisp_d,*dstdisp_d;
532: PetscMPIInt *dstranks_h;
533: PetscMPIInt *dstranks_d;
534: uint64_t *srcsig;
537: PetscSFLinkBuildDependenceBegin(sf,link,direction);
538: if (direction == PETSCSF_ROOT2LEAF) { /* put data in rootbuf to leafbuf */
539: ndstranks = bas->nRemoteLeafRanks; /* number of (remote) leaf ranks */
540: src = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* Both src & dst must be symmetric */
541: dst = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
543: srcdisp_h = bas->ioffset+bas->ndiranks; /* offsets of rootbuf. srcdisp[0] is not necessarily zero */
544: srcdisp_d = bas->ioffset_d;
545: srcsig = link->rootSendSig;
547: dstdisp_h = bas->leafbufdisp; /* for my i-th remote leaf rank, I will access its leaf buf at offset leafbufdisp[i] */
548: dstdisp_d = bas->leafbufdisp_d;
549: dstranks_h = bas->iranks+bas->ndiranks; /* remote leaf ranks */
550: dstranks_d = bas->iranks_d;
551: } else { /* put data in leafbuf to rootbuf */
552: ndstranks = sf->nRemoteRootRanks;
553: src = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
554: dst = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
556: srcdisp_h = sf->roffset+sf->ndranks; /* offsets of leafbuf */
557: srcdisp_d = sf->roffset_d;
558: srcsig = link->leafSendSig;
560: dstdisp_h = sf->rootbufdisp; /* for my i-th remote root rank, I will access its root buf at offset rootbufdisp[i] */
561: dstdisp_d = sf->rootbufdisp_d;
562: dstranks_h = sf->ranks+sf->ndranks; /* remote root ranks */
563: dstranks_d = sf->ranks_d;
564: }
566: /* Wait for signals and then put data to dst ranks using non-blocking nvshmem_put, which are finished in PetscSFLinkPutDataEnd_NVSHMEM */
568: /* Count number of locally accessible neighbors, which should be a small number */
569: for (int i=0; i<ndstranks; i++) {if (nvshmem_ptr(dst,dstranks_h[i])) nLocallyAccessible++;}
571: /* For remotely accessible PEs, send data to them in one kernel call */
572: if (nLocallyAccessible < ndstranks) {
573: WaitAndPutDataToRemotelyAccessible<<<ndstranks,1,0,link->remoteCommStream>>>(ndstranks,dstranks_d,dst,dstdisp_d,src,srcdisp_d,srcsig,link->unitbytes);
574: cerr = cudaGetLastError();CHKERRCUDA(cerr);
575: }
577: /* For locally accessible PEs, use the host API, which uses CUDA copy engines and is much faster than the device API */
578: if (nLocallyAccessible) {
579: WaitSignalsFromLocallyAccessible<<<1,1,0,link->remoteCommStream>>>(ndstranks,dstranks_d,srcsig,dst);
580: for (int i=0; i<ndstranks; i++) {
581: int pe = dstranks_h[i];
582: if (nvshmem_ptr(dst,pe)) { /* If return a non-null pointer, then <pe> is locally accessible */
583: size_t nelems = (srcdisp_h[i+1]-srcdisp_h[i])*link->unitbytes;
584: /* Initiate the nonblocking communication */
585: nvshmemx_putmem_nbi_on_stream(dst+dstdisp_h[i]*link->unitbytes,src+(srcdisp_h[i]-srcdisp_h[0])*link->unitbytes,nelems,pe,link->remoteCommStream);
586: }
587: }
588: }
590: if (nLocallyAccessible) {
591: nvshmemx_quiet_on_stream(link->remoteCommStream); /* Calling nvshmem_fence/quiet() does not fence the above nvshmemx_putmem_nbi_on_stream! */
592: }
593: return(0);
594: }
596: /* A one-thread kernel; the single thread handles all remote PEs */
597: __global__ static void PutDataEnd(PetscInt nsrcranks,PetscInt ndstranks,PetscMPIInt *dstranks,uint64_t *dstsig,PetscInt *dstsigdisp)
598: {
599: /* TODO: Should we finish the non-blocking remote puts here? */
601: /* 1. Send a signal to each dst rank */
603: /* According to Akhil@NVIDIA, IB is ordered, so no fence is needed for remote PEs.
604: For local PEs, we already called nvshmemx_quiet_on_stream(). Therefore, we are good to send signals to all dst ranks now.
605: */
606: for (int i=0; i<ndstranks; i++) {nvshmemx_uint64_signal(dstsig+dstsigdisp[i],1,dstranks[i]);} /* set sig to 1 */
608: /* 2. Wait for signals from src ranks (if any) */
609: if (nsrcranks) {
610: nvshmem_uint64_wait_until_all(dstsig,nsrcranks,NULL/*no mask*/,NVSHMEM_CMP_EQ,1); /* wait sigs to be 1, then set them to 0 */
611: for (int i=0; i<nsrcranks; i++) dstsig[i] = 0;
612: }
613: }
615: /* Finish the communication -- A receiver waits until it can access its receive buffer */
616: PetscErrorCode PetscSFLinkPutDataEnd_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
617: {
618: PetscErrorCode ierr;
619: cudaError_t cerr;
620: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
621: PetscMPIInt *dstranks;
622: uint64_t *dstsig;
623: PetscInt nsrcranks,ndstranks,*dstsigdisp;
626: if (direction == PETSCSF_ROOT2LEAF) { /* put root data to leaf */
627: nsrcranks = sf->nRemoteRootRanks;
629: ndstranks = bas->nRemoteLeafRanks;
630: dstranks = bas->iranks_d; /* leaf ranks */
631: dstsig = link->leafRecvSig; /* I will set my leaf ranks's RecvSig */
632: dstsigdisp = bas->leafsigdisp_d; /* for my i-th remote leaf rank, I will access its signal at offset leafsigdisp[i] */
633: } else { /* LEAF2ROOT */
634: nsrcranks = bas->nRemoteLeafRanks;
636: ndstranks = sf->nRemoteRootRanks;
637: dstranks = sf->ranks_d;
638: dstsig = link->rootRecvSig;
639: dstsigdisp = sf->rootsigdisp_d;
640: }
642: if (nsrcranks || ndstranks) {
643: PutDataEnd<<<1,1,0,link->remoteCommStream>>>(nsrcranks,ndstranks,dstranks,dstsig,dstsigdisp);
644: cerr = cudaGetLastError();CHKERRCUDA(cerr);
645: }
646: PetscSFLinkBuildDependenceEnd(sf,link,direction);
647: return(0);
648: }
650: /* PostUnpack operation -- A receiver tells its senders that they are allowed to put data here (i.e., its recv buffer is free to accept new data) */
651: PetscErrorCode PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
652: {
653: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
654: uint64_t *srcsig;
655: PetscInt nsrcranks,*srcsigdisp_d;
656: PetscMPIInt *srcranks_d;
659: if (direction == PETSCSF_ROOT2LEAF) { /* I allow my root ranks to put data to me */
660: nsrcranks = sf->nRemoteRootRanks;
661: srcsig = link->rootSendSig; /* I want to set their send signals */
662: srcsigdisp_d = sf->rootsigdisp_d; /* offset of each root signal */
663: srcranks_d = sf->ranks_d; /* ranks of the n root ranks */
664: } else { /* LEAF2ROOT */
665: nsrcranks = bas->nRemoteLeafRanks;
666: srcsig = link->leafSendSig;
667: srcsigdisp_d = bas->leafsigdisp_d;
668: srcranks_d = bas->iranks_d;
669: }
671: if (nsrcranks) {
672: NvshmemSendSignals<<<(nsrcranks+255)/256,256,0,link->remoteCommStream>>>(nsrcranks,srcsig,srcsigdisp_d,srcranks_d,0); /* Set remote signals to 0 */
673: cudaError_t cerr = cudaGetLastError();CHKERRCUDA(cerr);
674: }
675: return(0);
676: }
678: /* Destructor when the link uses nvshmem for communication */
679: static PetscErrorCode PetscSFLinkDestroy_NVSHMEM(PetscSF sf,PetscSFLink link)
680: {
681: PetscErrorCode ierr;
682: cudaError_t cerr;
685: cerr = cudaEventDestroy(link->dataReady);CHKERRCUDA(cerr);
686: cerr = cudaEventDestroy(link->endRemoteComm);CHKERRCUDA(cerr);
687: cerr = cudaStreamDestroy(link->remoteCommStream);CHKERRCUDA(cerr);
689: /* nvshmem does not need buffers on host, which should be NULL */
690: PetscNvshmemFree(link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
691: PetscNvshmemFree(link->leafSendSig);
692: PetscNvshmemFree(link->leafRecvSig);
693: PetscNvshmemFree(link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
694: PetscNvshmemFree(link->rootSendSig);
695: PetscNvshmemFree(link->rootRecvSig);
696: return(0);
697: }
699: PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
700: {
701: PetscErrorCode ierr;
702: cudaError_t cerr;
703: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
704: PetscSFLink *p,link;
705: PetscBool match,rootdirect[2],leafdirect[2];
706: int greatestPriority;
709: /* Check to see if we can directly send/recv root/leafdata with the given sf, sfop and op.
710: We only care about root/leafdirect[PETSCSF_REMOTE], since we never need intermediate buffers for local communication with NVSHMEM.
711: */
712: if (sfop == PETSCSF_BCAST) { /* Move data from rootbuf to leafbuf */
713: if (sf->use_nvshmem_get) {
714: rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* send buffer has to be stand-alone (can't be rootdata) */
715: leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;
716: } else {
717: rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE;
718: leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* Our put-protocol always needs a nvshmem alloc'ed recv buffer */
719: }
720: } else if (sfop == PETSCSF_REDUCE) { /* Move data from leafbuf to rootbuf */
721: if (sf->use_nvshmem_get) {
722: rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE;
723: leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
724: } else {
725: rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
726: leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE;
727: }
728: } else { /* PETSCSF_FETCH */
729: rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
730: leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
731: }
733: /* Look for free nvshmem links in cache */
734: for (p=&bas->avail; (link=*p); p=&link->next) {
735: if (link->use_nvshmem) {
736: MPIPetsc_Type_compare(unit,link->unit,&match);
737: if (match) {
738: *p = link->next; /* Remove from available list */
739: goto found;
740: }
741: }
742: }
743: PetscNew(&link);
744: PetscSFLinkSetUp_Host(sf,link,unit); /* Compute link->unitbytes, dup link->unit etc. */
745: if (sf->backend == PETSCSF_BACKEND_CUDA) {PetscSFLinkSetUp_CUDA(sf,link,unit);} /* Setup pack routines, streams etc */
746: #if defined(PETSC_HAVE_KOKKOS)
747: else if (sf->backend == PETSCSF_BACKEND_KOKKOS) {PetscSFLinkSetUp_Kokkos(sf,link,unit);}
748: #endif
750: link->rootdirect[PETSCSF_LOCAL] = PETSC_TRUE; /* For the local part we directly use root/leafdata */
751: link->leafdirect[PETSCSF_LOCAL] = PETSC_TRUE;
753: /* Init signals to zero */
754: if (!link->rootSendSig) {PetscNvshmemCalloc(bas->nRemoteLeafRanksMax*sizeof(uint64_t),(void**)&link->rootSendSig);}
755: if (!link->rootRecvSig) {PetscNvshmemCalloc(bas->nRemoteLeafRanksMax*sizeof(uint64_t),(void**)&link->rootRecvSig);}
756: if (!link->leafSendSig) {PetscNvshmemCalloc(sf->nRemoteRootRanksMax*sizeof(uint64_t),(void**)&link->leafSendSig);}
757: if (!link->leafRecvSig) {PetscNvshmemCalloc(sf->nRemoteRootRanksMax*sizeof(uint64_t),(void**)&link->leafRecvSig);}
759: link->use_nvshmem = PETSC_TRUE;
760: link->rootmtype = PETSC_MEMTYPE_DEVICE; /* Only need 0/1-based mtype from now on */
761: link->leafmtype = PETSC_MEMTYPE_DEVICE;
762: /* Overwrite some function pointers set by PetscSFLinkSetUp_CUDA */
763: link->Destroy = PetscSFLinkDestroy_NVSHMEM;
764: if (sf->use_nvshmem_get) { /* get-based protocol */
765: link->PrePack = PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM;
766: link->StartCommunication = PetscSFLinkGetDataBegin_NVSHMEM;
767: link->FinishCommunication = PetscSFLinkGetDataEnd_NVSHMEM;
768: } else { /* put-based protocol */
769: link->StartCommunication = PetscSFLinkPutDataBegin_NVSHMEM;
770: link->FinishCommunication = PetscSFLinkPutDataEnd_NVSHMEM;
771: link->PostUnpack = PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM;
772: }
774: cerr = cudaDeviceGetStreamPriorityRange(NULL,&greatestPriority);CHKERRCUDA(cerr);
775: cerr = cudaStreamCreateWithPriority(&link->remoteCommStream,cudaStreamNonBlocking,greatestPriority);CHKERRCUDA(cerr);
777: cerr = cudaEventCreateWithFlags(&link->dataReady,cudaEventDisableTiming);CHKERRCUDA(cerr);
778: cerr = cudaEventCreateWithFlags(&link->endRemoteComm,cudaEventDisableTiming);CHKERRCUDA(cerr);
780: found:
781: if (rootdirect[PETSCSF_REMOTE]) {
782: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char*)rootdata + bas->rootstart[PETSCSF_REMOTE]*link->unitbytes;
783: } else {
784: if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) {
785: PetscNvshmemMalloc(bas->rootbuflen_rmax*link->unitbytes,(void**)&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
786: }
787: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
788: }
790: if (leafdirect[PETSCSF_REMOTE]) {
791: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char*)leafdata + sf->leafstart[PETSCSF_REMOTE]*link->unitbytes;
792: } else {
793: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) {
794: PetscNvshmemMalloc(sf->leafbuflen_rmax*link->unitbytes,(void**)&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]);
795: }
796: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
797: }
799: link->rootdirect[PETSCSF_REMOTE] = rootdirect[PETSCSF_REMOTE];
800: link->leafdirect[PETSCSF_REMOTE] = leafdirect[PETSCSF_REMOTE];
801: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
802: link->leafdata = leafdata;
803: link->next = bas->inuse;
804: bas->inuse = link;
805: *mylink = link;
806: return(0);
807: }
809: #if defined(PETSC_USE_REAL_SINGLE)
810: PetscErrorCode PetscNvshmemSum(PetscInt count,float *dst,const float *src)
811: {
812: PetscErrorCode ierr;
813: PetscMPIInt num; /* Assume nvshmem's int is MPI's int */
816: PetscMPIIntCast(count,&num);
817: nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
818: return(0);
819: }
821: PetscErrorCode PetscNvshmemMax(PetscInt count,float *dst,const float *src)
822: {
823: PetscErrorCode ierr;
824: PetscMPIInt num;
827: PetscMPIIntCast(count,&num);
828: nvshmemx_float_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
829: return(0);
830: }
831: #elif defined(PETSC_USE_REAL_DOUBLE)
832: PetscErrorCode PetscNvshmemSum(PetscInt count,double *dst,const double *src)
833: {
834: PetscErrorCode ierr;
835: PetscMPIInt num;
838: PetscMPIIntCast(count,&num);
839: nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
840: return(0);
841: }
843: PetscErrorCode PetscNvshmemMax(PetscInt count,double *dst,const double *src)
844: {
845: PetscErrorCode ierr;
846: PetscMPIInt num;
849: PetscMPIIntCast(count,&num);
850: nvshmemx_double_max_reduce_on_stream(NVSHMEM_TEAM_WORLD,dst,src,num,PetscDefaultCudaStream);
851: return(0);
852: }
853: #endif
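/* Editor's usage sketch (not part of the original file) for the reduction wrappers above: both dst and
   src must live in the NVSHMEM symmetric heap, and the call is collective over NVSHMEM_TEAM_WORLD on
   PetscDefaultCudaStream. Buffer names and the count are illustrative. */
#if 0
{
  PetscReal *src = NULL,*dst = NULL;
  PetscInt  n = 8;
  PetscNvshmemCalloc(n*sizeof(PetscReal),(void**)&src);
  PetscNvshmemCalloc(n*sizeof(PetscReal),(void**)&dst);
  /* ... fill src on the device ... */
  PetscNvshmemSum(n,dst,src); /* dst[i] = sum over all PEs of src[i] */
  cudaStreamSynchronize(PetscDefaultCudaStream);
  PetscNvshmemFree(src);
  PetscNvshmemFree(dst);
}
#endif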