Actual source code: sfhip.hip.cpp
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: /* compilation issues on SPOCK */
4: #undef PETSC_HAVE_COMPLEX
6: /* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
7: __device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
8: {
9: PetscInt i,j,k,m,n,r;
10: const PetscInt *offset,*start,*dx,*dy,*X,*Y;
12: n = opt[0];
13: offset = opt + 1;
14: start = opt + n + 2;
15: dx = opt + 2*n + 2;
16: dy = opt + 3*n + 2;
17: X = opt + 5*n + 2;
18: Y = opt + 6*n + 2;
19: for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
20: m = (tid - offset[r]);
21: k = m/(dx[r]*dy[r]);
22: j = (m - k*dx[r]*dy[r])/dx[r];
23: i = m - k*dx[r]*dy[r] - j*dx[r];
25: return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
26: }
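/* A minimal host-side sketch of the opt[] layout consumed above, with made-up sizes (kept under
   #if 0 so it is not compiled). It encodes one subdomain of extent dx=2, dy=3, dz=4 starting at
   index start=5 inside an X=10 by Y=20 array. The layout is inferred from the pointer arithmetic
   in MapTidToIndex: opt = {n, offset[0..n], start[0..n-1], dx[], dy[], dz[], X[], Y[]}; the dz
   slot at opt+4*n+2 is not read by this routine. */
#if 0
#include <assert.h>
static PetscInt MapTidToIndexHost(const PetscInt *opt,PetscInt tid) /* same arithmetic as the device version */
{
  PetscInt n = opt[0],r,m,i,j,k;
  const PetscInt *offset = opt+1,*start = opt+n+2,*dx = opt+2*n+2,*dy = opt+3*n+2,*X = opt+5*n+2,*Y = opt+6*n+2;
  for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
  m = tid - offset[r];
  k = m/(dx[r]*dy[r]);
  j = (m - k*dx[r]*dy[r])/dx[r];
  i = m - k*dx[r]*dy[r] - j*dx[r];
  return start[r] + k*X[r]*Y[r] + j*X[r] + i;
}
static void TestMapTidToIndex(void)
{
  const PetscInt opt[] = {1, 0,24, 5, 2,3,4, 10, 20}; /* n=1; offset={0,24}; start=5; dx=2,dy=3,dz=4; X=10; Y=20 */
  assert(MapTidToIndexHost(opt,0) == 5);              /* (i,j,k) = (0,0,0) -> 5                                  */
  assert(MapTidToIndexHost(opt,7) == 206);            /* (i,j,k) = (1,0,1) -> 5 + 1*10*20 + 0*10 + 1             */
}
#endif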
28: /*====================================================================================*/
29: /* Templated HIP kernels for pack/unpack. The Op can be regular or atomic */
30: /*====================================================================================*/
32: /* Suppose user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI data type made of 16 PetscReals, then
33: <Type> is PetscReal, which is the primitive type we operate on.
34: <bs> is 16, which says <unit> contains 16 primitive types.
35: <BS> is 8, which is the maximal SIMD width with which we will try to vectorize operations on <unit>.
36: <EQ> is 0, which is (bs == BS ? 1 : 0)
38: If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, making MBS below a compile-time constant.
39: For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, the inner for-loops below will be totally unrolled.
40: */
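/* A minimal host-side analogue of the trick described above, with hypothetical names (kept under
   #if 0 so it is not compiled): when EQ=1, MBS collapses to the compile-time constant BS and the
   inner copy loop can be fully unrolled; when EQ=0, MBS equals the run-time bs. */
#if 0
template<int BS,int EQ>
static void PackHost(int bs,int count,const int *idx,const double *data,double *buf)
{
  const int M   = EQ ? 1 : bs/BS;
  const int MBS = M*BS;                                /* == bs; a compile-time constant iff EQ == 1   */
  for (int tid=0; tid<count; tid++) {
    const int t = idx[tid]*MBS,s = tid*MBS;
    for (int i=0; i<MBS; i++) buf[s+i] = data[t+i];    /* unrollable when MBS is known at compile time */
  }
}
/* PackHost<8,1>(8,...)  : unit of 8 doubles, MBS == 8 is known to the compiler;
   PackHost<8,0>(16,...) : unit of 16 doubles, MBS == 16 is computed at run time. */
#endif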
41: template<class Type,PetscInt BS,PetscInt EQ>
42: __global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
43: {
44: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
45: const PetscInt grid_size = gridDim.x * blockDim.x;
46: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */
47: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */
49: for (; tid<count; tid += grid_size) {
50: /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
51: opt == NULL && idx == NULL ==> the indices are contiguous;
52: */
53: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
54: s = tid*MBS;
55: for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
56: }
57: }
59: template<class Type,class Op,PetscInt BS,PetscInt EQ>
60: __global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
61: {
62: PetscInt i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
63: const PetscInt grid_size = gridDim.x * blockDim.x;
64: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
65: Op op;
67: for (; tid<count; tid += grid_size) {
68: t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
69: s = tid*MBS;
70: for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
71: }
72: }
74: template<class Type,class Op,PetscInt BS,PetscInt EQ>
75: __global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
76: {
77: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
78: const PetscInt grid_size = gridDim.x * blockDim.x;
79: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
80: Op op;
82: for (; tid<count; tid += grid_size) {
83: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
84: l = tid*MBS;
85: for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
86: }
87: }
89: template<class Type,class Op,PetscInt BS,PetscInt EQ>
90: __global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
91: {
92: PetscInt i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
93: const PetscInt grid_size = gridDim.x * blockDim.x;
94: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
95: Op op;
97: for (; tid<count; tid += grid_size) {
98: if (!srcIdx) { /* src is either contiguous or 3D */
99: k = tid/(srcx*srcy);
100: j = (tid - k*srcx*srcy)/srcx;
101: i = tid - k*srcx*srcy - j*srcx;
102: s = srcStart + k*srcX*srcY + j*srcX + i;
103: } else {
104: s = srcIdx[tid];
105: }
107: if (!dstIdx) { /* dst is either contiguous or 3D */
108: k = tid/(dstx*dsty);
109: j = (tid - k*dstx*dsty)/dstx;
110: i = tid - k*dstx*dsty - j*dstx;
111: t = dstStart + k*dstX*dstY + j*dstX + i;
112: } else {
113: t = dstIdx[tid];
114: }
116: s *= MBS;
117: t *= MBS;
118: for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
119: }
120: }
122: template<class Type,class Op,PetscInt BS,PetscInt EQ>
123: __global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
124: {
125: PetscInt i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
126: const PetscInt grid_size = gridDim.x * blockDim.x;
127: const PetscInt M = (EQ) ? 1 : bs/BS, MBS = M*BS;
128: Op op;
130: for (; tid<count; tid += grid_size) {
131: r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
132: l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
133: for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
134: }
135: }
137: /*====================================================================================*/
138: /* Regular operations on device */
139: /*====================================================================================*/
140: template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = y; return old;}};
141: template<typename Type> struct Add {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y; return old;}};
142: template<typename Type> struct Mult {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y; return old;}};
143: template<typename Type> struct Min {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMin(x,y); return old;}};
144: template<typename Type> struct Max {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = PetscMax(x,y); return old;}};
145: template<typename Type> struct LAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x && y; return old;}};
146: template<typename Type> struct LOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x || y; return old;}};
147: template<typename Type> struct LXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = !x != !y; return old;}};
148: template<typename Type> struct BAND {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x & y; return old;}};
149: template<typename Type> struct BOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x | y; return old;}};
150: template<typename Type> struct BXOR {__device__ Type operator() (Type& x,Type y) const {Type old = x; x = x ^ y; return old;}};
151: template<typename Type> struct Minloc {
152: __device__ Type operator() (Type& x,Type y) const {
153: Type old = x;
154: if (y.a < x.a) x = y;
155: else if (y.a == x.a) x.b = min(x.b,y.b);
156: return old;
157: }
158: };
159: template<typename Type> struct Maxloc {
160: __device__ Type operator() (Type& x,Type y) const {
161: Type old = x;
162: if (y.a > x.a) x = y;
163: else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
164: return old;
165: }
166: };
168: /*====================================================================================*/
169: /* Atomic operations on device */
170: /*====================================================================================*/
172: /*
173: Atomic Insert (exchange) operations
175: See the CUDA version.
176: */
177: __device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((ullint*)address,__double_as_longlong(val)));}
179: __device__ static llint atomicExch(llint* address,llint val) {return (llint)(atomicExch((ullint*)address,(ullint)val));}
181: template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};
183: #if defined(PETSC_HAVE_COMPLEX)
184: #if defined(PETSC_USE_REAL_DOUBLE)
185: template<> struct AtomicInsert<PetscComplex> {
186: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
187: PetscComplex old; double *z = (double*)&old; /* view the complex as two doubles */
188: double *xp = (double*)&x,*yp = (double*)&y;
189: AtomicInsert<double> op;
190: z[0] = op(xp[0],yp[0]);
191: z[1] = op(xp[1],yp[1]);
192: return old; /* The returned value may not be atomic; it can be a mix of two separate updates. The caller should discard it. */
193: }
194: };
195: #elif defined(PETSC_USE_REAL_SINGLE)
196: template<> struct AtomicInsert<PetscComplex> {
197: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
198: double *xp = (double*)&x,*yp = (double*)&y; /* a single-precision complex is 8 bytes, so exchange it as one 64-bit double */
199: AtomicInsert<double> op;
200: return op(xp[0],yp[0]);
201: }
202: };
203: #endif
204: #endif
206: /*
207: Atomic add operations
209: */
210: __device__ static llint atomicAdd(llint* address,llint val) {return (llint)atomicAdd((ullint*)address,(ullint)val);}
212: template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};
214: template<> struct AtomicAdd<double> {
215: __device__ double operator() (double& x,double y) const {
216: /* The CUDA version does more checks that may also be needed here */
217: return atomicAdd(&x,y);
218: }
219: };
221: template<> struct AtomicAdd<float> {
222: __device__ float operator() (float& x,float y) const {
223: /* The CUDA version does more checks that may also be needed here */
224: return atomicAdd(&x,y);
225: }
226: };
228: #if defined(PETSC_HAVE_COMPLEX)
229: template<> struct AtomicAdd<PetscComplex> {
230: __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
231: PetscComplex old; PetscReal *z = (PetscReal*)&old; /* view the complex as two reals */
232: PetscReal *xp = (PetscReal*)&x,*yp = (PetscReal*)&y;
233: AtomicAdd<PetscReal> op;
234: z[0] = op(xp[0],yp[0]);
235: z[1] = op(xp[1],yp[1]);
236: return old; /* The returned value may not be atomic; it can be a mix of two separate updates. The caller should discard it. */
237: }
238: };
239: #endif
241: /*
242: Atomic Mult operations:
244: HIP has no atomicMult at all, so we build our own with atomicCAS
245: */
246: #if defined(PETSC_USE_REAL_DOUBLE)
247: __device__ static double atomicMult(double* address, double val)
248: {
249: ullint *address_as_ull = (ullint*)(address);
250: ullint old = *address_as_ull, assumed;
251: do {
252: assumed = old;
253: /* Other threads can access and modify value of *address_as_ull after the read above and before the write below */
254: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
255: } while (assumed != old);
256: return __longlong_as_double(old);
257: }
258: #elif defined(PETSC_USE_REAL_SINGLE)
259: __device__ static float atomicMult(float* address,float val)
260: {
261: int *address_as_int = (int*)(address);
262: int old = *address_as_int, assumed;
263: do {
264: assumed = old;
265: old = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
266: } while (assumed != old);
267: return __int_as_float(old);
268: }
269: #endif
271: __device__ static int atomicMult(int* address,int val)
272: {
273: int *address_as_int = (int*)(address);
274: int old = *address_as_int, assumed;
275: do {
276: assumed = old;
277: old = atomicCAS(address_as_int, assumed, val*assumed);
278: } while (assumed != old);
279: return (int)old;
280: }
282: __device__ static llint atomicMult(llint* address,llint val)
283: {
284: ullint *address_as_ull = (ullint*)(address);
285: ullint old = *address_as_ull, assumed;
286: do {
287: assumed = old;
288: old = atomicCAS(address_as_ull, assumed, (ullint)(val*(llint)assumed));
289: } while (assumed != old);
290: return (llint)old;
291: }
293: template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};
295: /*
296: Atomic Min/Max operations
298: See CUDA version for comments.
299: */
301: #if defined(PETSC_USE_REAL_DOUBLE)
302: __device__ static double atomicMin(double* address, double val)
303: {
304: ullint *address_as_ull = (ullint*)(address);
305: ullint old = *address_as_ull, assumed;
306: do {
307: assumed = old;
308: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
309: } while (assumed != old);
310: return __longlong_as_double(old);
311: }
313: __device__ static double atomicMax(double* address, double val)
314: {
315: ullint *address_as_ull = (ullint*)(address);
316: ullint old = *address_as_ull, assumed;
317: do {
318: assumed = old;
319: old = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
320: } while (assumed != old);
321: return __longlong_as_double(old);
322: }
323: #elif defined(PETSC_USE_REAL_SINGLE)
324: __device__ static float atomicMin(float* address,float val)
325: {
326: int *address_as_int = (int*)(address);
327: int old = *address_as_int, assumed;
328: do {
329: assumed = old;
330: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
331: } while (assumed != old);
332: return __int_as_float(old);
333: }
335: __device__ static float atomicMax(float* address,float val)
336: {
337: int *address_as_int = (int*)(address);
338: int old = *address_as_int, assumed;
339: do {
340: assumed = old;
341: old = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
342: } while (assumed != old);
343: return __int_as_float(old);
344: }
345: #endif
347: /* As of ROCm 3.10, llint atomicMin/Max(llint*, llint) is not supported */
348: __device__ static llint atomicMin(llint* address,llint val)
349: {
350: ullint *address_as_ull = (ullint*)(address);
351: ullint old = *address_as_ull, assumed;
352: do {
353: assumed = old;
354: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val,(llint)assumed)));
355: } while (assumed != old);
356: return (llint)old;
357: }
359: __device__ static llint atomicMax(llint* address,llint val)
360: {
361: ullint *address_as_ull = (ullint*)(address);
362: ullint old = *address_as_ull, assumed;
363: do {
364: assumed = old;
365: old = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val,(llint)assumed)));
366: } while (assumed != old);
367: return (llint)old;
368: }
370: template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
371: template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};
373: /*
374: Atomic bitwise operations
375: As of ROCm 3.10, the llint atomicAnd/Or/Xor(llint*, llint) is not supported
376: */
378: __device__ static llint atomicAnd(llint* address,llint val)
379: {
380: ullint *address_as_ull = (ullint*)(address);
381: ullint old = *address_as_ull, assumed;
382: do {
383: assumed = old;
384: old = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
385: } while (assumed != old);
386: return (llint)old;
387: }
388: __device__ static llint atomicOr(llint* address,llint val)
389: {
390: ullint *address_as_ull = (ullint*)(address);
391: ullint old = *address_as_ull, assumed;
392: do {
393: assumed = old;
394: old = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
395: } while (assumed != old);
396: return (llint)old;
397: }
399: __device__ static llint atomicXor(llint* address,llint val)
400: {
401: ullint *address_as_ull = (ullint*)(address);
402: ullint old = *address_as_ull, assumed;
403: do {
404: assumed = old;
405: old = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
406: } while (assumed != old);
407: return (llint)old;
408: }
410: template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
411: template<typename Type> struct AtomicBOR {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
412: template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};
414: /*
415: Atomic logical operations:
417: HIP, like CUDA, has no atomic logical operations at all. We support them on integer types.
418: */
420: /* A template without a definition makes any instantiation that does not use one of the given specializations a compile-time error,
421: which is what we want, since we only support 32-bit and 64-bit integers.
422: */
423: template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
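/* A standalone illustration of this idiom, with hypothetical names (kept under #if 0 so it is not
   compiled): the primary template is declared but never defined, so only types having one of the
   provided specializations can be instantiated; anything else fails at compile time. */
#if 0
template<typename T,int size> struct OnlyWidth;                         /* declared, never defined  */
template<typename T> struct OnlyWidth<T,4> {static const int ok = 1;};  /* 32-bit integers          */
template<typename T> struct OnlyWidth<T,8> {static const int ok = 1;};  /* 64-bit integers          */
static const int use32 = OnlyWidth<int,sizeof(int)>::ok;                /* fine: sizeof(int) == 4   */
static const int use64 = OnlyWidth<llint,sizeof(llint)>::ok;            /* fine: sizeof(llint) == 8 */
/* OnlyWidth<short,sizeof(short)>::ok would be a compile-time error: no specialization for size 2 */
#endif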
425: template<typename Type,class Op>
426: struct AtomicLogical<Type,Op,4> {
427: __device__ Type operator()(Type& x,Type y) const {
428: int *address_as_int = (int*)(&x);
429: int old = *address_as_int, assumed;
430: Op op;
431: do {
432: assumed = old;
433: old = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
434: } while (assumed != old);
435: return (Type)old;
436: }
437: };
439: template<typename Type,class Op>
440: struct AtomicLogical<Type,Op,8> {
441: __device__ Type operator()(Type& x,Type y) const {
442: ullint *address_as_ull = (ullint*)(&x);
443: ullint old = *address_as_ull, assumed;
444: Op op;
445: do {
446: assumed = old;
447: old = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed,y)));
448: } while (assumed != old);
449: return (Type)old;
450: }
451: };
453: /* Note land/lor/lxor below are different from LAND etc above. Here we pass arguments by value and return result of ops (not old value) */
454: template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
455: template<typename Type> struct lor {__device__ Type operator()(Type x, Type y) {return x || y;}};
456: template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};
458: template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
459: template<typename Type> struct AtomicLOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
460: template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};
462: /*====================================================================================*/
463: /* Wrapper functions of hip kernels. Function pointers are stored in 'link' */
464: /*====================================================================================*/
465: template<typename Type,PetscInt BS,PetscInt EQ>
466: static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
467: {
468: hipError_t err;
469: PetscInt nthreads=256;
470: PetscInt nblocks=(count+nthreads-1)/nthreads;
471: const PetscInt *iarray=opt ? opt->array : NULL;
474: if (!count) return(0);
475: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
476: hipLaunchKernelGGL(HIP_KERNEL_NAME(d_Pack<Type,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
477: err = hipGetLastError();CHKERRHIP(err);
478: return(0);
479: }
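/* A small sketch of the launch-size arithmetic used by this and the following wrappers, with an
   assumed (not queried) GPU size; the real cap comes from hipGetDeviceProperties at the bottom of
   this file (kept under #if 0 so it is not compiled). */
#if 0
static PetscInt ExampleLaunchBlocks(PetscInt count,PetscInt maxResidentThreadsPerGPU)
{
  PetscInt nthreads = 256;
  PetscInt nblocks  = (count+nthreads-1)/nthreads;             /* blocks needed to cover all indices  */
  return PetscMin(nblocks,maxResidentThreadsPerGPU/nthreads);  /* but never more than can be resident */
}
/* ExampleLaunchBlocks(10000000,245760) == 960: only 960*256 = 245760 threads are launched, and the
   grid-stride loop in d_Pack (tid += gridDim.x*blockDim.x) sweeps the remaining indices. */
#endif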
481: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
482: static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
483: {
484: hipError_t cerr;
485: PetscInt nthreads=256;
486: PetscInt nblocks=(count+nthreads-1)/nthreads;
487: const PetscInt *iarray=opt ? opt->array : NULL;
490: if (!count) return(0);
491: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
492: hipLaunchKernelGGL(HIP_KERNEL_NAME(d_UnpackAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
493: cerr = hipGetLastError();CHKERRHIP(cerr);
494: return(0);
495: }
497: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
498: static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
499: {
500: hipError_t cerr;
501: PetscInt nthreads=256;
502: PetscInt nblocks=(count+nthreads-1)/nthreads;
503: const PetscInt *iarray=opt ? opt->array : NULL;
506: if (!count) return(0);
507: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
508: hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
509: cerr = hipGetLastError();CHKERRHIP(cerr);
510: return(0);
511: }
513: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
514: static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
515: {
516: hipError_t cerr;
517: PetscInt nthreads=256;
518: PetscInt nblocks=(count+nthreads-1)/nthreads;
519: PetscInt srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;
522: if (!count) return(0);
523: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
525: /* The 3D shape of the source subdomain may differ from that of the destination, which makes it hard to use a single 3D grid and block */
526: if (srcOpt) {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
527: else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}
529: if (dstOpt) {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
530: else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}
532: hipLaunchKernelGGL(HIP_KERNEL_NAME(d_ScatterAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
533: cerr = hipGetLastError();CHKERRHIP(cerr);
534: return(0);
535: }
537: /* Specialization for Insert since we may use hipMemcpyAsync */
538: template<typename Type,PetscInt BS,PetscInt EQ>
539: static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
540: {
541: PetscErrorCode ierr;
542: hipError_t cerr;
545: if (!count) return(0);
546: /* src and dst are contiguous */
547: if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
548: cerr = hipMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,hipMemcpyDeviceToDevice,link->stream);CHKERRHIP(cerr);
549: } else {
550: ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
551: }
552: return(0);
553: }
555: template<typename Type,class Op,PetscInt BS,PetscInt EQ>
556: static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
557: {
558: hipError_t cerr;
559: PetscInt nthreads=256;
560: PetscInt nblocks=(count+nthreads-1)/nthreads;
561: const PetscInt *rarray = rootopt ? rootopt->array : NULL;
562: const PetscInt *larray = leafopt ? leafopt->array : NULL;
565: if (!count) return(0);
566: nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
567: hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOpLocal<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
568: cerr = hipGetLastError();CHKERRHIP(cerr);
569: return(0);
570: }
572: /*====================================================================================*/
573: /* Init various types and instantiate pack/unpack function pointers */
574: /*====================================================================================*/
575: template<typename Type,PetscInt BS,PetscInt EQ>
576: static void PackInit_RealType(PetscSFLink link)
577: {
578: /* Pack/unpack for remote communication */
579: link->d_Pack = Pack<Type,BS,EQ>;
580: link->d_UnpackAndInsert = UnpackAndOp <Type,Insert<Type> ,BS,EQ>;
581: link->d_UnpackAndAdd = UnpackAndOp <Type,Add<Type> ,BS,EQ>;
582: link->d_UnpackAndMult = UnpackAndOp <Type,Mult<Type> ,BS,EQ>;
583: link->d_UnpackAndMin = UnpackAndOp <Type,Min<Type> ,BS,EQ>;
584: link->d_UnpackAndMax = UnpackAndOp <Type,Max<Type> ,BS,EQ>;
585: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
587: /* Scatter for local communication */
588: link->d_ScatterAndInsert = ScatterAndInsert<Type ,BS,EQ>; /* Has special optimizations */
589: link->d_ScatterAndAdd = ScatterAndOp <Type,Add<Type> ,BS,EQ>;
590: link->d_ScatterAndMult = ScatterAndOp <Type,Mult<Type> ,BS,EQ>;
591: link->d_ScatterAndMin = ScatterAndOp <Type,Min<Type> ,BS,EQ>;
592: link->d_ScatterAndMax = ScatterAndOp <Type,Max<Type> ,BS,EQ>;
593: link->d_FetchAndAddLocal = FetchAndOpLocal <Type,Add <Type> ,BS,EQ>;
595: /* Atomic versions when there are data-race possibilities */
596: link->da_UnpackAndInsert = UnpackAndOp <Type,AtomicInsert<Type>,BS,EQ>;
597: link->da_UnpackAndAdd = UnpackAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
598: link->da_UnpackAndMult = UnpackAndOp <Type,AtomicMult<Type> ,BS,EQ>;
599: link->da_UnpackAndMin = UnpackAndOp <Type,AtomicMin<Type> ,BS,EQ>;
600: link->da_UnpackAndMax = UnpackAndOp <Type,AtomicMax<Type> ,BS,EQ>;
601: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
603: link->da_ScatterAndInsert = ScatterAndOp <Type,AtomicInsert<Type>,BS,EQ>;
604: link->da_ScatterAndAdd = ScatterAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
605: link->da_ScatterAndMult = ScatterAndOp <Type,AtomicMult<Type> ,BS,EQ>;
606: link->da_ScatterAndMin = ScatterAndOp <Type,AtomicMin<Type> ,BS,EQ>;
607: link->da_ScatterAndMax = ScatterAndOp <Type,AtomicMax<Type> ,BS,EQ>;
608: link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type> ,BS,EQ>;
609: }
611: /* Have this templated class to specialize for char integers */
612: template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
613: struct PackInit_IntegerType_Atomic {
614: static void Init(PetscSFLink link)
615: {
616: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
617: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
618: link->da_UnpackAndMult = UnpackAndOp<Type,AtomicMult<Type> ,BS,EQ>;
619: link->da_UnpackAndMin = UnpackAndOp<Type,AtomicMin<Type> ,BS,EQ>;
620: link->da_UnpackAndMax = UnpackAndOp<Type,AtomicMax<Type> ,BS,EQ>;
621: link->da_UnpackAndLAND = UnpackAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
622: link->da_UnpackAndLOR = UnpackAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
623: link->da_UnpackAndLXOR = UnpackAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
624: link->da_UnpackAndBAND = UnpackAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
625: link->da_UnpackAndBOR = UnpackAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
626: link->da_UnpackAndBXOR = UnpackAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
627: link->da_FetchAndAdd = FetchAndOp <Type,AtomicAdd<Type> ,BS,EQ>;
629: link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
630: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type> ,BS,EQ>;
631: link->da_ScatterAndMult = ScatterAndOp<Type,AtomicMult<Type> ,BS,EQ>;
632: link->da_ScatterAndMin = ScatterAndOp<Type,AtomicMin<Type> ,BS,EQ>;
633: link->da_ScatterAndMax = ScatterAndOp<Type,AtomicMax<Type> ,BS,EQ>;
634: link->da_ScatterAndLAND = ScatterAndOp<Type,AtomicLAND<Type> ,BS,EQ>;
635: link->da_ScatterAndLOR = ScatterAndOp<Type,AtomicLOR<Type> ,BS,EQ>;
636: link->da_ScatterAndLXOR = ScatterAndOp<Type,AtomicLXOR<Type> ,BS,EQ>;
637: link->da_ScatterAndBAND = ScatterAndOp<Type,AtomicBAND<Type> ,BS,EQ>;
638: link->da_ScatterAndBOR = ScatterAndOp<Type,AtomicBOR<Type> ,BS,EQ>;
639: link->da_ScatterAndBXOR = ScatterAndOp<Type,AtomicBXOR<Type> ,BS,EQ>;
640: link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
641: }
642: };
644: /* See the CUDA version; for 1-byte integer types the atomic function pointers are left NULL */
645: template<typename Type,PetscInt BS,PetscInt EQ>
646: struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
647: static void Init(PetscSFLink link) {/* Nothing to do; leave the atomic function pointers NULL */}
648: };
650: template<typename Type,PetscInt BS,PetscInt EQ>
651: static void PackInit_IntegerType(PetscSFLink link)
652: {
653: link->d_Pack = Pack<Type,BS,EQ>;
654: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
655: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
656: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
657: link->d_UnpackAndMin = UnpackAndOp<Type,Min<Type> ,BS,EQ>;
658: link->d_UnpackAndMax = UnpackAndOp<Type,Max<Type> ,BS,EQ>;
659: link->d_UnpackAndLAND = UnpackAndOp<Type,LAND<Type> ,BS,EQ>;
660: link->d_UnpackAndLOR = UnpackAndOp<Type,LOR<Type> ,BS,EQ>;
661: link->d_UnpackAndLXOR = UnpackAndOp<Type,LXOR<Type> ,BS,EQ>;
662: link->d_UnpackAndBAND = UnpackAndOp<Type,BAND<Type> ,BS,EQ>;
663: link->d_UnpackAndBOR = UnpackAndOp<Type,BOR<Type> ,BS,EQ>;
664: link->d_UnpackAndBXOR = UnpackAndOp<Type,BXOR<Type> ,BS,EQ>;
665: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
667: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
668: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
669: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
670: link->d_ScatterAndMin = ScatterAndOp<Type,Min<Type> ,BS,EQ>;
671: link->d_ScatterAndMax = ScatterAndOp<Type,Max<Type> ,BS,EQ>;
672: link->d_ScatterAndLAND = ScatterAndOp<Type,LAND<Type> ,BS,EQ>;
673: link->d_ScatterAndLOR = ScatterAndOp<Type,LOR<Type> ,BS,EQ>;
674: link->d_ScatterAndLXOR = ScatterAndOp<Type,LXOR<Type> ,BS,EQ>;
675: link->d_ScatterAndBAND = ScatterAndOp<Type,BAND<Type> ,BS,EQ>;
676: link->d_ScatterAndBOR = ScatterAndOp<Type,BOR<Type> ,BS,EQ>;
677: link->d_ScatterAndBXOR = ScatterAndOp<Type,BXOR<Type> ,BS,EQ>;
678: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
679: PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
680: }
682: #if defined(PETSC_HAVE_COMPLEX)
683: template<typename Type,PetscInt BS,PetscInt EQ>
684: static void PackInit_ComplexType(PetscSFLink link)
685: {
686: link->d_Pack = Pack<Type,BS,EQ>;
687: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
688: link->d_UnpackAndAdd = UnpackAndOp<Type,Add<Type> ,BS,EQ>;
689: link->d_UnpackAndMult = UnpackAndOp<Type,Mult<Type> ,BS,EQ>;
690: link->d_FetchAndAdd = FetchAndOp <Type,Add<Type> ,BS,EQ>;
692: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
693: link->d_ScatterAndAdd = ScatterAndOp<Type,Add<Type> ,BS,EQ>;
694: link->d_ScatterAndMult = ScatterAndOp<Type,Mult<Type> ,BS,EQ>;
695: link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
697: link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
698: link->da_UnpackAndAdd = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
699: link->da_UnpackAndMult = NULL; /* Not implemented yet */
700: link->da_FetchAndAdd = NULL; /* Return value of atomicAdd on complex is not atomic */
702: link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
703: link->da_ScatterAndAdd = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
704: }
705: #endif
707: typedef signed char SignedChar;
708: typedef unsigned char UnsignedChar;
709: typedef struct {int a; int b; } PairInt;
710: typedef struct {PetscInt a; PetscInt b;} PairPetscInt;
712: template<typename Type>
713: static void PackInit_PairType(PetscSFLink link)
714: {
715: link->d_Pack = Pack<Type,1,1>;
716: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
717: link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
718: link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;
720: link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
721: link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
722: link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
723: /* Atomics for pair types are not implemented yet */
724: }
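/* A tiny host-side illustration of the MPI MINLOC/MAXLOC convention the Minloc/Maxloc functors
   follow, using the PairInt defined above (kept under #if 0 so it is not compiled): the value
   field a is compared, and on ties the smaller location index b wins. */
#if 0
static void MaxlocExample(void)
{
  PairInt x = {3,7},y = {3,2};                        /* same value a, different locations b */
  if (y.a > x.a) x = y;
  else if (y.a == x.a) x.b = (x.b < y.b ? x.b : y.b); /* same update as the Maxloc functor   */
  /* x is now {3,2}: per MPI_MAXLOC, a tie in the value keeps the smaller location index */
}
#endif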
726: template<typename Type,PetscInt BS,PetscInt EQ>
727: static void PackInit_DumbType(PetscSFLink link)
728: {
729: link->d_Pack = Pack<Type,BS,EQ>;
730: link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
731: link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
732: /* Atomics for dumb types are not implemented yet */
733: }
735: /* Some device-specific utilities */
736: static PetscErrorCode PetscSFLinkSyncDevice_HIP(PetscSFLink link)
737: {
738: hipError_t cerr;
740: cerr = hipDeviceSynchronize();CHKERRHIP(cerr);
741: return(0);
742: }
744: static PetscErrorCode PetscSFLinkSyncStream_HIP(PetscSFLink link)
745: {
746: hipError_t cerr;
748: cerr = hipStreamSynchronize(link->stream);CHKERRHIP(cerr);
749: return(0);
750: }
752: static PetscErrorCode PetscSFLinkMemcpy_HIP(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
753: {
755: enum hipMemcpyKind kinds[2][2] = {{hipMemcpyHostToHost,hipMemcpyHostToDevice},{hipMemcpyDeviceToHost,hipMemcpyDeviceToDevice}};
757: if (n) {
758: if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Separate HostToHost so that pure-cpu code won't call hip runtime */
759: PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
760: } else {
761: int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
762: int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
763: hipError_t cerr = hipMemcpyAsync(dst,src,n,kinds[stype][dtype],link->stream);CHKERRHIP(cerr);
764: }
765: }
766: return(0);
767: }
769: PetscErrorCode PetscSFMalloc_HIP(PetscMemType mtype,size_t size,void** ptr)
770: {
772: if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
773: else if (PetscMemTypeDevice(mtype)) {
774: if (!PetscHIPInitialized) {PetscErrorCode ierr = PetscHIPInitializeCheck();CHKERRQ(ierr);}
775: hipError_t err = hipMalloc(ptr,size);CHKERRHIP(err);
776: } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d", (int)mtype);
777: return(0);
778: }
780: PetscErrorCode PetscSFFree_HIP(PetscMemType mtype,void* ptr)
781: {
783: if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
784: else if (PetscMemTypeDevice(mtype)) {hipError_t err = hipFree(ptr);CHKERRHIP(err);}
785: else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
786: return(0);
787: }
789: /* Destructor when the link uses MPI for communication on HIP device */
790: static PetscErrorCode PetscSFLinkDestroy_MPI_HIP(PetscSF sf,PetscSFLink link)
791: {
792: hipError_t cerr;
795: for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
796: cerr = hipFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
797: cerr = hipFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
798: }
799: return(0);
800: }
802: /*====================================================================================*/
803: /* Main driver to init MPI datatype on device */
804: /*====================================================================================*/
806: /* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
807: PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
808: {
810: hipError_t cerr;
811: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
812: PetscBool is2Int,is2PetscInt;
813: #if defined(PETSC_HAVE_COMPLEX)
814: PetscInt nPetscComplex=0;
815: #endif
818: if (link->deviceinited) return(0);
819: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
820: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
821: /* MPI_CHAR is treated below as a dumb type that does not support reduction, per the MPI standard */
822: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
823: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
824: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
825: #if defined(PETSC_HAVE_COMPLEX)
826: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
827: #endif
828: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
829: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
831: if (is2Int) {
832: PackInit_PairType<PairInt>(link);
833: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
834: PackInit_PairType<PairPetscInt>(link);
835: } else if (nPetscReal) {
836: if (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
837: else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
838: else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
839: else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
840: } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
841: if (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
842: else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
843: else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
844: else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<llint,1,0>(link);
845: } else if (nInt) {
846: if (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
847: else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
848: else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
849: else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
850: } else if (nSignedChar) {
851: if (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
852: else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
853: else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
854: else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
855: } else if (nUnsignedChar) {
856: if (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
857: else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
858: else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
859: else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
860: #if defined(PETSC_HAVE_COMPLEX)
861: } else if (nPetscComplex) {
862: if (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
863: else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
864: else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
865: else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
866: #endif
867: } else {
868: MPI_Aint lb,nbyte;
869: MPI_Type_get_extent(unit,&lb,&nbyte);
870: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
871: if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
872: if (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
873: else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
874: else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
875: } else {
876: nInt = nbyte / sizeof(int);
877: if (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
878: else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
879: else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
880: else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
881: }
882: }
884: if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
885: int device;
886: struct hipDeviceProp_t props;
887: cerr = hipGetDevice(&device);CHKERRHIP(cerr);
888: cerr = hipGetDeviceProperties(&props,device);CHKERRHIP(cerr);
889: sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
890: }
891: link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;
893: link->stream = PetscDefaultHipStream;
894: link->Destroy = PetscSFLinkDestroy_MPI_HIP;
895: link->SyncDevice = PetscSFLinkSyncDevice_HIP;
896: link->SyncStream = PetscSFLinkSyncStream_HIP;
897: link->Memcpy = PetscSFLinkMemcpy_HIP;
898: link->deviceinited = PETSC_TRUE;
899: return(0);
900: }
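/* A sketch of how the dispatch above behaves for one hypothetical user datatype (kept under #if 0
   so it is not compiled): a contiguous unit of 16 PetscReals gives nPetscReal = 16, so the ladder
   takes the nPetscReal%8 == 0 branch and calls PackInit_RealType<PetscReal,8,0>(link), i.e. bs=16,
   BS=8, EQ=0 in the kernels at the top of this file. */
#if 0
static void ExampleSetUp(PetscSF sf,PetscSFLink link)
{
  MPI_Datatype unit;
  MPI_Type_contiguous(16,MPIU_REAL,&unit);
  MPI_Type_commit(&unit);
  PetscSFLinkSetUp_HIP(sf,link,unit); /* ends up in PackInit_RealType<PetscReal,8,0>(link) */
  MPI_Type_free(&unit);
}
#endif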