Actual source code: aijcusparseband.cu
1: /*
2: AIJCUSPARSE methods implemented with CUDA kernels. Uses cuSPARSE/Thrust maps from AIJCUSPARSE
3: */
4: #define PETSC_SKIP_SPINLOCK
5: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7: #include <petscconf.h>
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <../src/mat/impls/sbaij/seq/sbaij.h>
10: #undef VecType
11: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
12: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
13: #include <cooperative_groups.h>
14: #endif
16: #define CHECK_LAUNCH_ERROR() \
17: do { \
18: /* Check synchronous errors, i.e. pre-launch */ \
19: cudaError_t err = cudaGetLastError(); \
20: if (cudaSuccess != err) { \
21: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
22: } \
23: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
24: err = cudaDeviceSynchronize(); \
25: if (cudaSuccess != err) { \
26: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
27: } \
28: } while (0)
30: /*
31: LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
33: requires:
34:   structurally symmetric (could be fixed with transpose/column metadata)
35: */
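/*
  Typical use, a sketch not exercised in this file: select this factorization with the
  options mentioned in the comment above,

    -pc_type lu -pc_factor_mat_solver_type cusparseband -pc_factor_mat_ordering_type rcm -mat_no_inode

  or programmatically, assuming a KSP "ksp" whose operator is a square MATSEQAIJCUSPARSE matrix:

    PC pc;
    KSPGetPC(ksp,&pc);
    PCSetType(pc,PCLU);
    PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSEBAND);

  The matrix size n must be divisible by the number of fields Nf (checked below).
*/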
37: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
38: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
40: /*
41:   The GPU band LU factorization kernels
42: */
43: __global__
44: void __launch_bounds__(1024,1)
45: mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
46: {
47: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
48: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
49: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
51: // set i (row+1)
52: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0; // dummy at zero
53: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
54: if (rowb < end_i && threadIdx.x==0) {
55: PetscInt i=rowb+1, ni = (rowb>bw) ? bw+1 : i, n1L = ni*(ni-1)/2, nug= i*bw, n2L = bw*((rowb>bw) ? (rowb-bw) : 0), mi = bw + rowb + 1 - n, clip = (mi>0) ? mi*(mi-1)/2 + mi: 0;
56: bi_csr[rowb+1] = n1L + nug - clip + n2L + i;
57: }
58: }
59: }
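/*
  Note on the closed form above: row r of the band holds columns max(0,r-bw)..min(n-1,r+bw),
  so bi_csr[r+1] is the number of stored entries in rows 0..r. In the expression, n1L + n2L
  count the strictly-lower entries, i counts the diagonals, and nug - clip counts the upper
  entries (clip removes the bottom-right corner triangle that falls outside the matrix).
  Writing the prefix sum in closed form lets every thread set its entry of bi_csr
  independently, with no scan or synchronization.
*/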
60: // copy AIJ to AIJ_BAND
61: __global__
62: void __launch_bounds__(1024,1)
63: mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
64: const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
65: const int bi_csr[], PetscScalar ba_csr[])
66: {
67: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
68: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
69: const PetscInt nloc_i = (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);
71: // zero B
72: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
73: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
74: if (rowb < end_i) {
75: PetscScalar *batmp = ba_csr + bi_csr[rowb];
76: const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
77: for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) {
78: if (j<nzb) {
79: batmp[j] = 0;
80: }
81: }
82: }
83: }
85: // copy A into B with CSR format -- these two loops can be fused
86: for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
87: if (rowb < end_i) {
88: const PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
89: const int *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0;
90: const PetscScalar *av = aa_d + ai_d[rowa];
91: PetscScalar *batmp = ba_csr + bi_csr[rowb];
92: /* load in initial (unfactored row) */
93: for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
94: if (j<nza) {
95: PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart;
96: PetscScalar vala = av[j];
97: batmp[idx] = vala;
98: }
99: }
100: }
101: }
102: }
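/*
  Layout produced above: row r of the band factor occupies ba_csr[bi_csr[r]..bi_csr[r+1]-1]
  and stores columns max(0,r-bw)..min(n-1,r+bw) contiguously, so column c of row r sits at
  offset c - max(0,r-bw) (the bjStart used above). The single extra slot ba_csr[bi_csr[n]],
  allocated by the symbolic factorization, is zeroed here as well.
*/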
103: // print AIJ_BAND
104: __global__
105: void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
106: {
107: // debug
108: if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) {
109: printf("B (AIJ) n=%d:\n",(int)n);
110: for (int rowb=0;rowb<n;rowb++) {
111: const PetscInt nz = bi_csr[rowb+1] - bi_csr[rowb];
112: const PetscScalar *batmp = ba_csr + bi_csr[rowb];
113: for (int j=0; j<nz; j++) printf("(%13.6e) ",PetscRealPart(batmp[j]));
114: printf(" bi=%d\n",bi_csr[rowb+1]);
115: }
116: }
117: }
118: // Band LU kernel --- ba_csr bi_csr
119: __global__
120: void __launch_bounds__(1024,1)
121: mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[], int *use_group_sync)
122: {
123: const PetscInt Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
124: const PetscInt field = blockIdx.x, blkIdx = blockIdx.y;
125: const PetscInt start = field*nloc, end = start + nloc;
126: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
127: auto g = cooperative_groups::this_grid();
128: #endif
129: // A22 panel update for each row A(1,:) and col A(:,1)
130: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
131: PetscInt tnzUd = bw, maxU = end-1 - glbDD; // limit the U update to this field (chop off the inter-field 'ears')
132: const PetscInt nzUd = (tnzUd>maxU) ? maxU : tnzUd, dOffset = (glbDD > bw) ? bw : glbDD; // offset of the diagonal within its stored band row (the first bw rows are shortened by the leading 'ear')
133: PetscScalar *pBdd = ba_csr + bi_csr[glbDD] + dOffset;
134: const PetscScalar *baUd = pBdd + 1; // vector of data U(i,i+1:end)
135: const PetscScalar Bdd = *pBdd;
136: const PetscInt offset = blkIdx*blockDim.y + threadIdx.y, inc = Nblk*blockDim.y;
137: if (threadIdx.x==0) {
138: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) { /* assuming symmetric structure */
139: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
140: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
141: *Aid = *Aid/Bdd;
142: }
143: }
144: __syncthreads(); // wait for the column scaling done by the threadIdx.x==0 threads before the trailing update
145: for (int idx = offset, myi = glbDD + offset + 1; idx < nzUd; idx += inc, myi += inc) {
146: const PetscInt bwi = myi > bw ? bw : myi, kIdx = bwi - (myi-glbDD); // cuts off just the first (global) block
147: PetscScalar *Aid = ba_csr + bi_csr[myi] + kIdx;
148: PetscScalar *Aij = Aid + 1;
149: const PetscScalar Lid = *Aid;
150: for (int jIdx=threadIdx.x ; jIdx<nzUd; jIdx += blockDim.x) {
151: Aij[jIdx] -= Lid*baUd[jIdx];
152: }
153: }
154: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
155: if (use_group_sync) {
156: g.sync();
157: } else {
158: __syncthreads();
159: }
160: #else
161: __syncthreads();
162: #endif
163: } /* end of for (glbDD=start; glbDD<end; glbDD++) */
164: }
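/*
  The kernel above is an in-place, right-looking band LU with no pivoting. For each diagonal
  glbDD of a field, the threadIdx.x==0 threads first scale the column below the diagonal
  (forming L), then the whole block applies the rank-1 trailing update
  A(i,j) -= L(i,glbDD)*U(glbDD,j), restricted to the band. When several blocks share a field
  (use_group_sync != NULL) each step ends with a grid-wide cooperative-groups sync; otherwise
  a single block owns the field and __syncthreads() is enough.
*/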
166: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
167: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
168: {
169: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
170: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
171: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
172: Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
173: Mat_SeqAIJCUSPARSEMultStruct *matstructA;
174: CsrMatrix *matrixA;
175: PetscErrorCode ierr;
176: cudaError_t cerr;
177: const PetscInt n=A->rmap->n, *ic, *r;
178: const int *ai_d, *aj_d;
179: const PetscScalar *aa_d;
180: PetscScalar *ba_t = cusparseTriFactors->a_band_d;
181: int *bi_t = cusparseTriFactors->i_band_d;
182: PetscContainer container;
183: int Ni = 10, team_size=9, Nf, nVec=56, nconcurrent = 1, nsm = -1;
186: if (A->rmap->n == 0) {
187: return(0);
188: }
189: // cusparse setup
190: if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
191: matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; // matstruct->cprowIndices
192: if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
193: matrixA = (CsrMatrix*)matstructA->mat;
194: if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
196: // factor: get Nf if available
197: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
198: if (container) {
199: PetscInt *pNf=NULL;
200: PetscContainerGetPointer(container, (void **) &pNf);
201: Nf = (*pNf)%1000;
202: if ((*pNf)/1000>0) nconcurrent = (*pNf)/1000; // number of SMs to use
203: } else Nf = 1;
204: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
206: // get data
207: ic = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
208: ai_d = thrust::raw_pointer_cast(matrixA->row_offsets->data());
209: aj_d = thrust::raw_pointer_cast(matrixA->column_indices->data());
210: aa_d = thrust::raw_pointer_cast(matrixA->values->data().get());
211: r = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());
213: cerr = WaitForCUDA();CHKERRCUDA(cerr);
214: PetscLogGpuTimeBegin();
215: {
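  /* The band factor stores nz = n + (2*n-1)*bw - bw*bw entries (see the symbolic routine below),
     so the bandwidth is recovered from b->nz with the quadratic formula:
     bw = ((2n-1) - sqrt((2n-1)^2 - 4(nz-n)))/2 = (2n-1 - sqrt(1 + 4(n*n - nz)))/2 */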
216: int bw = (int)(2.*(double)n-1. - (double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)b->nz))+PETSC_MACHINE_EPSILON))/2, bm1=bw-1,nl=n/Nf;
217: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
218: Ni = 1/nconcurrent;
219: Ni = 1;
220: #else
221: if (!cusparseTriFactors->init_dev_prop) {
222: int gpuid;
223: cusparseTriFactors->init_dev_prop = PETSC_TRUE;
224: cudaGetDevice(&gpuid);
225: cudaGetDeviceProperties(&cusparseTriFactors->dev_prop, gpuid);
226: }
227: nsm = cusparseTriFactors->dev_prop.multiProcessorCount;
228: Ni = nsm/Nf/nconcurrent;
229: #endif
230: team_size = bw/Ni + !!(bw%Ni);
231: nVec = PetscMin(bw, 1024/team_size);
232: PetscInfo7(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurrency = %d, num fields = %d, numSMs/GPU = %d, thread group size = %d,%d\n",bw,Ni,nconcurrent,Nf,nsm,team_size,nVec);
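  /* Launch geometry: the grid is Nf fields (blockIdx.x) by Ni blocks per field (blockIdx.y);
     within a block, threadIdx.y walks the rows of the panel (team_size of them) and
     threadIdx.x strides across the up-to-nVec entries of each band row. */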
233: {
234: dim3 dimBlockTeam(nVec,team_size);
235: dim3 dimBlockLeague(Nf,Ni);
236: mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
237: CHECK_LAUNCH_ERROR(); // does a sync
238: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
239: if (Ni > 1) {
240: void *kernelArgs[] = { (void*)&n, (void*)&bw, (void*)&bi_t, (void*)&ba_t, (void*)&nsm };
241: cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, 0, NULL);
242: } else {
243: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
244: }
245: #else
246: mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam>>>(n, bw, bi_t, ba_t, NULL);
247: #endif
248: CHECK_LAUNCH_ERROR(); // does a sync
249: #if defined(PETSC_USE_LOG)
250: PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(PetscLogDouble)(2*bm1 + 1)/3 + (PetscLogDouble)2*(nl-bw)*bw*bw + (PetscLogDouble)nl*(nl+1)/2));
251: #endif
252: }
253: }
254: PetscLogGpuTimeEnd();
255: /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
256: B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
257: B->ops->solvetranspose = NULL; // need transpose
258: B->ops->matsolve = NULL;
259: B->ops->matsolvetranspose = NULL;
260: return(0);
261: }
263: static PetscErrorCode MatrixNfDestroy(void *ptr)
264: {
265: PetscInt *nf = (PetscInt *)ptr;
266: PetscErrorCode ierr;
268: PetscFree(nf);
269: return(0);
270: }
272: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
273: {
274: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b;
275: IS isicol;
276: PetscErrorCode ierr;
277: cudaError_t cerr;
278: const PetscInt *ic,*ai=a->i,*aj=a->j;
279: PetscScalar *ba_t;
280: int *bi_t;
281: PetscInt i,n=A->rmap->n,Nf;
282: PetscInt nzBcsr,bwL,bwU;
283: PetscBool missing;
284: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
285: PetscContainer container;
288: if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
289: MatMissingDiagonal(A,&missing,&i);
290: if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
291: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
292: MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);
293: if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structurally symmetric matrices are supported");
295: // factor: get Nf if available
296: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
297: if (container) {
298: PetscInt *pNf=NULL;
299: PetscContainerGetPointer(container, (void **) &pNf);
300: Nf = (*pNf)%1000;
301: PetscContainerCreate(PETSC_COMM_SELF, &container);
302: PetscMalloc(sizeof(PetscInt), &pNf);
303: *pNf = Nf;
304: PetscContainerSetPointer(container, (void *)pNf);
305: PetscContainerSetUserDestroy(container, MatrixNfDestroy);
306: PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);
307: PetscContainerDestroy(&container);
308: } else Nf = 1;
309: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
311: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
312: ISGetIndices(isicol,&ic);
314: MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);
315: PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);
316: b = (Mat_SeqAIJ*)(B)->data;
318: /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
319: bwL = bwU = 0;
320: for (int rwb=0; rwb<n; rwb++) {
321: const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
322: for (int j=0;j<anz;j++) {
323: PetscInt colb = ic[ajtmp[j]];
324: if (colb<rwa) { // L
325: if (rwa-colb > bwL) bwL = rwa-colb;
326: } else {
327: if (colb-rwa > bwU) bwU = colb-rwa;
328: }
329: }
330: }
331: ISRestoreIndices(isicol,&ic);
332: /* only structurally symmetric matrices are supported for now; unequal bandwidths might work but are rejected */
333: if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
334: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
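  /* Size of the band factor: a full band would hold (2*bw+1)*n entries; the two corner
     triangles ("ears") of bw*(bw+1)/2 entries each are never stored, giving
     (2*bw+1)*n - bw*(bw+1) = n + (2*n-1)*bw - bw*bw, with bw = bwU = bwL here. */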
335: nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
336: b->maxnz = b->nz = nzBcsr;
337: cusparseTriFactors->nnz = b->nz; // only metadata needed: n & nz
338: PetscInfo2(A,"Matrix Bandwidth = %D, nnz = %D\n",bwL,b->nz);
339: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
340: cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // include a place for flops
341: cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
342: cusparseTriFactors->a_band_d = ba_t;
343: cusparseTriFactors->i_band_d = bi_t;
344: /* In b structure: Free imax, ilen, old a, old j. Allocate solve_work, new a, new j */
345: PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));
346: {
347: dim3 dimBlockTeam(1,128);
348: dim3 dimBlockLeague(Nf,1);
349: mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
350: }
351: CHECK_LAUNCH_ERROR(); // does a sync
353: // setup data
354: if (!cusparseTriFactors->rpermIndices) {
355: const PetscInt *r;
357: ISGetIndices(isrow,&r);
358: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
359: cusparseTriFactors->rpermIndices->assign(r, r+n);
360: ISRestoreIndices(isrow,&r);
361: PetscLogCpuToGpu(n*sizeof(PetscInt));
362: }
363: /* column permutation indices */
364: if (!cusparseTriFactors->cpermIndices) {
365: const PetscInt *c;
367: ISGetIndices(isicol,&c);
368: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
369: cusparseTriFactors->cpermIndices->assign(c, c+n);
370: ISRestoreIndices(isicol,&c);
371: PetscLogCpuToGpu(n*sizeof(PetscInt));
372: }
374: /* put together the new matrix */
375: b->free_a = PETSC_FALSE;
376: b->free_ij = PETSC_FALSE;
377: b->singlemalloc = PETSC_FALSE;
378: b->ilen = NULL;
379: b->imax = NULL;
380: b->row = isrow;
381: b->col = iscol;
382: PetscObjectReference((PetscObject)isrow);
383: PetscObjectReference((PetscObject)iscol);
384: b->icol = isicol;
385: PetscMalloc1(n+1,&b->solve_work);
387: B->factortype = MAT_FACTOR_LU;
388: B->info.factor_mallocs = 0;
389: B->info.fill_ratio_given = 0;
391: if (ai[n]) {
392: B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
393: } else {
394: B->info.fill_ratio_needed = 0.0;
395: }
396: #if defined(PETSC_USE_INFO)
397: if (ai[n] != 0) {
398: PetscReal af = B->info.fill_ratio_needed;
399: PetscInfo1(A,"Band fill ratio %g\n",(double)af);
400: } else {
401: PetscInfo(A,"Empty matrix\n");
402: }
403: #endif
404: if (a->inode.size) {
405: PetscInfo(A,"Warning: using inodes in band solver.\n");
406: }
407: MatSeqAIJCheckInode_FactorLU(B);
408: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
409: B->offloadmask = PETSC_OFFLOAD_GPU;
411: return(0);
412: }
414: /* Use -pc_factor_mat_solver_type cusparseband */
415: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
416: {
418: *type = MATSOLVERCUSPARSEBAND;
419: return(0);
420: }
422: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
423: {
425: PetscInt n = A->rmap->n;
428: MatCreate(PetscObjectComm((PetscObject)A),B);
429: MatSetSizes(*B,n,n,n,n);
430: (*B)->factortype = ftype;
431: (*B)->canuseordering = PETSC_TRUE;
432: MatSetType(*B,MATSEQAIJCUSPARSE);
434: if (ftype == MAT_FACTOR_LU) {
435: MatSetBlockSizesFromMats(*B,A,A);
436: (*B)->ops->ilufactorsymbolic = NULL; // MatILUFactorSymbolic_SeqAIJCUSPARSE;
437: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;
438: PetscStrallocpy(MATORDERINGRCM,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
439: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
441: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
442: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);
443: return(0);
444: }
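/*
  Direct use of this factory without a PC -- a sketch, assuming a square MATSEQAIJCUSPARSE
  matrix A and vectors b, x already exist (error checking omitted):

    Mat           F;
    IS            rowperm, colperm;
    MatFactorInfo info;
    MatGetFactor(A,MATSOLVERCUSPARSEBAND,MAT_FACTOR_LU,&F);
    MatGetOrdering(A,MATORDERINGRCM,&rowperm,&colperm);
    MatFactorInfoInitialize(&info);
    MatLUFactorSymbolic(F,A,rowperm,colperm,&info);
    MatLUFactorNumeric(F,A,&info);
    MatSolve(F,b,x);
*/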
446: #define WARP_SIZE 32
447: template <typename T>
448: __forceinline__ __device__
449: T wreduce(T a)
450: {
451: T b;
452: #pragma unroll
453: for (int i = WARP_SIZE/2; i >= 1; i = i >> 1) {
454: b = __shfl_down_sync(0xffffffff, a, i);
455: a += b;
456: }
457: return a;
458: }
459: // reduce in a block, returns result in thread 0
460: template <typename T, int BLOCK_SIZE>
461: __device__
462: T breduce(T a)
463: {
464: constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
465: __shared__ double buf[NWARP];
466: int wid = threadIdx.x / WARP_SIZE;
467: int laneid = threadIdx.x % WARP_SIZE;
468: T b = wreduce<T>(a);
469: if (laneid == 0)
470: buf[wid] = b;
471: __syncthreads();
472: if (wid == 0) {
473: if (threadIdx.x < NWARP)
474: a = buf[threadIdx.x];
475: else
476: a = 0;
477: for (int i = (NWARP+1)/2; i >= 1; i = i >> 1) {
478: a += __shfl_down_sync(0xffffffff, a, i);
479: }
480: }
481: return a;
482: }
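/*
  wreduce/breduce form a standard two-level sum reduction used by the solve kernel below:
  wreduce folds one warp with __shfl_down_sync (sum valid in lane 0), breduce writes one
  partial per warp to shared memory and lets warp 0 combine them, so the block-wide sum
  lands in threadIdx.x == 0.
*/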
484: // Band triangular solve kernel --- ba_csr bi_csr
485: template <int BLOCK_SIZE>
486: __global__
487: void __launch_bounds__(256,1)
488: mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
489: {
490: const PetscInt Nf = gridDim.x, nloc = n/Nf, field = blockIdx.x, start = field*nloc, end = start + nloc, chopnz = bw*(bw+1)/2, blocknz=(2*bw+1)*nloc, blocknz_0 = blocknz-chopnz;
491: const PetscScalar *pLi;
492: const int tid = threadIdx.x;
494: /* Next, solve L (forward substitution; L has a unit diagonal) */
495: pLi = ba_csr + (field==0 ? 0 : blocknz_0 + (field-1)*blocknz + bw); // diagonal (0,0) in field
496: for (int glbDD=start, locDD = 0; glbDD<end; glbDD++, locDD++) {
497: const PetscInt col = locDD<bw ? start : (glbDD-bw);
498: PetscScalar t = 0;
499: for (int j=col+tid,idx=tid;j<glbDD;j+=blockDim.x,idx+=blockDim.x) {
500: t += pLi[idx]*x[j];
501: }
502: #if defined(PETSC_USE_COMPLEX)
503: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
504: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
505: t = tt;
506: #else
507: t = breduce<PetscReal,BLOCK_SIZE>(t);
508: #endif
509: if (threadIdx.x == 0)
510: x[glbDD] -= t; // no division needed: L has a unit diagonal
511: __syncthreads();
512: // inc
513: pLi += glbDD-col; // get to diagonal
514: if (glbDD > n-1-bw) pLi += n-1-glbDD; // skip over U, only last block has funny offset
515: else pLi += bw;
516: pLi += 1; // skip to next row
517: if (field>0 && (locDD+1)<bw) pLi += bw-(locDD+1); // skip padding at beginning (ear)
518: }
519: /* Then, solve U (back substitution, dividing by the stored diagonal) */
520: pLi = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data on block (diagonal)
521: if (field != Nf-1) pLi -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
523: for (int glbDD=end-1, locDD = 0; glbDD >= start; glbDD--, locDD++) {
524: const PetscInt col = (locDD<bw) ? end-1 : glbDD+bw; // end of row in U
525: PetscScalar t = 0;
526: for (int j=col-tid,idx=tid;j>glbDD;j-=blockDim.x,idx+=blockDim.x) {
527: t += pLi[-idx]*x[j];
528: }
529: #if defined(PETSC_USE_COMPLEX)
530: PetscReal tr = PetscRealPartComplex(t), ti = PetscImaginaryPartComplex(t);
531: PetscScalar tt(breduce<PetscReal,BLOCK_SIZE>(tr), breduce<PetscReal,BLOCK_SIZE>(ti));
532: t = tt;
533: #else
534: t = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(t));
535: #endif
536: pLi -= col-glbDD; // diagonal
537: if (threadIdx.x == 0) {
538: x[glbDD] -= t;
539: x[glbDD] /= pLi[0];
540: }
541: __syncthreads();
542: // inc past L to start of previous U
543: pLi -= bw+1;
544: if (glbDD<bw) pLi += bw-glbDD; // overshot in top left corner
545: if (((locDD+1) < bw) && field != Nf-1) pLi -= (bw - (locDD+1)); // skip past right corner
546: }
547: }
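/*
  mat_solve_band performs both triangular solves in place on x, one field per thread block:
  a forward sweep with the unit-diagonal L (each dot product is spread over the block and
  combined with breduce), then a backward sweep with U that also divides by the stored
  diagonal. The pointer pLi is walked through the packed band storage by the inline
  increments rather than recomputed from bi_csr each row.
*/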
549: static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
550: {
551: const PetscScalar *barray;
552: PetscScalar *xarray;
553: thrust::device_ptr<const PetscScalar> bGPU;
554: thrust::device_ptr<PetscScalar> xGPU;
555: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
556: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
557: PetscInt n=A->rmap->n, nz=cusparseTriFactors->nnz, Nf;
558: PetscInt bw = (int)(2.*(double)n-1.-(double)(PetscSqrtReal(1.+4.*((double)n*(double)n-(double)nz))+PETSC_MACHINE_EPSILON))/2; // quadratic formula for the bandwidth, inverting nnz = n + (2n-1)*bw - bw^2
559: PetscErrorCode ierr;
560: PetscContainer container;
563: if (A->rmap->n == 0) {
564: return(0);
565: }
566: // factor: get Nf if available
567: PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);
568: if (container) {
569: PetscInt *pNf=NULL;
570: PetscContainerGetPointer(container, (void **) &pNf);
571: Nf = (*pNf)%1000;
572: } else Nf = 1;
573: if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n(%D) % Nf(%D) != 0",n,Nf);
575: /* Get the GPU pointers */
576: VecCUDAGetArrayWrite(xx,&xarray);
577: VecCUDAGetArrayRead(bb,&barray);
578: xGPU = thrust::device_pointer_cast(xarray);
579: bGPU = thrust::device_pointer_cast(barray);
581: PetscLogGpuTimeBegin();
582: /* First, reorder with the row permutation */
583: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
584: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
585: tempGPU->begin());
586: constexpr int block = 128;
587: mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
588: CHECK_LAUNCH_ERROR(); // does a sync
590: /* Last, reorder with the column permutation */
591: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
592: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
593: xGPU);
595: VecCUDARestoreArrayRead(bb,&barray);
596: VecCUDARestoreArrayWrite(xx,&xarray);
597: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
598: PetscLogGpuTimeEnd();
600: return(0);
601: }