Actual source code: aijcusparse.cu
1: /*
2: Defines the basic matrix operations for the AIJ (compressed row)
3: matrix storage format using the CUSPARSE library.
4: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <petscconf.h>
9: #include <../src/mat/impls/aij/seq/aij.h>
10: #include <../src/mat/impls/sbaij/seq/sbaij.h>
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petsc/private/vecimpl.h>
13: #undef VecType
14: #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15: #include <thrust/async/for_each.h>
17: const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19: /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc., we list them in
20: 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command-line options for them.
22: typedef enum {
23: CUSPARSE_MV_ALG_DEFAULT = 0,
24: CUSPARSE_COOMV_ALG = 1,
25: CUSPARSE_CSRMV_ALG1 = 2,
26: CUSPARSE_CSRMV_ALG2 = 3
27: } cusparseSpMVAlg_t;
29: typedef enum {
30: CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31: CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
32: CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
33: CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
34: CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
35: CUSPARSE_SPMM_ALG_DEFAULT = 0,
36: CUSPARSE_SPMM_COO_ALG1 = 1,
37: CUSPARSE_SPMM_COO_ALG2 = 2,
38: CUSPARSE_SPMM_COO_ALG3 = 3,
39: CUSPARSE_SPMM_COO_ALG4 = 5,
40: CUSPARSE_SPMM_CSR_ALG1 = 4,
41: CUSPARSE_SPMM_CSR_ALG2 = 6,
42: } cusparseSpMMAlg_t;
44: typedef enum {
45: CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46: CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47: } cusparseCsr2CscAlg_t;
48: */
49: const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG","CSRMV_ALG1","CSRMV_ALG2","cusparseSpMVAlg_t","CUSPARSE_",0};
50: const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51: const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52: #endif
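/* Note (a sketch of the invariant, not code): the string tables above must list names in exactly
   the 0-based order of the enum values they mirror, because PetscOptionsEnum() returns the
   position of the matched string, which is then cast to the cuSPARSE enum, e.g.

     "MV_ALG_DEFAULT" -> 0 == CUSPARSE_MV_ALG_DEFAULT
     "CSRMV_ALG1"     -> 2 == CUSPARSE_CSRMV_ALG1

   The consistency checks in MatSetFromOptions_SeqAIJCUSPARSE() below guard this
   assumption against future cuSPARSE renumbering. */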
54: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
83: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85: static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
87: PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88: PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
90: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92: PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93: {
94: cusparseStatus_t stat;
95: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99: cusparsestruct->stream = stream;
100: stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101: return(0);
102: }
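/* Example (a minimal sketch; the matrix "A" is assumed to already be of type MATSEQAIJCUSPARSE):

     cudaStream_t   stream;
     cudaError_t    cerr;
     PetscErrorCode ierr;

     cerr = cudaStreamCreate(&stream);CHKERRCUDA(cerr);
     ierr = MatCUSPARSESetStream(A,stream);CHKERRQ(ierr);
     // subsequent cuSPARSE kernels launched by A are enqueued on "stream"
*/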
104: PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105: {
106: cusparseStatus_t stat;
107: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110: if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111: if (cusparsestruct->handle != handle) {
112: if (cusparsestruct->handle) {
113: stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114: }
115: cusparsestruct->handle = handle;
116: }
117: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118: return(0);
119: }
121: PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122: {
123: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124: PetscBool flg;
125: PetscErrorCode ierr;
128: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
129: if (!flg || !cusparsestruct) return(0);
130: if (cusparsestruct->handle) cusparsestruct->handle = 0;
131: return(0);
132: }
134: PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
135: {
137: *type = MATSOLVERCUSPARSE;
138: return(0);
139: }
141: /*MC
142: MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
143: on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
144: algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145: performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146: CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
147: algorithms are not recommended. This class does NOT support direct solver operations.
149: Level: beginner
151: .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152: M*/
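/* Example (a sketch of standard options usage): select this solver for an ILU
   preconditioner on a GPU matrix from the command line:

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse
*/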
154: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155: {
157: PetscInt n = A->rmap->n;
160: MatCreate(PetscObjectComm((PetscObject)A),B);
161: MatSetSizes(*B,n,n,n,n);
162: (*B)->factortype = ftype;
163: MatSetType(*B,MATSEQAIJCUSPARSE);
165: if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166: MatSetBlockSizesFromMats(*B,A,A);
167: (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
168: (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);
170: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);
171: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);
172: } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173: (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174: (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175: PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);
176: PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);
177: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
179: MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);
180: (*B)->canuseordering = PETSC_TRUE;
181: PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);
182: return(0);
183: }
185: PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186: {
187: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
190: switch (op) {
191: case MAT_CUSPARSE_MULT:
192: cusparsestruct->format = format;
193: break;
194: case MAT_CUSPARSE_ALL:
195: cusparsestruct->format = format;
196: break;
197: default:
198: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199: }
200: return(0);
201: }
203: /*@
204: MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205: operation. Only the MatMult operation can use different GPU storage formats
206: for MPIAIJCUSPARSE matrices.
207: Not Collective
209: Input Parameters:
210: + A - Matrix of type SEQAIJCUSPARSE
211: . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212: - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, or MAT_CUSPARSE_HYB; the latter two require CUDA 4.2 or later)
216: Level: intermediate
218: .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219: @*/
220: PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221: {
226: PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
227: return(0);
228: }
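/* Example (a sketch; "A" is assumed to be of type MATSEQAIJCUSPARSE): store the
   SpMV data in ELL format while leaving other operations at the default:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or equivalently from the options database (see MatSetFromOptions_SeqAIJCUSPARSE() below):

     -mat_cusparse_mult_storage_format ell
*/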
230: PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231: {
235: switch (op) {
236: case MAT_FORM_EXPLICIT_TRANSPOSE:
237: /* need to destroy the transpose matrix if present, to prevent logic errors if flg is set to true later */
238: if (A->form_explicit_transpose && !flg) {MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);}
239: A->form_explicit_transpose = flg;
240: break;
241: default:
242: MatSetOption_SeqAIJ(A,op,flg);
243: break;
244: }
245: return(0);
246: }
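/* Example (sketch): request that an explicit transpose be kept on the GPU so
   transpose multiplies can run as a plain SpMV (see
   MatSeqAIJCUSPARSEFormExplicitTransposeForMult() below):

     ierr = MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE);CHKERRQ(ierr);
*/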
248: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250: static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251: {
252: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
253: IS isrow = b->row,iscol = b->col;
254: PetscBool row_identity,col_identity;
258: MatSeqAIJCUSPARSECopyFromGPU(A);
259: MatLUFactorNumeric_SeqAIJ(B,A,info);
260: B->offloadmask = PETSC_OFFLOAD_CPU;
261: /* determine which version of MatSolve needs to be used. */
262: ISIdentity(isrow,&row_identity);
263: ISIdentity(iscol,&col_identity);
264: if (row_identity && col_identity) {
265: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267: B->ops->matsolve = NULL;
268: B->ops->matsolvetranspose = NULL;
269: } else {
270: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272: B->ops->matsolve = NULL;
273: B->ops->matsolvetranspose = NULL;
274: }
276: /* get the triangular factors */
277: MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);
278: return(0);
279: }
281: static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
282: {
283: PetscErrorCode ierr;
284: MatCUSPARSEStorageFormat format;
285: PetscBool flg;
286: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
289: PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");
290: if (A->factortype == MAT_FACTOR_NONE) {
291: PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
292: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
293: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);}
295: PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
296: "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);
297: if (flg) {MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);}
298: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
299: PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
300: "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);
301: /* If the user set this option, check its consistency with cuSPARSE, since PetscOptionsEnum() assigns enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
302: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
303: if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304: #else
305: if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
306: #endif
307: PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
308: "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);
309: if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
311: PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
312: "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);
313: if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
314: #endif
315: }
316: PetscOptionsTail();
317: return(0);
318: }
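/* Example (sketch): typical runtime configuration using the options parsed above:

     -mat_cusparse_storage_format csr -mat_cusparse_spmv_alg csrmv_alg1

   Values are matched against the string tables at the top of this file. */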
320: static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
321: {
322: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323: PetscErrorCode ierr;
326: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
327: MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
328: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
329: return(0);
330: }
332: static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
333: {
334: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335: PetscErrorCode ierr;
338: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
339: MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
340: B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
341: return(0);
342: }
344: static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345: {
346: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347: PetscErrorCode ierr;
350: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
351: MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
352: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353: return(0);
354: }
356: static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357: {
358: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359: PetscErrorCode ierr;
362: MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);
363: MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
364: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365: return(0);
366: }
368: static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
369: {
370: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
371: PetscInt n = A->rmap->n;
372: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
373: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
374: cusparseStatus_t stat;
375: const PetscInt *ai = a->i,*aj = a->j,*vi;
376: const MatScalar *aa = a->a,*v;
377: PetscInt *AiLo, *AjLo;
378: PetscInt i,nz, nzLower, offset, rowOffset;
379: PetscErrorCode ierr;
380: cudaError_t cerr;
383: if (!n) return(0);
384: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
385: try {
386: /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
387: nzLower=n+ai[n]-ai[1];
388: if (!loTriFactor) {
389: PetscScalar *AALo;
391: cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
393: /* Allocate Space for the lower triangular matrix */
394: cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
395: cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
397: /* Fill the lower triangular matrix */
398: AiLo[0] = (PetscInt) 0;
399: AiLo[n] = nzLower;
400: AjLo[0] = (PetscInt) 0;
401: AALo[0] = (MatScalar) 1.0;
402: v = aa;
403: vi = aj;
404: offset = 1;
405: rowOffset= 1;
406: for (i=1; i<n; i++) {
407: nz = ai[i+1] - ai[i];
408: /* additional 1 for the term on the diagonal */
409: AiLo[i] = rowOffset;
410: rowOffset += nz+1;
412: PetscArraycpy(&(AjLo[offset]), vi, nz);
413: PetscArraycpy(&(AALo[offset]), v, nz);
415: offset += nz;
416: AjLo[offset] = (PetscInt) i;
417: AALo[offset] = (MatScalar) 1.0;
418: offset += 1;
420: v += nz;
421: vi += nz;
422: }
424: /* allocate space for the triangular factor information */
425: PetscNew(&loTriFactor);
426: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
427: /* Create the matrix description */
428: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
429: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
430: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
431: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
432: #else
433: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
434: #endif
435: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
436: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
438: /* set the operation */
439: loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
441: /* set the matrix */
442: loTriFactor->csrMat = new CsrMatrix;
443: loTriFactor->csrMat->num_rows = n;
444: loTriFactor->csrMat->num_cols = n;
445: loTriFactor->csrMat->num_entries = nzLower;
447: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
448: loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
450: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
451: loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
453: loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
454: loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
456: /* Create the solve analysis information */
457: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
458: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
459: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
460: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
461: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
462: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
463: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
464: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
465: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
466: #endif
468: /* perform the solve analysis */
469: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
470: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
471: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
472: loTriFactor->csrMat->column_indices->data().get(),
473: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
474: loTriFactor->solveInfo,
475: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
476: #else
477: loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
478: #endif
479: cerr = WaitForCUDA();CHKERRCUDA(cerr);
480: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
482: /* assign the pointer */
483: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
484: loTriFactor->AA_h = AALo;
485: cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
486: cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
487: PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));
488: } else { /* update values only */
489: if (!loTriFactor->AA_h) {
490: cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
491: }
492: /* Fill the lower triangular matrix */
493: loTriFactor->AA_h[0] = 1.0;
494: v = aa;
495: vi = aj;
496: offset = 1;
497: for (i=1; i<n; i++) {
498: nz = ai[i+1] - ai[i];
499: PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);
500: offset += nz;
501: loTriFactor->AA_h[offset] = 1.0;
502: offset += 1;
503: v += nz;
504: }
505: loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
506: PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));
507: }
508: } catch(char *ex) {
509: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
510: }
511: }
512: return(0);
513: }
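/* Illustration (a sketch): for n = 3 with strictly lower factor entries l10,
   l20, l21, the fill loop in MatSeqAIJCUSPARSEBuildILULowerTriMatrix() produces
   the unit-diagonal lower triangle in CSR form as

     AiLo = {0, 1, 3, 6}
     AjLo = {0,  0, 1,  0, 1, 2}
     AALo = {1,  l10, 1,  l20, l21, 1}

   i.e. each row stores its strictly lower entries followed by an explicit 1.0
   on the diagonal, giving nzLower = n + (strictly lower nonzeros). */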
515: static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516: {
517: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
518: PetscInt n = A->rmap->n;
519: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521: cusparseStatus_t stat;
522: const PetscInt *aj = a->j,*adiag = a->diag,*vi;
523: const MatScalar *aa = a->a,*v;
524: PetscInt *AiUp, *AjUp;
525: PetscInt i,nz, nzUpper, offset;
526: PetscErrorCode ierr;
527: cudaError_t cerr;
530: if (!n) return(0);
531: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532: try {
533: /* next, figure out the number of nonzeros in the upper triangular matrix. */
534: nzUpper = adiag[0]-adiag[n];
535: if (!upTriFactor) {
536: PetscScalar *AAUp;
538: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
540: /* Allocate Space for the upper triangular matrix */
541: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
544: /* Fill the upper triangular matrix */
545: AiUp[0]=(PetscInt) 0;
546: AiUp[n]=nzUpper;
547: offset = nzUpper;
548: for (i=n-1; i>=0; i--) {
549: v = aa + adiag[i+1] + 1;
550: vi = aj + adiag[i+1] + 1;
552: /* number of elements NOT on the diagonal */
553: nz = adiag[i] - adiag[i+1]-1;
555: /* decrement the offset */
556: offset -= (nz+1);
558: /* first, set the diagonal elements */
559: AjUp[offset] = (PetscInt) i;
560: AAUp[offset] = (MatScalar)1./v[nz];
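/* v[nz] holds the stored reciprocal of the U diagonal (PETSc factor convention),
   so 1./v[nz] recovers the actual diagonal entry needed for the
   CUSPARSE_DIAG_TYPE_NON_UNIT solve set up below */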
561: AiUp[i] = AiUp[i+1] - (nz+1);
563: PetscArraycpy(&(AjUp[offset+1]), vi, nz);
564: PetscArraycpy(&(AAUp[offset+1]), v, nz);
565: }
567: /* allocate space for the triangular factor information */
568: PetscNew(&upTriFactor);
569: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
571: /* Create the matrix description */
572: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576: #else
577: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578: #endif
579: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
582: /* set the operation */
583: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
585: /* set the matrix */
586: upTriFactor->csrMat = new CsrMatrix;
587: upTriFactor->csrMat->num_rows = n;
588: upTriFactor->csrMat->num_cols = n;
589: upTriFactor->csrMat->num_entries = nzUpper;
591: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
594: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
597: upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598: upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
600: /* Create the solve analysis information */
601: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
602: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610: #endif
612: /* perform the solve analysis */
613: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616: upTriFactor->csrMat->column_indices->data().get(),
617: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618: upTriFactor->solveInfo,
619: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620: #else
621: upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622: #endif
623: cerr = WaitForCUDA();CHKERRCUDA(cerr);
624: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
626: /* assign the pointer */
627: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628: upTriFactor->AA_h = AAUp;
629: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631: PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));
632: } else {
633: if (!upTriFactor->AA_h) {
634: cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635: }
636: /* Fill the upper triangular matrix */
637: offset = nzUpper;
638: for (i=n-1; i>=0; i--) {
639: v = aa + adiag[i+1] + 1;
641: /* number of elements NOT on the diagonal */
642: nz = adiag[i] - adiag[i+1]-1;
644: /* decrement the offset */
645: offset -= (nz+1);
647: /* first, set the diagonal elements */
648: upTriFactor->AA_h[offset] = 1./v[nz];
649: PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);
650: }
651: upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652: PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));
653: }
654: } catch(char *ex) {
655: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656: }
657: }
658: return(0);
659: }
661: static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
662: {
663: PetscErrorCode ierr;
664: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
665: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
666: IS isrow = a->row,iscol = a->icol;
667: PetscBool row_identity,col_identity;
668: PetscInt n = A->rmap->n;
671: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
672: MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);
673: MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);
675: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
676: cusparseTriFactors->nnz=a->nz;
678: A->offloadmask = PETSC_OFFLOAD_BOTH;
679: /* lower triangular indices */
680: ISIdentity(isrow,&row_identity);
681: if (!row_identity && !cusparseTriFactors->rpermIndices) {
682: const PetscInt *r;
684: ISGetIndices(isrow,&r);
685: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
686: cusparseTriFactors->rpermIndices->assign(r, r+n);
687: ISRestoreIndices(isrow,&r);
688: PetscLogCpuToGpu(n*sizeof(PetscInt));
689: }
691: /* upper triangular indices */
692: ISIdentity(iscol,&col_identity);
693: if (!col_identity && !cusparseTriFactors->cpermIndices) {
694: const PetscInt *c;
696: ISGetIndices(iscol,&c);
697: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
698: cusparseTriFactors->cpermIndices->assign(c, c+n);
699: ISRestoreIndices(iscol,&c);
700: PetscLogCpuToGpu(n*sizeof(PetscInt));
701: }
702: return(0);
703: }
705: static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
706: {
707: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
708: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
709: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
710: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
711: cusparseStatus_t stat;
712: PetscErrorCode ierr;
713: cudaError_t cerr;
714: PetscInt *AiUp, *AjUp;
715: PetscScalar *AAUp;
716: PetscScalar *AALo;
717: PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
718: Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data;
719: const PetscInt *ai = b->i,*aj = b->j,*vj;
720: const MatScalar *aa = b->a,*v;
723: if (!n) return(0);
724: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
725: try {
726: cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727: cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
728: if (!upTriFactor && !loTriFactor) {
729: /* Allocate Space for the upper triangular matrix */
730: cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
731: cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
733: /* Fill the upper triangular matrix */
734: AiUp[0]=(PetscInt) 0;
735: AiUp[n]=nzUpper;
736: offset = 0;
737: for (i=0; i<n; i++) {
738: /* set the pointers */
739: v = aa + ai[i];
740: vj = aj + ai[i];
741: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
743: /* first, set the diagonal elements */
744: AjUp[offset] = (PetscInt) i;
745: AAUp[offset] = (MatScalar)1.0/v[nz];
746: AiUp[i] = offset;
747: AALo[offset] = (MatScalar)1.0/v[nz];
749: offset+=1;
750: if (nz>0) {
751: PetscArraycpy(&(AjUp[offset]), vj, nz);
752: PetscArraycpy(&(AAUp[offset]), v, nz);
753: for (j=offset; j<offset+nz; j++) {
754: AAUp[j] = -AAUp[j];
755: AALo[j] = AAUp[j]/v[nz];
756: }
757: offset+=nz;
758: }
759: }
761: /* allocate space for the triangular factor information */
762: PetscNew(&upTriFactor);
763: upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
765: /* Create the matrix description */
766: stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
767: stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
768: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
770: #else
771: stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
772: #endif
773: stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
774: stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
776: /* set the matrix */
777: upTriFactor->csrMat = new CsrMatrix;
778: upTriFactor->csrMat->num_rows = A->rmap->n;
779: upTriFactor->csrMat->num_cols = A->cmap->n;
780: upTriFactor->csrMat->num_entries = a->nz;
782: upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
783: upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
785: upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
786: upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
788: upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
789: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
791: /* set the operation */
792: upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
794: /* Create the solve analysis information */
795: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
796: stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
797: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
798: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
799: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
800: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
801: upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
802: &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
803: cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
804: #endif
806: /* perform the solve analysis */
807: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
808: upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
809: upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
810: upTriFactor->csrMat->column_indices->data().get(),
811: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
812: upTriFactor->solveInfo,
813: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
814: #else
815: upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
816: #endif
817: cerr = WaitForCUDA();CHKERRCUDA(cerr);
818: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
820: /* assign the pointer */
821: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
823: /* allocate space for the triangular factor information */
824: PetscNew(&loTriFactor);
825: loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
827: /* Create the matrix description */
828: stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
829: stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
830: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832: #else
833: stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834: #endif
835: stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
836: stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
838: /* set the operation */
839: loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
841: /* set the matrix */
842: loTriFactor->csrMat = new CsrMatrix;
843: loTriFactor->csrMat->num_rows = A->rmap->n;
844: loTriFactor->csrMat->num_cols = A->cmap->n;
845: loTriFactor->csrMat->num_entries = a->nz;
847: loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
848: loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
850: loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851: loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
853: loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
856: /* Create the solve analysis information */
857: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
858: stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
859: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
861: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
862: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
863: loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
864: &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865: cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866: #endif
868: /* perform the solve analysis */
869: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
870: loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
871: loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872: loTriFactor->csrMat->column_indices->data().get(),
873: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874: loTriFactor->solveInfo,
875: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876: #else
877: loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878: #endif
879: cerr = WaitForCUDA();CHKERRCUDA(cerr);
880: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
882: /* assign the pointer */
883: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
885: PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));
886: cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
887: cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
888: } else {
889: /* Fill the upper triangular matrix */
890: offset = 0;
891: for (i=0; i<n; i++) {
892: /* set the pointers */
893: v = aa + ai[i];
894: nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
896: /* first, set the diagonal elements */
897: AAUp[offset] = 1.0/v[nz];
898: AALo[offset] = 1.0/v[nz];
900: offset+=1;
901: if (nz>0) {
902: PetscArraycpy(&(AAUp[offset]), v, nz);
903: for (j=offset; j<offset+nz; j++) {
904: AAUp[j] = -AAUp[j];
905: AALo[j] = AAUp[j]/v[nz];
906: }
907: offset+=nz;
908: }
909: }
910: if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
911: if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
912: upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
913: loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914: PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));
915: }
916: cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
917: cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
918: } catch(char *ex) {
919: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
920: }
921: }
922: return(0);
923: }
925: static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
926: {
927: PetscErrorCode ierr;
928: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
929: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
930: IS ip = a->row;
931: PetscBool perm_identity;
932: PetscInt n = A->rmap->n;
935: if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
936: MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
937: if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
938: cusparseTriFactors->nnz=(a->nz-n)*2 + n;
940: A->offloadmask = PETSC_OFFLOAD_BOTH;
942: /* lower triangular indices */
943: ISIdentity(ip,&perm_identity);
944: if (!perm_identity) {
945: IS iip;
946: const PetscInt *irip,*rip;
948: ISInvertPermutation(ip,PETSC_DECIDE,&iip);
949: ISGetIndices(iip,&irip);
950: ISGetIndices(ip,&rip);
951: cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
952: cusparseTriFactors->rpermIndices->assign(rip, rip+n);
953: cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
954: cusparseTriFactors->cpermIndices->assign(irip, irip+n);
955: ISRestoreIndices(iip,&irip);
956: ISDestroy(&iip);
957: ISRestoreIndices(ip,&rip);
958: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
959: }
960: return(0);
961: }
963: static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
964: {
965: Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data;
966: IS ip = b->row;
967: PetscBool perm_identity;
971: MatSeqAIJCUSPARSECopyFromGPU(A);
972: MatCholeskyFactorNumeric_SeqAIJ(B,A,info);
973: B->offloadmask = PETSC_OFFLOAD_CPU;
974: /* determine which version of MatSolve needs to be used. */
975: ISIdentity(ip,&perm_identity);
976: if (perm_identity) {
977: B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
978: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
979: B->ops->matsolve = NULL;
980: B->ops->matsolvetranspose = NULL;
981: } else {
982: B->ops->solve = MatSolve_SeqAIJCUSPARSE;
983: B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
984: B->ops->matsolve = NULL;
985: B->ops->matsolvetranspose = NULL;
986: }
988: /* get the triangular factors */
989: MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);
990: return(0);
991: }
993: static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994: {
995: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000: cusparseStatus_t stat;
1001: cusparseIndexBase_t indexBase;
1002: cusparseMatrixType_t matrixType;
1003: cusparseFillMode_t fillMode;
1004: cusparseDiagType_t diagType;
1005: cudaError_t cerr;
1006: PetscErrorCode ierr;
1009: /* allocate space for the transpose of the lower triangular factor */
1010: PetscNew(&loTriFactorT);
1011: loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1013: /* set the matrix descriptors of the lower triangular factor */
1014: matrixType = cusparseGetMatType(loTriFactor->descr);
1015: indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016: fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018: diagType = cusparseGetMatDiagType(loTriFactor->descr);
1020: /* Create the matrix description */
1021: stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022: stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023: stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024: stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025: stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1027: /* set the operation */
1028: loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1030: /* allocate GPU space for the CSC of the lower triangular factor*/
1031: loTriFactorT->csrMat = new CsrMatrix;
1032: loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols;
1033: loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows;
1034: loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
1035: loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036: loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037: loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1039: /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043: loTriFactor->csrMat->values->data().get(),
1044: loTriFactor->csrMat->row_offsets->data().get(),
1045: loTriFactor->csrMat->column_indices->data().get(),
1046: loTriFactorT->csrMat->values->data().get(),
1047: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048: CUSPARSE_ACTION_NUMERIC,indexBase,
1049: CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050: cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051: #endif
1053: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1054: stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055: loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056: loTriFactor->csrMat->values->data().get(),
1057: loTriFactor->csrMat->row_offsets->data().get(),
1058: loTriFactor->csrMat->column_indices->data().get(),
1059: loTriFactorT->csrMat->values->data().get(),
1060: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061: loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062: CUSPARSE_ACTION_NUMERIC, indexBase,
1063: CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064: #else
1065: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067: #endif
1068: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1071: /* Create the solve analysis information */
1072: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1073: stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078: loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079: &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080: cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081: #endif
1083: /* perform the solve analysis */
1084: stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085: loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086: loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087: loTriFactorT->csrMat->column_indices->data().get(),
1088: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089: loTriFactorT->solveInfo,
1090: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091: #else
1092: loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093: #endif
1094: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1097: /* assign the pointer */
1098: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1100: /*********************************************/
1101: /* Now the Transpose of the Upper Tri Factor */
1102: /*********************************************/
1104: /* allocate space for the transpose of the upper triangular factor */
1105: PetscNew(&upTriFactorT);
1106: upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1108: /* set the matrix descriptors of the upper triangular factor */
1109: matrixType = cusparseGetMatType(upTriFactor->descr);
1110: indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111: fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112: CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113: diagType = cusparseGetMatDiagType(upTriFactor->descr);
1115: /* Create the matrix description */
1116: stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117: stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118: stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119: stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120: stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1122: /* set the operation */
1123: upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1125: /* allocate GPU space for the CSC of the upper triangular factor*/
1126: upTriFactorT->csrMat = new CsrMatrix;
1127: upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols;
1128: upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows;
1129: upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
1130: upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131: upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132: upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1134: /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136: stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138: upTriFactor->csrMat->values->data().get(),
1139: upTriFactor->csrMat->row_offsets->data().get(),
1140: upTriFactor->csrMat->column_indices->data().get(),
1141: upTriFactorT->csrMat->values->data().get(),
1142: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143: CUSPARSE_ACTION_NUMERIC,indexBase,
1144: CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145: cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146: #endif
1148: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1149: stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150: upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151: upTriFactor->csrMat->values->data().get(),
1152: upTriFactor->csrMat->row_offsets->data().get(),
1153: upTriFactor->csrMat->column_indices->data().get(),
1154: upTriFactorT->csrMat->values->data().get(),
1155: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156: upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157: CUSPARSE_ACTION_NUMERIC, indexBase,
1158: CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159: #else
1160: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162: #endif
1164: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1167: /* Create the solve analysis information */
1168: PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1169: stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171: stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174: upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175: &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176: cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177: #endif
1179: /* perform the solve analysis */
1180: stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181: upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182: upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183: upTriFactorT->csrMat->column_indices->data().get(),
1184: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185: upTriFactorT->solveInfo,
1186: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187: #else
1188: upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189: #endif
1191: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192: PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);
1194: /* assign the pointer */
1195: ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196: return(0);
1197: }
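/* Note (sketch): a CSR-to-CSC conversion of a matrix yields exactly the CSR
   representation of its transpose, so the csr2csc calls above produce the
   transposed factors directly on the GPU without assembling them on the host. */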
1199: struct PetscScalarToPetscInt
1200: {
1201: __host__ __device__
1202: PetscInt operator()(PetscScalar s)
1203: {
1204: return (PetscInt)PetscRealPart(s);
1205: }
1206: };
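/* Usage note (a sketch; "vals" and "ints" are hypothetical, equally sized device
   arrays): the functor is meant for thrust algorithms that need scalar data
   reinterpreted as integer indices, e.g.

     thrust::transform(vals.begin(), vals.end(), ints.begin(), PetscScalarToPetscInt());
*/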
1208: static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1209: {
1210: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211: Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1213: cusparseStatus_t stat;
1214: cusparseIndexBase_t indexBase;
1215: cudaError_t err;
1216: PetscErrorCode ierr;
1219: if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) return(0);
1220: MatSeqAIJCUSPARSECopyToGPU(A);
1221: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1222: if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1223: matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1224: if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1225: if (A->transupdated) return(0);
1226: PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1227: PetscLogGpuTimeBegin();
1228: if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1229: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1230: }
1231: if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1232: matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1233: stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1234: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1235: stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1236: stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1238: /* set alpha and beta */
1239: err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1240: err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1241: err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1242: err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1243: err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244: err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1246: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1247: CsrMatrix *matrixT = new CsrMatrix;
1248: matstructT->mat = matrixT;
1249: matrixT->num_rows = A->cmap->n;
1250: matrixT->num_cols = A->rmap->n;
1251: matrixT->num_entries = a->nz;
1252: matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1253: matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1254: matrixT->values = new THRUSTARRAY(a->nz);
1256: if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1257: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1259: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1260: stat = cusparseCreateCsr(&matstructT->matDescr,
1261: matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263: matrixT->values->data().get(),
1264: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265: indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266: #endif
1267: } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1268: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1269: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1270: #else
1271: CsrMatrix *temp = new CsrMatrix;
1272: CsrMatrix *tempT = new CsrMatrix;
1273: /* First convert HYB to CSR */
1274: temp->num_rows = A->rmap->n;
1275: temp->num_cols = A->cmap->n;
1276: temp->num_entries = a->nz;
1277: temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1278: temp->column_indices = new THRUSTINTARRAY32(a->nz);
1279: temp->values = new THRUSTARRAY(a->nz);
1281: stat = cusparse_hyb2csr(cusparsestruct->handle,
1282: matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1283: temp->values->data().get(),
1284: temp->row_offsets->data().get(),
1285: temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1287: /* Next, convert CSR to CSC (i.e. the matrix transpose); note the transpose has A->cmap->n rows */
1288: tempT->num_rows = A->cmap->n;
1289: tempT->num_cols = A->rmap->n;
1290: tempT->num_entries = a->nz;
1291: tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
1292: tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1293: tempT->values = new THRUSTARRAY(a->nz);
1295: stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1296: temp->num_cols, temp->num_entries,
1297: temp->values->data().get(),
1298: temp->row_offsets->data().get(),
1299: temp->column_indices->data().get(),
1300: tempT->values->data().get(),
1301: tempT->column_indices->data().get(),
1302: tempT->row_offsets->data().get(),
1303: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1305: /* Last, convert CSC to HYB */
1306: cusparseHybMat_t hybMat;
1307: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1308: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310: stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
1311: matstructT->descr, tempT->values->data().get(),
1312: tempT->row_offsets->data().get(),
1313: tempT->column_indices->data().get(),
1314: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1316: /* assign the pointer */
1317: matstructT->mat = hybMat;
1318: A->transupdated = PETSC_TRUE;
1319: /* delete temporaries */
1320: if (tempT) {
1321: if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1322: if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1323: if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1324: delete (CsrMatrix*) tempT;
1325: }
1326: if (temp) {
1327: if (temp->values) delete (THRUSTARRAY*) temp->values;
1328: if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1329: if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1330: delete (CsrMatrix*) temp;
1331: }
1332: #endif
1333: }
1334: }
1335: if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* the transpose mat struct may already be present; update its data */
1336: CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1337: CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1338: if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1339: if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1340: if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1341: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1342: if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1343: if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1344: if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1345: if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1346: if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1347: cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1348: cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1349: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
1350: }
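/* Build the csr2csc_i permutation once: run csr2csc on the value sequence 0,1,2,...
   so the transposed values spell out, for each entry of A^T, the position of the
   matching entry in A. E.g. for a 2x2 matrix with CSR entries at (0,0),(0,1),(1,0),
   csr2csc maps [0,1,2] to [0,2,1]; afterwards every transpose update is just the
   permutation copy at the end of this function, with no further csr2csc calls. */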
1351: if (!cusparsestruct->csr2csc_i) {
1352: THRUSTARRAY csr2csc_a(matrix->num_entries);
1353: PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1355: indexBase = cusparseGetMatIndexBase(matstruct->descr);
1356: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1357: void *csr2cscBuffer;
1358: size_t csr2cscBufferSize;
1359: stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1360: A->cmap->n, matrix->num_entries,
1361: matrix->values->data().get(),
1362: cusparsestruct->rowoffsets_gpu->data().get(),
1363: matrix->column_indices->data().get(),
1364: matrixT->values->data().get(),
1365: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1366: CUSPARSE_ACTION_NUMERIC,indexBase,
1367: cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1368: err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1369: #endif
1371: if (matrix->num_entries) {
1372: /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1373: mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11, while CUDA-10 is fine.
1374: I checked every parameter and they were all valid; it is unclear why cusparse complains.
1376: Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1377: should be filled with indexBase, so we just take that shortcut here instead of calling the routine.
1378: */
1379: stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1380: A->cmap->n,matrix->num_entries,
1381: csr2csc_a.data().get(),
1382: cusparsestruct->rowoffsets_gpu->data().get(),
1383: matrix->column_indices->data().get(),
1384: matrixT->values->data().get(),
1385: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1386: matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1387: CUSPARSE_ACTION_NUMERIC,indexBase,
1388: cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1389: #else
1390: matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1391: CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1392: #endif
1393: } else {
1394: matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1395: }
1397: cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1398: PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1399: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1400: err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1401: #endif
1402: }
1403: PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1404: thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1405: matrixT->values->begin()));
1406: }
1407: PetscLogGpuTimeEnd();
1408: PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);
1409: /* the compressed row indices are not used for matTranspose */
1410: matstructT->cprowIndices = NULL;
1411: /* assign the pointer */
1412: ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1413: A->transupdated = PETSC_TRUE;
1414: return(0);
1415: }
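/* The explicit transpose built above is consumed by the MATPRODUCT_AtB/ABt paths
   below (and, when A->form_explicit_transpose is set, by transpose multiplies in
   general); it is invalidated by the MatSeqAIJCUSPARSEInvalidateTranspose() calls in
   MatSeqAIJCUSPARSECopyToGPU() whenever the values or the nonzero pattern change. */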
1417: /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1418: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1419: {
1420: PetscInt n = xx->map->n;
1421: const PetscScalar *barray;
1422: PetscScalar *xarray;
1423: thrust::device_ptr<const PetscScalar> bGPU;
1424: thrust::device_ptr<PetscScalar> xGPU;
1425: cusparseStatus_t stat;
1426: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1427: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1428: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1429: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1430: PetscErrorCode ierr;
1433: /* Analyze the matrix and create the transpose ... on the fly */
1434: if (!loTriFactorT && !upTriFactorT) {
1435: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1436: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1437: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1438: }
1440: /* Get the GPU pointers */
1441: VecCUDAGetArrayWrite(xx,&xarray);
1442: VecCUDAGetArrayRead(bb,&barray);
1443: xGPU = thrust::device_pointer_cast(xarray);
1444: bGPU = thrust::device_pointer_cast(barray);
1446: PetscLogGpuTimeBegin();
1447: /* First, reorder with the row permutation */
1448: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1449: thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1450: xGPU);
1452: /* Next, solve U */
1453: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1454: upTriFactorT->csrMat->num_rows,
1455: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1456: upTriFactorT->csrMat->num_entries,
1457: #endif
1458: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1459: upTriFactorT->csrMat->values->data().get(),
1460: upTriFactorT->csrMat->row_offsets->data().get(),
1461: upTriFactorT->csrMat->column_indices->data().get(),
1462: upTriFactorT->solveInfo,
1463: xarray,
1464: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1465: tempGPU->data().get(),
1466: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1467: #else
1468: tempGPU->data().get());CHKERRCUSPARSE(stat);
1469: #endif
1471: /* Then, solve L */
1472: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1473: loTriFactorT->csrMat->num_rows,
1474: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1475: loTriFactorT->csrMat->num_entries,
1476: #endif
1477: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1478: loTriFactorT->csrMat->values->data().get(),
1479: loTriFactorT->csrMat->row_offsets->data().get(),
1480: loTriFactorT->csrMat->column_indices->data().get(),
1481: loTriFactorT->solveInfo,
1482: tempGPU->data().get(),
1483: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1484: xarray,
1485: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1486: #else
1487: xarray);CHKERRCUSPARSE(stat);
1488: #endif
1490: /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1491: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1492: thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1493: tempGPU->begin());
1495: /* Copy the temporary to the full solution. */
1496: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1498: /* restore */
1499: VecCUDARestoreArrayRead(bb,&barray);
1500: VecCUDARestoreArrayWrite(xx,&xarray);
1501: PetscLogGpuTimeEnd();
1502: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1503: return(0);
1504: }
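/* In the natural-ordering variant below the row/column permutations are the identity,
   so b feeds the U solve directly and the two permutation copies above are skipped. */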
1506: static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1507: {
1508: const PetscScalar *barray;
1509: PetscScalar *xarray;
1510: cusparseStatus_t stat;
1511: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1512: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1513: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1514: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1515: PetscErrorCode ierr;
1518: /* Analyze the matrix and create the transpose ... on the fly */
1519: if (!loTriFactorT && !upTriFactorT) {
1520: MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);
1521: loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1522: upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1523: }
1525: /* Get the GPU pointers */
1526: VecCUDAGetArrayWrite(xx,&xarray);
1527: VecCUDAGetArrayRead(bb,&barray);
1529: PetscLogGpuTimeBegin();
1530: /* First, solve U */
1531: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1532: upTriFactorT->csrMat->num_rows,
1533: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1534: upTriFactorT->csrMat->num_entries,
1535: #endif
1536: &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1537: upTriFactorT->csrMat->values->data().get(),
1538: upTriFactorT->csrMat->row_offsets->data().get(),
1539: upTriFactorT->csrMat->column_indices->data().get(),
1540: upTriFactorT->solveInfo,
1541: barray,
1542: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1543: tempGPU->data().get(),
1544: upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1545: #else
1546: tempGPU->data().get());CHKERRCUSPARSE(stat);
1547: #endif
1549: /* Then, solve L */
1550: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1551: loTriFactorT->csrMat->num_rows,
1552: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1553: loTriFactorT->csrMat->num_entries,
1554: #endif
1555: &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1556: loTriFactorT->csrMat->values->data().get(),
1557: loTriFactorT->csrMat->row_offsets->data().get(),
1558: loTriFactorT->csrMat->column_indices->data().get(),
1559: loTriFactorT->solveInfo,
1560: tempGPU->data().get(),
1561: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1562: xarray,
1563: loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1564: #else
1565: xarray);CHKERRCUSPARSE(stat);
1566: #endif
1568: /* restore */
1569: VecCUDARestoreArrayRead(bb,&barray);
1570: VecCUDARestoreArrayWrite(xx,&xarray);
1571: PetscLogGpuTimeEnd();
1572: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1573: return(0);
1574: }
1576: static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1577: {
1578: const PetscScalar *barray;
1579: PetscScalar *xarray;
1580: thrust::device_ptr<const PetscScalar> bGPU;
1581: thrust::device_ptr<PetscScalar> xGPU;
1582: cusparseStatus_t stat;
1583: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1584: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1585: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1586: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1587: PetscErrorCode ierr;
1591: /* Get the GPU pointers */
1592: VecCUDAGetArrayWrite(xx,&xarray);
1593: VecCUDAGetArrayRead(bb,&barray);
1594: xGPU = thrust::device_pointer_cast(xarray);
1595: bGPU = thrust::device_pointer_cast(barray);
1597: PetscLogGpuTimeBegin();
1598: /* First, reorder with the row permutation */
1599: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1600: thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1601: tempGPU->begin());
1603: /* Next, solve L */
1604: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1605: loTriFactor->csrMat->num_rows,
1606: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1607: loTriFactor->csrMat->num_entries,
1608: #endif
1609: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1610: loTriFactor->csrMat->values->data().get(),
1611: loTriFactor->csrMat->row_offsets->data().get(),
1612: loTriFactor->csrMat->column_indices->data().get(),
1613: loTriFactor->solveInfo,
1614: tempGPU->data().get(),
1615: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1616: xarray,
1617: loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1618: #else
1619: xarray);CHKERRCUSPARSE(stat);
1620: #endif
1622: /* Then, solve U */
1623: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1624: upTriFactor->csrMat->num_rows,
1625: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1626: upTriFactor->csrMat->num_entries,
1627: #endif
1628: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1629: upTriFactor->csrMat->values->data().get(),
1630: upTriFactor->csrMat->row_offsets->data().get(),
1631: upTriFactor->csrMat->column_indices->data().get(),
1632: upTriFactor->solveInfo,xarray,
1633: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1634: tempGPU->data().get(),
1635: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1636: #else
1637: tempGPU->data().get());CHKERRCUSPARSE(stat);
1638: #endif
1640: /* Last, reorder with the column permutation */
1641: thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1642: thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1643: xGPU);
1645: VecCUDARestoreArrayRead(bb,&barray);
1646: VecCUDARestoreArrayWrite(xx,&xarray);
1647: PetscLogGpuTimeEnd();
1648: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1649: return(0);
1650: }
1652: static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1653: {
1654: const PetscScalar *barray;
1655: PetscScalar *xarray;
1656: cusparseStatus_t stat;
1657: Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1658: Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1659: Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1660: THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1661: PetscErrorCode ierr;
1664: /* Get the GPU pointers */
1665: VecCUDAGetArrayWrite(xx,&xarray);
1666: VecCUDAGetArrayRead(bb,&barray);
1668: PetscLogGpuTimeBegin();
1669: /* First, solve L */
1670: stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1671: loTriFactor->csrMat->num_rows,
1672: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1673: loTriFactor->csrMat->num_entries,
1674: #endif
1675: &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1676: loTriFactor->csrMat->values->data().get(),
1677: loTriFactor->csrMat->row_offsets->data().get(),
1678: loTriFactor->csrMat->column_indices->data().get(),
1679: loTriFactor->solveInfo,
1680: barray,
1681: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1682: tempGPU->data().get(),
1683: loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1684: #else
1685: tempGPU->data().get());CHKERRCUSPARSE(stat);
1686: #endif
1688: /* Next, solve U */
1689: stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1690: upTriFactor->csrMat->num_rows,
1691: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1692: upTriFactor->csrMat->num_entries,
1693: #endif
1694: &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1695: upTriFactor->csrMat->values->data().get(),
1696: upTriFactor->csrMat->row_offsets->data().get(),
1697: upTriFactor->csrMat->column_indices->data().get(),
1698: upTriFactor->solveInfo,
1699: tempGPU->data().get(),
1700: #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1701: xarray,
1702: upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1703: #else
1704: xarray);CHKERRCUSPARSE(stat);
1705: #endif
1707: VecCUDARestoreArrayRead(bb,&barray);
1708: VecCUDARestoreArrayWrite(xx,&xarray);
1709: PetscLogGpuTimeEnd();
1710: PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);
1711: return(0);
1712: }
1714: static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1715: {
1716: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1717: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1718: cudaError_t cerr;
1719: PetscErrorCode ierr;
1722: if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1723: CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1725: PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1726: cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1727: cerr = WaitForCUDA();CHKERRCUDA(cerr);
1728: PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));
1729: PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);
1730: A->offloadmask = PETSC_OFFLOAD_BOTH;
1731: }
1732: return(0);
1733: }
1735: static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1736: {
1737: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1741: MatSeqAIJCUSPARSECopyFromGPU(A);
1742: *array = a->a;
1743: A->offloadmask = PETSC_OFFLOAD_CPU;
1744: return(0);
1745: }
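/* Offload-mask convention used throughout this file: PETSC_OFFLOAD_UNALLOCATED/CPU
   means the host data is authoritative and MatSeqAIJCUSPARSECopyToGPU() must (re)build
   or refresh the device copy; PETSC_OFFLOAD_GPU means the device is ahead and
   MatSeqAIJCUSPARSECopyFromGPU() downloads the values; PETSC_OFFLOAD_BOTH means the
   two copies agree. */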
1747: PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1748: {
1749: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1750: Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1751: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1752: PetscInt m = A->rmap->n,*ii,*ridx,tmp;
1753: PetscErrorCode ierr;
1754: cusparseStatus_t stat;
1755: PetscBool both = PETSC_TRUE;
1756: cudaError_t err;
1759: if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1760: if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1761: if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1762: CsrMatrix *matrix;
1763: matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1765: if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
1766: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1767: matrix->values->assign(a->a, a->a+a->nz);
1768: err = WaitForCUDA();CHKERRCUDA(err);
1769: PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));
1770: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1771: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
1772: } else {
1773: PetscInt nnz;
1774: PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);
1775: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);
1776: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
1777: delete cusparsestruct->workVector;
1778: delete cusparsestruct->rowoffsets_gpu;
1779: cusparsestruct->workVector = NULL;
1780: cusparsestruct->rowoffsets_gpu = NULL;
1781: try {
1782: if (a->compressedrow.use) {
1783: m = a->compressedrow.nrows;
1784: ii = a->compressedrow.i;
1785: ridx = a->compressedrow.rindex;
1786: } else {
1787: m = A->rmap->n;
1788: ii = a->i;
1789: ridx = NULL;
1790: }
1791: if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1792: if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1793: if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1794: else nnz = a->nz;
1796: /* create cusparse matrix */
1797: cusparsestruct->nrows = m;
1798: matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1799: stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1800: stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1801: stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1803: err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1804: err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1805: err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1806: err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1807: err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1808: err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1809: stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1811: /* Build the cusparse matrix in CSR format, or hybrid/ELLPACK if that storage format was requested */
1812: if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1813: /* set the matrix */
1814: CsrMatrix *mat= new CsrMatrix;
1815: mat->num_rows = m;
1816: mat->num_cols = A->cmap->n;
1817: mat->num_entries = nnz;
1818: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1819: mat->row_offsets->assign(ii, ii + m+1);
1821: mat->column_indices = new THRUSTINTARRAY32(nnz);
1822: mat->column_indices->assign(a->j, a->j+nnz);
1824: mat->values = new THRUSTARRAY(nnz);
1825: if (a->a) mat->values->assign(a->a, a->a+nnz);
1827: /* assign the pointer */
1828: matstruct->mat = mat;
1829: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1830: if (mat->num_rows) { /* cusparse errors on empty matrices! */
1831: stat = cusparseCreateCsr(&matstruct->matDescr,
1832: mat->num_rows, mat->num_cols, mat->num_entries,
1833: mat->row_offsets->data().get(), mat->column_indices->data().get(),
1834: mat->values->data().get(),
1835: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1836: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1837: }
1838: #endif
1839: } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1840: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1841: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1842: #else
1843: CsrMatrix *mat= new CsrMatrix;
1844: mat->num_rows = m;
1845: mat->num_cols = A->cmap->n;
1846: mat->num_entries = nnz;
1847: mat->row_offsets = new THRUSTINTARRAY32(m+1);
1848: mat->row_offsets->assign(ii, ii + m+1);
1850: mat->column_indices = new THRUSTINTARRAY32(nnz);
1851: mat->column_indices->assign(a->j, a->j+nnz);
1853: mat->values = new THRUSTARRAY(nnz);
1854: if (a->a) mat->values->assign(a->a, a->a+nnz);
1856: cusparseHybMat_t hybMat;
1857: stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1858: cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1859: CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1860: stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1861: matstruct->descr, mat->values->data().get(),
1862: mat->row_offsets->data().get(),
1863: mat->column_indices->data().get(),
1864: hybMat, 0, partition);CHKERRCUSPARSE(stat);
1865: /* assign the pointer */
1866: matstruct->mat = hybMat;
1868: if (mat) {
1869: if (mat->values) delete (THRUSTARRAY*)mat->values;
1870: if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1871: if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1872: delete (CsrMatrix*)mat;
1873: }
1874: #endif
1875: }
1877: /* assign the compressed row indices */
1878: if (a->compressedrow.use) {
1879: cusparsestruct->workVector = new THRUSTARRAY(m);
1880: matstruct->cprowIndices = new THRUSTINTARRAY(m);
1881: matstruct->cprowIndices->assign(ridx,ridx+m);
1882: tmp = m;
1883: } else {
1884: cusparsestruct->workVector = NULL;
1885: matstruct->cprowIndices = NULL;
1886: tmp = 0;
1887: }
1888: PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));
1890: /* assign the pointer */
1891: cusparsestruct->mat = matstruct;
1892: } catch(char *ex) {
1893: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1894: }
1895: err = WaitForCUDA();CHKERRCUDA(err);
1896: PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);
1897: cusparsestruct->nonzerostate = A->nonzerostate;
1898: }
1899: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1900: }
1901: return(0);
1902: }
1904: struct VecCUDAPlusEquals
1905: {
1906: template <typename Tuple>
1907: __host__ __device__
1908: void operator()(Tuple t)
1909: {
1910: thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1911: }
1912: };
1914: struct VecCUDAEquals
1915: {
1916: template <typename Tuple>
1917: __host__ __device__
1918: void operator()(Tuple t)
1919: {
1920: thrust::get<1>(t) = thrust::get<0>(t);
1921: }
1922: };
1924: struct VecCUDAEqualsReverse
1925: {
1926: template <typename Tuple>
1927: __host__ __device__
1928: void operator()(Tuple t)
1929: {
1930: thrust::get<0>(t) = thrust::get<1>(t);
1931: }
1932: };
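/* These functors are applied through thrust::for_each over a zip iterator to
   scatter/gather between a compressed work vector and a full-length vector. A minimal
   sketch (hypothetical names; assuming w is a THRUSTARRAY* and y is a
   thrust::device_ptr<PetscScalar>, with cprowIndices holding the uncompressed row
   indices) for y[cprowIndices[i]] += w[i]:

     thrust::for_each(thrust::device,
       thrust::make_zip_iterator(thrust::make_tuple(w->begin(),
         thrust::make_permutation_iterator(y,cprowIndices->begin()))),
       thrust::make_zip_iterator(thrust::make_tuple(w->end(),
         thrust::make_permutation_iterator(y,cprowIndices->end()))),
       VecCUDAPlusEquals());
*/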
1934: struct MatMatCusparse {
1935: PetscBool cisdense;
1936: PetscScalar *Bt;
1937: Mat X;
1938: PetscBool reusesym; /* cuSPARSE does not have separate symbolic and numeric phases for sparse matmat operations */
1939: PetscLogDouble flops;
1940: CsrMatrix *Bcsr;
1942: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1943: cusparseSpMatDescr_t matSpBDescr;
1944: PetscBool initialized; /* the descriptors below implement C = alpha op(A) op(B) + beta C */
1945: cusparseDnMatDescr_t matBDescr;
1946: cusparseDnMatDescr_t matCDescr;
1947: PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
1948: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1949: void *dBuffer4;
1950: void *dBuffer5;
1951: #endif
1952: size_t mmBufferSize;
1953: void *mmBuffer;
1954: void *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1955: cusparseSpGEMMDescr_t spgemmDesc;
1956: #endif
1957: };
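/* A MatMatCusparse instance is created during the MatProductSymbolic phase, stashed
   in C->product->data, and released through C->product->destroy, which points at
   MatDestroy_MatMatCusparse() below. */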
1959: static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1960: {
1961: PetscErrorCode ierr;
1962: MatMatCusparse *mmdata = (MatMatCusparse *)data;
1963: cudaError_t cerr;
1964: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1965: cusparseStatus_t stat;
1966: #endif
1969: cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1970: delete mmdata->Bcsr;
1971: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1972: if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1973: if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1974: if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1975: if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1976: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
1977: if (mmdata->dBuffer4) { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
1978: if (mmdata->dBuffer5) { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
1979: #endif
1980: if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1981: if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1982: #endif
1983: MatDestroy(&mmdata->X);
1984: PetscFree(data);
1985: return(0);
1986: }
1988: PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1990: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1991: {
1992: Mat_Product *product = C->product;
1993: Mat A,B;
1994: PetscInt m,n,blda,clda;
1995: PetscBool flg,biscuda;
1996: Mat_SeqAIJCUSPARSE *cusp;
1997: cusparseStatus_t stat;
1998: cusparseOperation_t opA;
1999: const PetscScalar *barray;
2000: PetscScalar *carray;
2001: PetscErrorCode ierr;
2002: MatMatCusparse *mmdata;
2003: Mat_SeqAIJCUSPARSEMultStruct *mat;
2004: CsrMatrix *csrmat;
2007: MatCheckProduct(C,1);
2008: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2009: mmdata = (MatMatCusparse*)product->data;
2010: A = product->A;
2011: B = product->B;
2012: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2013: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2014: /* currently CopyToGpu does not copy if the matrix is bound to the CPU;
2015: instead of silently accepting a wrong answer, we prefer to raise an error */
2016: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2017: MatSeqAIJCUSPARSECopyToGPU(A);
2018: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2019: switch (product->type) {
2020: case MATPRODUCT_AB:
2021: case MATPRODUCT_PtAP:
2022: mat = cusp->mat;
2023: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2024: m = A->rmap->n;
2025: n = B->cmap->n;
2026: break;
2027: case MATPRODUCT_AtB:
2028: if (!A->form_explicit_transpose) {
2029: mat = cusp->mat;
2030: opA = CUSPARSE_OPERATION_TRANSPOSE;
2031: } else {
2032: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2033: mat = cusp->matTranspose;
2034: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2035: }
2036: m = A->cmap->n;
2037: n = B->cmap->n;
2038: break;
2039: case MATPRODUCT_ABt:
2040: case MATPRODUCT_RARt:
2041: mat = cusp->mat;
2042: opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2043: m = A->rmap->n;
2044: n = B->rmap->n;
2045: break;
2046: default:
2047: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2048: }
2049: if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2050: csrmat = (CsrMatrix*)mat->mat;
2051: /* if the user passed a CPU matrix, copy the data to the GPU */
2052: PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);
2053: if (!biscuda) {MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);}
2054: MatDenseCUDAGetArrayRead(B,&barray);
2056: MatDenseGetLDA(B,&blda);
2057: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2058: MatDenseCUDAGetArrayWrite(mmdata->X,&carray);
2059: MatDenseGetLDA(mmdata->X,&clda);
2060: } else {
2061: MatDenseCUDAGetArrayWrite(C,&carray);
2062: MatDenseGetLDA(C,&clda);
2063: }
2065: PetscLogGpuTimeBegin();
2066: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2067: cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2068: /* (re)allocate mmBuffer if not initialized or LDAs are different */
2069: if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2070: size_t mmBufferSize;
2071: if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2072: if (!mmdata->matBDescr) {
2073: stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2074: mmdata->Blda = blda;
2075: }
2077: if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2078: if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2079: stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2080: mmdata->Clda = clda;
2081: }
2083: if (!mat->matDescr) {
2084: stat = cusparseCreateCsr(&mat->matDescr,
2085: csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2086: csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2087: csrmat->values->data().get(),
2088: CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2089: CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2090: }
2091: stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2092: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2093: mmdata->matCDescr,cusparse_scalartype,
2094: cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2095: if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2096: cudaError_t cerr;
2097: cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2098: cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2099: mmdata->mmBufferSize = mmBufferSize;
2100: }
2101: mmdata->initialized = PETSC_TRUE;
2102: } else {
2103: /* to be safe, always update pointers of the mats */
2104: stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2105: stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2106: stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2107: }
2109: /* do cusparseSpMM, which supports transpose on B */
2110: stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2111: mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2112: mmdata->matCDescr,cusparse_scalartype,
2113: cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2114: #else
2115: PetscInt k;
2116: /* cusparseXcsrmm does not support transpose on B */
2117: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2118: cublasHandle_t cublasv2handle;
2119: cublasStatus_t cerr;
2121: PetscCUBLASGetHandle(&cublasv2handle);
2122: cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2123: B->cmap->n,B->rmap->n,
2124: &PETSC_CUSPARSE_ONE ,barray,blda,
2125: &PETSC_CUSPARSE_ZERO,barray,blda,
2126: mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2127: blda = B->cmap->n;
2128: k = B->cmap->n;
2129: } else {
2130: k = B->rmap->n;
2131: }
2133: /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2134: stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2135: csrmat->num_entries,mat->alpha_one,mat->descr,
2136: csrmat->values->data().get(),
2137: csrmat->row_offsets->data().get(),
2138: csrmat->column_indices->data().get(),
2139: mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2140: carray,clda);CHKERRCUSPARSE(stat);
2141: #endif
2142: PetscLogGpuTimeEnd();
2143: PetscLogGpuFlops(n*2.0*csrmat->num_entries);
2144: MatDenseCUDARestoreArrayRead(B,&barray);
2145: if (product->type == MATPRODUCT_RARt) {
2146: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2147: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);
2148: } else if (product->type == MATPRODUCT_PtAP) {
2149: MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);
2150: MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);
2151: } else {
2152: MatDenseCUDARestoreArrayWrite(C,&carray);
2153: }
2154: if (mmdata->cisdense) {
2155: MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);
2156: }
2157: if (!biscuda) {
2158: MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);
2159: }
2160: return(0);
2161: }
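/* For MATPRODUCT_RARt and MATPRODUCT_PtAP the sparse kernel above only forms the
   intermediate X = A*R^T resp. X = A*P; the final dense-dense product with B is done
   by MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(), whose first boolean flag
   selects B^T for the PtAP case. */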
2163: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2164: {
2165: Mat_Product *product = C->product;
2166: Mat A,B;
2167: PetscInt m,n;
2168: PetscBool cisdense,flg;
2169: PetscErrorCode ierr;
2170: MatMatCusparse *mmdata;
2171: Mat_SeqAIJCUSPARSE *cusp;
2174: MatCheckProduct(C,1);
2175: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2176: A = product->A;
2177: B = product->B;
2178: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2179: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2180: cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2181: if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2182: switch (product->type) {
2183: case MATPRODUCT_AB:
2184: m = A->rmap->n;
2185: n = B->cmap->n;
2186: break;
2187: case MATPRODUCT_AtB:
2188: m = A->cmap->n;
2189: n = B->cmap->n;
2190: break;
2191: case MATPRODUCT_ABt:
2192: m = A->rmap->n;
2193: n = B->rmap->n;
2194: break;
2195: case MATPRODUCT_PtAP:
2196: m = B->cmap->n;
2197: n = B->cmap->n;
2198: break;
2199: case MATPRODUCT_RARt:
2200: m = B->rmap->n;
2201: n = B->rmap->n;
2202: break;
2203: default:
2204: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2205: }
2206: MatSetSizes(C,m,n,m,n);
2207: /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
2208: PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);
2209: MatSetType(C,MATSEQDENSECUDA);
2211: /* product data */
2212: PetscNew(&mmdata);
2213: mmdata->cisdense = cisdense;
2214: #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2215: /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2216: if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2217: cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2218: }
2219: #endif
2220: /* for these products we need intermediate storage for X = A*B (PtAP) or X = A*B^T (RARt); the final dense product with B happens in the numeric phase */
2221: if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2222: MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);
2223: MatSetType(mmdata->X,MATSEQDENSECUDA);
2224: if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2225: MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);
2226: } else {
2227: MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);
2228: }
2229: }
2230: C->product->data = mmdata;
2231: C->product->destroy = MatDestroy_MatMatCusparse;
2233: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2234: return(0);
2235: }
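/* A user reaches the symbolic/numeric kernels above through the MatProduct API.
   A hedged sketch (assuming A is MATSEQAIJCUSPARSE and B is MATSEQDENSECUDA):

     Mat C;
     MatProductCreate(A,B,NULL,&C);
     MatProductSetType(C,MATPRODUCT_AB);
     MatProductSetFromOptions(C);
     MatProductSymbolic(C);   // dispatches to MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA
     MatProductNumeric(C);    // runs the cusparseSpMM (or csrmm) path
*/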
2237: static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2238: {
2239: Mat_Product *product = C->product;
2240: Mat A,B;
2241: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2242: Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data;
2243: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2244: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2245: PetscBool flg;
2246: PetscErrorCode ierr;
2247: cusparseStatus_t stat;
2248: cudaError_t cerr;
2249: MatProductType ptype;
2250: MatMatCusparse *mmdata;
2251: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2252: cusparseSpMatDescr_t BmatSpDescr;
2253: #endif
2254: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2257: MatCheckProduct(C,1);
2258: if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2259: PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);
2260: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2261: mmdata = (MatMatCusparse*)C->product->data;
2262: A = product->A;
2263: B = product->B;
2264: if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
2265: mmdata->reusesym = PETSC_FALSE;
2266: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2267: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2268: Cmat = Ccusp->mat;
2269: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2270: Ccsr = (CsrMatrix*)Cmat->mat;
2271: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2272: goto finalize;
2273: }
2274: if (!c->nz) goto finalize;
2275: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2276: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2277: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2278: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2279: if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2280: if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2281: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2282: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2283: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2284: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2285: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2286: if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2287: MatSeqAIJCUSPARSECopyToGPU(A);
2288: MatSeqAIJCUSPARSECopyToGPU(B);
2290: ptype = product->type;
2291: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2292: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2293: switch (ptype) {
2294: case MATPRODUCT_AB:
2295: Amat = Acusp->mat;
2296: Bmat = Bcusp->mat;
2297: break;
2298: case MATPRODUCT_AtB:
2299: Amat = Acusp->matTranspose;
2300: Bmat = Bcusp->mat;
2301: break;
2302: case MATPRODUCT_ABt:
2303: Amat = Acusp->mat;
2304: Bmat = Bcusp->matTranspose;
2305: break;
2306: default:
2307: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2308: }
2309: Cmat = Ccusp->mat;
2310: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2311: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2312: if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2313: Acsr = (CsrMatrix*)Amat->mat;
2314: Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2315: Ccsr = (CsrMatrix*)Cmat->mat;
2316: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2317: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2318: if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2319: PetscLogGpuTimeBegin();
2320: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2321: BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2322: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2323: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2324: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2325: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2326: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2327: mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2328: #else
2329: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2330: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2331: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2332: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2333: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2334: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2335: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2336: #endif
2337: #else
2338: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2339: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2340: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2341: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2342: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2343: #endif
2344: PetscLogGpuFlops(mmdata->flops);
2345: cerr = WaitForCUDA();CHKERRCUDA(cerr);
2346: PetscLogGpuTimeEnd();
2347: C->offloadmask = PETSC_OFFLOAD_GPU;
2348: finalize:
2349: /* shorter version of MatAssemblyEnd_SeqAIJ */
2350: PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);
2351: PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");
2352: PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);
2353: c->reallocs = 0;
2354: C->info.mallocs += 0;
2355: C->info.nz_unneeded = 0;
2356: C->assembled = C->was_assembled = PETSC_TRUE;
2357: C->num_ass++;
2358: return(0);
2359: }
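/* Numeric-phase recap: with CUDA >= 11.4 the values of C are recomputed in place via
   cusparseSpGEMMreuse_compute(); with 11.0 <= CUDA < 11.4, cusparseSpGEMM_compute()
   followed by cusparseSpGEMM_copy() is used; before CUDA 11 the legacy
   cusparse_csr_spgemm() path runs. The symbolic phase below sets up the descriptors
   and buffers these calls rely on. */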
2361: static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2362: {
2363: Mat_Product *product = C->product;
2364: Mat A,B;
2365: Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp;
2366: Mat_SeqAIJ *a,*b,*c;
2367: Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2368: CsrMatrix *Acsr,*Bcsr,*Ccsr;
2369: PetscInt i,j,m,n,k;
2370: PetscBool flg;
2371: PetscErrorCode ierr;
2372: cusparseStatus_t stat;
2373: cudaError_t cerr;
2374: MatProductType ptype;
2375: MatMatCusparse *mmdata;
2376: PetscLogDouble flops;
2377: PetscBool biscompressed,ciscompressed;
2378: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2379: int64_t C_num_rows1, C_num_cols1, C_nnz1;
2380: cusparseSpMatDescr_t BmatSpDescr;
2381: #else
2382: int cnz;
2383: #endif
2384: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2387: MatCheckProduct(C,1);
2388: if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2389: A = product->A;
2390: B = product->B;
2391: PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);
2392: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2393: PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);
2394: if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2395: a = (Mat_SeqAIJ*)A->data;
2396: b = (Mat_SeqAIJ*)B->data;
2397: Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2398: Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2399: if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2400: if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2402: /* product data */
2403: PetscNew(&mmdata);
2404: C->product->data = mmdata;
2405: C->product->destroy = MatDestroy_MatMatCusparse;
2407: MatSeqAIJCUSPARSECopyToGPU(A);
2408: MatSeqAIJCUSPARSECopyToGPU(B);
2409: ptype = product->type;
2410: if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2411: if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2412: biscompressed = PETSC_FALSE;
2413: ciscompressed = PETSC_FALSE;
2414: switch (ptype) {
2415: case MATPRODUCT_AB:
2416: m = A->rmap->n;
2417: n = B->cmap->n;
2418: k = A->cmap->n;
2419: Amat = Acusp->mat;
2420: Bmat = Bcusp->mat;
2421: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2422: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2423: break;
2424: case MATPRODUCT_AtB:
2425: m = A->cmap->n;
2426: n = B->cmap->n;
2427: k = A->rmap->n;
2428: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
2429: Amat = Acusp->matTranspose;
2430: Bmat = Bcusp->mat;
2431: if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2432: break;
2433: case MATPRODUCT_ABt:
2434: m = A->rmap->n;
2435: n = B->rmap->n;
2436: k = A->cmap->n;
2437: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
2438: Amat = Acusp->mat;
2439: Bmat = Bcusp->matTranspose;
2440: if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2441: break;
2442: default:
2443: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2444: }
2446: /* create cusparse matrix */
2447: MatSetSizes(C,m,n,m,n);
2448: MatSetType(C,MATSEQAIJCUSPARSE);
2449: c = (Mat_SeqAIJ*)C->data;
2450: Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2451: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
2452: Ccsr = new CsrMatrix;
2454: c->compressedrow.use = ciscompressed;
2455: if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format too */
2456: c->compressedrow.nrows = a->compressedrow.nrows;
2457: PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);
2458: PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);
2459: Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows);
2460: Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2461: Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2462: } else {
2463: c->compressedrow.nrows = 0;
2464: c->compressedrow.i = NULL;
2465: c->compressedrow.rindex = NULL;
2466: Ccusp->workVector = NULL;
2467: Cmat->cprowIndices = NULL;
2468: }
2469: Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m;
2470: Ccusp->mat = Cmat;
2471: Ccusp->mat->mat = Ccsr;
2472: Ccsr->num_rows = Ccusp->nrows;
2473: Ccsr->num_cols = n;
2474: Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2475: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2476: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2477: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2478: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2479: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2480: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2481: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2482: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2483: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2484: if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2485: thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2486: c->nz = 0;
2487: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2488: Ccsr->values = new THRUSTARRAY(c->nz);
2489: goto finalizesym;
2490: }
2492: if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2493: if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2494: Acsr = (CsrMatrix*)Amat->mat;
2495: if (!biscompressed) {
2496: Bcsr = (CsrMatrix*)Bmat->mat;
2497: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2498: BmatSpDescr = Bmat->matDescr;
2499: #endif
2500: } else { /* we need to use row offsets for the full matrix */
2501: CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2502: Bcsr = new CsrMatrix;
2503: Bcsr->num_rows = B->rmap->n;
2504: Bcsr->num_cols = cBcsr->num_cols;
2505: Bcsr->num_entries = cBcsr->num_entries;
2506: Bcsr->column_indices = cBcsr->column_indices;
2507: Bcsr->values = cBcsr->values;
2508: if (!Bcusp->rowoffsets_gpu) {
2509: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2510: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2511: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
2512: }
2513: Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2514: mmdata->Bcsr = Bcsr;
2515: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2516: if (Bcsr->num_rows && Bcsr->num_cols) {
2517: stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2518: Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2519: Bcsr->values->data().get(),
2520: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2521: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2522: }
2523: BmatSpDescr = mmdata->matSpBDescr;
2524: #endif
2525: }
2526: if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2527: if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2528: /* precompute flops count */
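/* For C = A*B, each nonzero a(i,k) is multiplied against row k of B, costing 2*nnz(B(k,:)) flops;
   for C = A^T*B, row i of A pairs with row i of B, costing 2*nnz(A(i,:))*nnz(B(i,:)) flops.
   The loops below compute exactly these counts from the host row offsets. */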
2529: if (ptype == MATPRODUCT_AB) {
2530: for (i=0, flops = 0; i<A->rmap->n; i++) {
2531: const PetscInt st = a->i[i];
2532: const PetscInt en = a->i[i+1];
2533: for (j=st; j<en; j++) {
2534: const PetscInt brow = a->j[j];
2535: flops += 2.*(b->i[brow+1] - b->i[brow]);
2536: }
2537: }
2538: } else if (ptype == MATPRODUCT_AtB) {
2539: for (i=0, flops = 0; i<A->rmap->n; i++) {
2540: const PetscInt anzi = a->i[i+1] - a->i[i];
2541: const PetscInt bnzi = b->i[i+1] - b->i[i];
2542: flops += (2.*anzi)*bnzi;
2543: }
2544: } else { /* TODO */
2545: flops = 0.;
2546: }
2548: mmdata->flops = flops;
2549: PetscLogGpuTimeBegin();
2551: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2552: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2553: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2554: NULL, NULL, NULL,
2555: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2556: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2557: stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2558: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2559: {
2560: /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2561: We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2562: */
2563: void* dBuffer1 = NULL;
2564: void* dBuffer2 = NULL;
2565: void* dBuffer3 = NULL;
2566: /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2567: size_t bufferSize1 = 0;
2568: size_t bufferSize2 = 0;
2569: size_t bufferSize3 = 0;
2570: size_t bufferSize4 = 0;
2571: size_t bufferSize5 = 0;
2573: /*----------------------------------------------------------------------*/
2574: /* ask how many bytes of external memory (bufferSize1) are needed */
2575: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2576: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2577: &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2578: cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2579: /* inspect the matrices A and B to understand the memory requirement for the next step */
2580: stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2581: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2582: &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2584: /*----------------------------------------------------------------------*/
2585: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2586: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2587: &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2588: cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2589: cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2590: cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2591: stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2592: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2593: &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2594: cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2595: cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2597: /*----------------------------------------------------------------------*/
2598: /* get matrix C non-zero entries C_nnz1 */
2599: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2600: c->nz = (PetscInt) C_nnz1;
2601: /* allocate matrix C */
2602: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2603: Ccsr->values = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2604: /* update matC with the new pointers */
2605: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2606: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2608: /*----------------------------------------------------------------------*/
2609: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2610: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2611: &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2612: cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2613: stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2614: CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2615: &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2616: cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2617: stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2618: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2619: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2620: mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2621: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);
2622: }
2623: #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2624: size_t bufSize2;
2625: /* ask how many bytes of external memory (bufSize2) are needed */
2626: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2627: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2628: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2629: mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2630: cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2631: /* inspect the matrices A and B to understand the memory requirement for the next step */
2632: stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2633: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2634: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2635: mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2636: /* ask how many bytes of external memory are needed for the compute step */
2637: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2638: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2639: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2640: mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2641: /* The CUSPARSE documentation is not clear, nor is the API.
2642:    We need both buffers to perform the operations properly!
2643:    mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2644:    it only appears in the workEstimation calls, yet it seems to be needed in compute, so probably the address
2645:    is stored in the descriptor! What a messy API... */
2646: cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2647: /* compute the intermediate product of A * B */
2648: stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2649: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2650: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2651: mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2652: /* get matrix C non-zero entries C_nnz1 */
2653: stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2654: c->nz = (PetscInt) C_nnz1;
2655: PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);
2656: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2657: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2658: Ccsr->values = new THRUSTARRAY(c->nz);
2659: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2660: stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2661: Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2662: stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2663: Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2664: cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2665: #endif
2666: #else
2667: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2668: stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2669: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2670: Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2671: Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2672: Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2673: c->nz = cnz;
2674: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2675: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2676: Ccsr->values = new THRUSTARRAY(c->nz);
2677: CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2679: stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2680: /* With the old gemm interface (removed in CUDA 11.0) we cannot compute the symbolic factorization only.
2681:    I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only phase by passing NULL for the values, but it seems quite buggy when
2682:    D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2683: stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2684: Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2685: Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2686: Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2687: Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2688: #endif
2689: PetscLogGpuFlops(mmdata->flops);
2690: PetscLogGpuTimeEnd();
2691: finalizesym:
2692: c->singlemalloc = PETSC_FALSE;
2693: c->free_a = PETSC_TRUE;
2694: c->free_ij = PETSC_TRUE;
2695: PetscMalloc1(m+1,&c->i);
2696: PetscMalloc1(c->nz,&c->j);
2697: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2698: PetscInt *d_i = c->i;
2699: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2700: THRUSTINTARRAY jj(Ccsr->column_indices->size());
2701: ii = *Ccsr->row_offsets;
2702: jj = *Ccsr->column_indices;
2703: if (ciscompressed) d_i = c->compressedrow.i;
2704: cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2705: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2706: } else {
2707: PetscInt *d_i = c->i;
2708: if (ciscompressed) d_i = c->compressedrow.i;
2709: cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2710: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2711: }
2712: if (ciscompressed) { /* need to expand host row offsets */
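/* Worked example (values illustrative): m = 4 with nonzero rows rindex = [1,3] and compressed offsets
   compressedrow.i = [0,2,5]. The loops below expand this to c->i = [0,0,2,2,5]:
   row 0 is empty, row 1 has 2 entries, row 2 is empty, row 3 has 3 entries. */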
2713: PetscInt r = 0;
2714: c->i[0] = 0;
2715: for (k = 0; k < c->compressedrow.nrows; k++) {
2716: const PetscInt next = c->compressedrow.rindex[k];
2717: const PetscInt old = c->compressedrow.i[k];
2718: for (; r < next; r++) c->i[r+1] = old;
2719: }
2720: for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2721: }
2722: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
2723: PetscMalloc1(m,&c->ilen);
2724: PetscMalloc1(m,&c->imax);
2725: c->maxnz = c->nz;
2726: c->nonzerorowcnt = 0;
2727: c->rmax = 0;
2728: for (k = 0; k < m; k++) {
2729: const PetscInt nn = c->i[k+1] - c->i[k];
2730: c->ilen[k] = c->imax[k] = nn;
2731: c->nonzerorowcnt += (PetscInt)!!nn;
2732: c->rmax = PetscMax(c->rmax,nn);
2733: }
2734: MatMarkDiagonal_SeqAIJ(C);
2735: PetscMalloc1(c->nz,&c->a);
2736: Ccsr->num_entries = c->nz;
2738: C->nonzerostate++;
2739: PetscLayoutSetUp(C->rmap);
2740: PetscLayoutSetUp(C->cmap);
2741: Ccusp->nonzerostate = C->nonzerostate;
2742: C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2743: C->preallocated = PETSC_TRUE;
2744: C->assembled = PETSC_FALSE;
2745: C->was_assembled = PETSC_FALSE;
2746: if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2747: mmdata->reusesym = PETSC_TRUE;
2748: C->offloadmask = PETSC_OFFLOAD_GPU;
2749: }
2750: C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2751: return(0);
2752: }
2754: PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2756: /* handles sparse or dense B */
2757: static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2758: {
2759: Mat_Product *product = mat->product;
2761: PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2764: MatCheckProduct(mat,1);
2765: PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);
2766: if (!product->A->boundtocpu && !product->B->boundtocpu) {
2767: PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);
2768: }
2769: if (product->type == MATPRODUCT_ABC) {
2770: Ciscusp = PETSC_FALSE;
2771: if (!product->C->boundtocpu) {
2772: PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);
2773: }
2774: }
2775: if (Biscusp && Ciscusp) { /* the user can always select the CPU backend */
2776: PetscBool usecpu = PETSC_FALSE;
2777: switch (product->type) {
2778: case MATPRODUCT_AB:
2779: if (product->api_user) {
2780: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
2781: PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2782: PetscOptionsEnd();
2783: } else {
2784: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
2785: PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);
2786: PetscOptionsEnd();
2787: }
2788: break;
2789: case MATPRODUCT_AtB:
2790: if (product->api_user) {
2791: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
2792: PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2793: PetscOptionsEnd();
2794: } else {
2795: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
2796: PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);
2797: PetscOptionsEnd();
2798: }
2799: break;
2800: case MATPRODUCT_PtAP:
2801: if (product->api_user) {
2802: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
2803: PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2804: PetscOptionsEnd();
2805: } else {
2806: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
2807: PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);
2808: PetscOptionsEnd();
2809: }
2810: break;
2811: case MATPRODUCT_RARt:
2812: if (product->api_user) {
2813: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
2814: PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2815: PetscOptionsEnd();
2816: } else {
2817: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
2818: PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);
2819: PetscOptionsEnd();
2820: }
2821: break;
2822: case MATPRODUCT_ABC:
2823: if (product->api_user) {
2824: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
2825: PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2826: PetscOptionsEnd();
2827: } else {
2828: PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
2829: PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);
2830: PetscOptionsEnd();
2831: }
2832: break;
2833: default:
2834: break;
2835: }
2836: if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2837: }
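/* e.g. running with "-matmatmult_backend_cpu" (when the MatMatMult API was used) sets usecpu
   for MATPRODUCT_AB, which clears Biscusp/Ciscusp here and triggers the AIJ fallback below */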
2838: /* dispatch */
2839: if (isdense) {
2840: switch (product->type) {
2841: case MATPRODUCT_AB:
2842: case MATPRODUCT_AtB:
2843: case MATPRODUCT_ABt:
2844: case MATPRODUCT_PtAP:
2845: case MATPRODUCT_RARt:
2846: if (product->A->boundtocpu) {
2847: MatProductSetFromOptions_SeqAIJ_SeqDense(mat);
2848: } else {
2849: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2850: }
2851: break;
2852: case MATPRODUCT_ABC:
2853: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2854: break;
2855: default:
2856: break;
2857: }
2858: } else if (Biscusp && Ciscusp) {
2859: switch (product->type) {
2860: case MATPRODUCT_AB:
2861: case MATPRODUCT_AtB:
2862: case MATPRODUCT_ABt:
2863: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2864: break;
2865: case MATPRODUCT_PtAP:
2866: case MATPRODUCT_RARt:
2867: case MATPRODUCT_ABC:
2868: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2869: break;
2870: default:
2871: break;
2872: }
2873: } else { /* fallback for AIJ */
2874: MatProductSetFromOptions_SeqAIJ(mat);
2875: }
2876: return(0);
2877: }
2879: static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2880: {
2884: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
2885: return(0);
2886: }
2888: static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2889: {
2893: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
2894: return(0);
2895: }
2897: static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2898: {
2902: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
2903: return(0);
2904: }
2906: static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2907: {
2911: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
2912: return(0);
2913: }
2915: static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2916: {
2920: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
2921: return(0);
2922: }
2924: __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2925: {
2926: int i = blockIdx.x*blockDim.x + threadIdx.x;
2927: if (i < n) y[idx[i]] += x[i];
2928: }
2930: /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2931: static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2932: {
2933: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2934: Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2935: Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2936: PetscScalar *xarray,*zarray,*dptr,*beta,*xptr;
2937: PetscErrorCode ierr;
2938: cusparseStatus_t stat;
2939: cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2940: PetscBool compressed;
2941: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2942: PetscInt nx,ny;
2943: #endif
2946: if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose is not supported");
2947: if (!a->nonzerorowcnt) {
2948: if (!yy) {VecSet_SeqCUDA(zz,0);}
2949: else {VecCopy_SeqCUDA(yy,zz);}
2950: return(0);
2951: }
2952: /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2953: MatSeqAIJCUSPARSECopyToGPU(A);
2954: if (!trans) {
2955: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2956: if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2957: } else {
2958: if (herm || !A->form_explicit_transpose) {
2959: opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2960: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2961: } else {
2962: if (!cusparsestruct->matTranspose) {MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);}
2963: matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2964: }
2965: }
2966: /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2967: compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2969: try {
2970: VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);
2971: if (yy == zz) {VecCUDAGetArray(zz,&zarray);} /* read & write zz, so need to get uptodate zarray on GPU */
2972: else {VecCUDAGetArrayWrite(zz,&zarray);} /* write zz, so no need to init zarray on GPU */
2974: PetscLogGpuTimeBegin();
2975: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2976: /* z = A x + beta y.
2977: If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2978: When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2979: */
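/* Summary of the assignments below (sketch):
     compressed  yy==zz   dptr        beta
     yes         any      workVector  beta_zero
     no          yes      zarray      beta_one
     no          no       zarray      beta_zero */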
2980: xptr = xarray;
2981: dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2982: beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2983: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2984: /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2985: allocated to accommodate different uses. So we get the length info directly from mat.
2986: */
2987: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2988: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2989: nx = mat->num_cols;
2990: ny = mat->num_rows;
2991: }
2992: #endif
2993: } else {
2994: /* z = A^T x + beta y
2995: If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2996: Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2997: */
2998: xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2999: dptr = zarray;
3000: beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3001: if (compressed) { /* Scatter x to work vector */
3002: thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3003: thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3004: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3005: VecCUDAEqualsReverse());
3006: }
3007: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3008: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3009: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3010: nx = mat->num_rows;
3011: ny = mat->num_cols;
3012: }
3013: #endif
3014: }
3016: /* csr_spmv does y = alpha op(A) x + beta y */
3017: if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3018: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3019: if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3020: if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3021: cudaError_t cerr;
3022: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3023: stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
3024: stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3025: matstruct->matDescr,
3026: matstruct->cuSpMV[opA].vecXDescr, beta,
3027: matstruct->cuSpMV[opA].vecYDescr,
3028: cusparse_scalartype,
3029: cusparsestruct->spmvAlg,
3030: &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
3031: cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
3033: matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3034: } else {
3035: /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3036: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
3037: stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
3038: }
3040: stat = cusparseSpMV(cusparsestruct->handle, opA,
3041: matstruct->alpha_one,
3042: matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
3043: matstruct->cuSpMV[opA].vecXDescr,
3044: beta,
3045: matstruct->cuSpMV[opA].vecYDescr,
3046: cusparse_scalartype,
3047: cusparsestruct->spmvAlg,
3048: matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
3049: #else
3050: CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3051: stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
3052: mat->num_rows, mat->num_cols,
3053: mat->num_entries, matstruct->alpha_one, matstruct->descr,
3054: mat->values->data().get(), mat->row_offsets->data().get(),
3055: mat->column_indices->data().get(), xptr, beta,
3056: dptr);CHKERRCUSPARSE(stat);
3057: #endif
3058: } else {
3059: if (cusparsestruct->nrows) {
3060: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3061: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3062: #else
3063: cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3064: stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
3065: matstruct->alpha_one, matstruct->descr, hybMat,
3066: xptr, beta,
3067: dptr);CHKERRCUSPARSE(stat);
3068: #endif
3069: }
3070: }
3071: PetscLogGpuTimeEnd();
3073: if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3074: if (yy) { /* MatMultAdd: zz = A*xx + yy */
3075: if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3076: VecCopy_SeqCUDA(yy,zz); /* zz = yy */
3077: } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3078: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3079: }
3080: } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3081: VecSet_SeqCUDA(zz,0);
3082: }
3084: /* ScatterAdd the result from work vector into the full vector when A is compressed */
3085: if (compressed) {
3086: PetscLogGpuTimeBegin();
3087: /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3088:    and its destructor, at the end of the scope, calls cudaStreamSynchronize() on this stream. One would have to store all events to
3089:    prevent that. So I just added a ScatterAdd kernel.
3090: */
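/* A hedged sketch of the event-storing approach mentioned above (hypothetical, not used here):
     auto e = thrust::async::for_each(thrust::cuda::par.on(stream), first, last, f);
     pending.emplace_back(std::move(e)); // "pending" is a hypothetical container; keeping the
                                         // event alive defers the synchronizing destructor */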
3091: #if 0
3092: thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3093: thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3094: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3095: thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3096: VecCUDAPlusEquals());
3097: #else
3098: PetscInt n = matstruct->cprowIndices->size();
3099: ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3100: #endif
3101: PetscLogGpuTimeEnd();
3102: }
3103: } else {
3104: if (yy && yy != zz) {
3105: VecAXPY_SeqCUDA(zz,1.0,yy); /* zz += yy */
3106: }
3107: }
3108: VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);
3109: if (yy == zz) {VecCUDARestoreArray(zz,&zarray);}
3110: else {VecCUDARestoreArrayWrite(zz,&zarray);}
3111: } catch(char *ex) {
3112: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3113: }
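/* Each nonzero contributes a multiply and an add; without y, the first product in every
   nonzero row needs no add, hence the a->nonzerorowcnt correction below */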
3114: if (yy) {
3115: PetscLogGpuFlops(2.0*a->nz);
3116: } else {
3117: PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);
3118: }
3119: return(0);
3120: }
3122: static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3123: {
3127: MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);
3128: return(0);
3129: }
3131: static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3132: {
3133: PetscErrorCode ierr;
3134: PetscObjectState onnz = A->nonzerostate;
3135: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3138: MatAssemblyEnd_SeqAIJ(A,mode);
3139: if (onnz != A->nonzerostate && cusp->deviceMat) {
3140: cudaError_t cerr;
3142: PetscInfo(A,"Destroy device mat since nonzerostate changed\n");
3143: cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
3144: cusp->deviceMat = NULL;
3145: }
3146: return(0);
3147: }
3149: /* --------------------------------------------------------------------------------*/
3150: /*@
3151: MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3152: (the default parallel PETSc format). This matrix will ultimately be pushed down
3153: to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3154: assembly performance the user should preallocate the matrix storage by setting
3155: the parameter nz (or the array nnz). By setting these parameters accurately,
3156: performance during matrix assembly can be increased by more than a factor of 50.
3158: Collective
3160: Input Parameters:
3161: + comm - MPI communicator, set to PETSC_COMM_SELF
3162: . m - number of rows
3163: . n - number of columns
3164: . nz - number of nonzeros per row (same for all rows)
3165: - nnz - array containing the number of nonzeros in the various rows
3166: (possibly different for each row) or NULL
3168: Output Parameter:
3169: . A - the matrix
3171: It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3172: MatXXXXSetPreallocation() paradigm instead of this routine directly.
3173: [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3175: Notes:
3176: If nnz is given then nz is ignored
3178: The AIJ format (also called the Yale sparse matrix format or
3179: compressed row storage) is fully compatible with standard Fortran 77
3180: storage. That is, the stored row and column indices can begin at
3181: either one (as in Fortran) or zero. See the users' manual for details.
3183: Specify the preallocated storage with either nz or nnz (not both).
3184: Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3185: allocation. For large problems you MUST preallocate memory or you
3186: will get TERRIBLE performance, see the users' manual chapter on matrices.
3188: By default, this format uses inodes (identical nodes) when possible, to
3189: improve numerical efficiency of matrix-vector products and solves. We
3190: search for consecutive rows with the same nonzero structure, thereby
3191: reusing matrix information to achieve increased efficiency.
3193: Level: intermediate
3195: .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3196: @*/
3197: PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3198: {
3202: MatCreate(comm,A);
3203: MatSetSizes(*A,m,n,m,n);
3204: MatSetType(*A,MATSEQAIJCUSPARSE);
3205: MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
3206: return(0);
3207: }
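// A minimal usage sketch of the recommended creation paradigm (sizes are illustrative):
//   Mat A;
//   MatCreate(PETSC_COMM_SELF,&A);
//   MatSetSizes(A,10,10,10,10);
//   MatSetType(A,MATSEQAIJCUSPARSE);
//   MatSeqAIJSetPreallocation(A,5,NULL); // at most 5 nonzeros per row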
3209: static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3210: {
3214: if (A->factortype == MAT_FACTOR_NONE) {
3215: MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);
3216: } else {
3217: MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);
3218: }
3219: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3220: PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);
3221: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3222: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3223: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3224: PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);
3225: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3226: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3227: PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);
3228: MatDestroy_SeqAIJ(A);
3229: return(0);
3230: }
3232: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3233: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
3234: static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
3235: {
3239: MatDuplicate_SeqAIJ(A,cpvalues,B);
3240: MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
3241: return(0);
3242: }
3244: static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
3245: {
3246: PetscErrorCode ierr;
3247: Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3248: Mat_SeqAIJCUSPARSE *cy;
3249: Mat_SeqAIJCUSPARSE *cx;
3250: PetscScalar *ay;
3251: const PetscScalar *ax;
3252: CsrMatrix *csry,*csrx;
3255: cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3256: cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3257: if (X->ops->axpy != Y->ops->axpy) {
3258: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3259: MatAXPY_SeqAIJ(Y,a,X,str);
3260: return(0);
3261: }
3262: /* if we are here, it means both matrices are bound to GPU */
3263: MatSeqAIJCUSPARSECopyToGPU(Y);
3264: MatSeqAIJCUSPARSECopyToGPU(X);
3265: if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3266: if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3267: csry = (CsrMatrix*)cy->mat->mat;
3268: csrx = (CsrMatrix*)cx->mat->mat;
3269: /* see if we can turn this into a cublas axpy */
3270: if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3271: bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3272: if (eq) {
3273: eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3274: }
3275: if (eq) str = SAME_NONZERO_PATTERN;
3276: }
3277: /* spgeam is buggy with one column */
3278: if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3280: if (str == SUBSET_NONZERO_PATTERN) {
3281: cusparseStatus_t stat;
3282: PetscScalar b = 1.0;
3283: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3284: size_t bufferSize;
3285: void *buffer;
3286: cudaError_t cerr;
3287: #endif
3289: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3290: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3291: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3292: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3293: stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3294: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3295: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3296: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3297: cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3298: PetscLogGpuTimeBegin();
3299: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3300: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3301: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3302: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3303: PetscLogGpuFlops(x->nz + y->nz);
3304: PetscLogGpuTimeEnd();
3305: cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3306: #else
3307: PetscLogGpuTimeBegin();
3308: stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3309: &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3310: &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3311: cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3312: PetscLogGpuFlops(x->nz + y->nz);
3313: PetscLogGpuTimeEnd();
3314: #endif
3315: stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3316: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3317: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3318: MatSeqAIJInvalidateDiagonal(Y);
3319: } else if (str == SAME_NONZERO_PATTERN) {
3320: cublasHandle_t cublasv2handle;
3321: cublasStatus_t berr;
3322: PetscBLASInt one = 1, bnz = 1;
3324: MatSeqAIJCUSPARSEGetArrayRead(X,&ax);
3325: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3326: PetscCUBLASGetHandle(&cublasv2handle);
3327: PetscBLASIntCast(x->nz,&bnz);
3328: PetscLogGpuTimeBegin();
3329: berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3330: PetscLogGpuFlops(2.0*bnz);
3331: PetscLogGpuTimeEnd();
3332: MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);
3333: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3334: MatSeqAIJInvalidateDiagonal(Y);
3335: } else {
3336: MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);
3337: MatAXPY_SeqAIJ(Y,a,X,str);
3338: }
3339: return(0);
3340: }
3342: static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
3343: {
3345: Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data;
3346: PetscScalar *ay;
3347: cublasHandle_t cublasv2handle;
3348: cublasStatus_t berr;
3349: PetscBLASInt one = 1, bnz = 1;
3352: MatSeqAIJCUSPARSEGetArray(Y,&ay);
3353: PetscCUBLASGetHandle(&cublasv2handle);
3354: PetscBLASIntCast(y->nz,&bnz);
3355: PetscLogGpuTimeBegin();
3356: berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
3357: PetscLogGpuFlops(bnz);
3358: PetscLogGpuTimeEnd();
3359: MatSeqAIJCUSPARSERestoreArray(Y,&ay);
3360: MatSeqAIJInvalidateDiagonal(Y);
3361: return(0);
3362: }
3364: static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3365: {
3367: PetscBool both = PETSC_FALSE;
3368: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3371: if (A->factortype == MAT_FACTOR_NONE) {
3372: Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
3373: if (spptr->mat) {
3374: CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
3375: if (matrix->values) {
3376: both = PETSC_TRUE;
3377: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3378: }
3379: }
3380: if (spptr->matTranspose) {
3381: CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
3382: if (matrix->values) {
3383: thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3384: }
3385: }
3386: }
3387: //MatZeroEntries_SeqAIJ(A);
3388: PetscArrayzero(a->a,a->i[A->rmap->n]);
3389: MatSeqAIJInvalidateDiagonal(A);
3390: if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3391: else A->offloadmask = PETSC_OFFLOAD_CPU;
3392: return(0);
3393: }
3395: static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3396: {
3397: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3401: if (A->factortype != MAT_FACTOR_NONE) return(0);
3402: if (flg) {
3403: MatSeqAIJCUSPARSECopyFromGPU(A);
3405: A->ops->scale = MatScale_SeqAIJ;
3406: A->ops->axpy = MatAXPY_SeqAIJ;
3407: A->ops->zeroentries = MatZeroEntries_SeqAIJ;
3408: A->ops->mult = MatMult_SeqAIJ;
3409: A->ops->multadd = MatMultAdd_SeqAIJ;
3410: A->ops->multtranspose = MatMultTranspose_SeqAIJ;
3411: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
3412: A->ops->multhermitiantranspose = NULL;
3413: A->ops->multhermitiantransposeadd = NULL;
3414: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ;
3415: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);
3416: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);
3417: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);
3418: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);
3419: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);
3420: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);
3421: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);
3422: } else {
3423: A->ops->scale = MatScale_SeqAIJCUSPARSE;
3424: A->ops->axpy = MatAXPY_SeqAIJCUSPARSE;
3425: A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE;
3426: A->ops->mult = MatMult_SeqAIJCUSPARSE;
3427: A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE;
3428: A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE;
3429: A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
3430: A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3431: A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3432: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE;
3433: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);
3434: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3435: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3436: PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);
3437: PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);
3438: PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);
3439: PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);
3440: }
3441: A->boundtocpu = flg;
3442: a->inode.use = flg;
3443: return(0);
3444: }
3446: PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3447: {
3448: PetscErrorCode ierr;
3449: cusparseStatus_t stat;
3450: Mat B;
3453: PetscCUDAInitializeCheck(); /* first use of CUSPARSE may be via MatConvert */
3454: if (reuse == MAT_INITIAL_MATRIX) {
3455: MatDuplicate(A,MAT_COPY_VALUES,newmat);
3456: } else if (reuse == MAT_REUSE_MATRIX) {
3457: MatCopy(A,*newmat,SAME_NONZERO_PATTERN);
3458: }
3459: B = *newmat;
3461: PetscFree(B->defaultvectype);
3462: PetscStrallocpy(VECCUDA,&B->defaultvectype);
3464: if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3465: if (B->factortype == MAT_FACTOR_NONE) {
3466: Mat_SeqAIJCUSPARSE *spptr;
3467: PetscNew(&spptr);
3468: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3469: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3470: spptr->format = MAT_CUSPARSE_CSR;
3471: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3472: #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3473: spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3474: #else
3475: spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3476: #endif
3477: spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3478: spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3479: #endif
3480: B->spptr = spptr;
3481: } else {
3482: Mat_SeqAIJCUSPARSETriFactors *spptr;
3484: PetscNew(&spptr);
3485: stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3486: stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3487: B->spptr = spptr;
3488: }
3489: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3490: }
3491: B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE;
3492: B->ops->destroy = MatDestroy_SeqAIJCUSPARSE;
3493: B->ops->setoption = MatSetOption_SeqAIJCUSPARSE;
3494: B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3495: B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE;
3496: B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE;
3498: MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);
3499: PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);
3500: PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);
3501: #if defined(PETSC_HAVE_HYPRE)
3502: PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);
3503: #endif
3504: return(0);
3505: }
3507: PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3508: {
3512: MatCreate_SeqAIJ(B);
3513: MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
3514: return(0);
3515: }
3517: /*MC
3518: MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3520: A matrix type whose data resides on Nvidia GPUs. These matrices can be in
3521: CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3522: All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3524: Options Database Keys:
3525: + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3526: . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3527: - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3529: Level: beginner
3531: .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3532: M*/
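// Example (illustrative): running with "-mat_type aijcusparse -mat_cusparse_mult_storage_format csr"
// selects this type and the CSR storage used for MatMult from the options database.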
3534: PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3536: PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3537: {
3541: MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);
3542: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);
3543: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);
3544: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);
3545: MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);
3547: return(0);
3548: }
3550: static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3551: {
3552: PetscErrorCode ierr;
3553: cusparseStatus_t stat;
3556: if (*cusparsestruct) {
3557: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
3558: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
3559: delete (*cusparsestruct)->workVector;
3560: delete (*cusparsestruct)->rowoffsets_gpu;
3561: delete (*cusparsestruct)->cooPerm;
3562: delete (*cusparsestruct)->cooPerm_a;
3563: delete (*cusparsestruct)->csr2csc_i;
3564: if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3565: PetscFree(*cusparsestruct);
3566: }
3567: return(0);
3568: }
3570: static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3571: {
3573: if (*mat) {
3574: delete (*mat)->values;
3575: delete (*mat)->column_indices;
3576: delete (*mat)->row_offsets;
3577: delete *mat;
3578: *mat = 0;
3579: }
3580: return(0);
3581: }
3583: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3584: {
3585: cusparseStatus_t stat;
3586: PetscErrorCode ierr;
3589: if (*trifactor) {
3590: if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3591: if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
3592: CsrMatrix_Destroy(&(*trifactor)->csrMat);
3593: if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
3594: if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3595: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3596: if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3597: #endif
3598: PetscFree(*trifactor);
3599: }
3600: return(0);
3601: }
3603: static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
3604: {
3605: CsrMatrix *mat;
3606: cusparseStatus_t stat;
3607: cudaError_t err;
3610: if (*matstruct) {
3611: if ((*matstruct)->mat) {
3612: if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3613: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3614: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3615: #else
3616: cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
3617: stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3618: #endif
3619: } else {
3620: mat = (CsrMatrix*)(*matstruct)->mat;
3621: CsrMatrix_Destroy(&mat);
3622: }
3623: }
3624: if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
3625: delete (*matstruct)->cprowIndices;
3626: if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
3627: if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
3628: if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3630: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3631: Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3632: if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3633: for (int i=0; i<3; i++) {
3634: if (mdata->cuSpMV[i].initialized) {
3635: err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3636: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3637: stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3638: }
3639: }
3640: #endif
3641: delete *matstruct;
3642: *matstruct = NULL;
3643: }
3644: return(0);
3645: }
3647: PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
3648: {
3652: if (*trifactors) {
3653: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
3654: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
3655: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
3656: MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
3657: delete (*trifactors)->rpermIndices;
3658: delete (*trifactors)->cpermIndices;
3659: delete (*trifactors)->workVector;
3660: (*trifactors)->rpermIndices = NULL;
3661: (*trifactors)->cpermIndices = NULL;
3662: (*trifactors)->workVector = NULL;
3663: if ((*trifactors)->a_band_d) {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3664: if ((*trifactors)->i_band_d) {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3665: (*trifactors)->init_dev_prop = PETSC_FALSE;
3666: }
3667: return(0);
3668: }
3670: static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3671: {
3672: PetscErrorCode ierr;
3673: cusparseHandle_t handle;
3674: cusparseStatus_t stat;
3677: if (*trifactors) {
3678: MatSeqAIJCUSPARSETriFactors_Reset(trifactors);
3679: if ((handle = (*trifactors)->handle)) {
3680: stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
3681: }
3682: PetscFree(*trifactors);
3683: }
3684: return(0);
3685: }
3687: struct IJCompare
3688: {
3689: __host__ __device__
3690: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3691: {
3692: if (t1.get<0>() < t2.get<0>()) return true;
3693: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3694: return false;
3695: }
3696: };
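/* Lexicographic (row, column) order, e.g. (1,5) < (2,0) and (2,3) < (2,7):
   rows compare first, columns break ties */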
3698: struct IJEqual
3699: {
3700: __host__ __device__
3701: inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3702: {
3703: if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
3704: return true;
3705: }
3706: };
3708: struct IJDiff
3709: {
3710: __host__ __device__
3711: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3712: {
3713: return t1 == t2 ? 0 : 1;
3714: }
3715: };
3717: struct IJSum
3718: {
3719: __host__ __device__
3720: inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
3721: {
3722: return t1||t2;
3723: }
3724: };
3726: #include <thrust/iterator/discard_iterator.h>
3727: PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3728: {
3729: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3730: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3731: THRUSTARRAY *cooPerm_v = NULL;
3732: thrust::device_ptr<const PetscScalar> d_v;
3733: CsrMatrix *matrix;
3734: PetscErrorCode ierr;
3735: PetscInt n;
3738: if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3739: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3740: if (!cusp->cooPerm) {
3741: MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
3742: MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
3743: return(0);
3744: }
3745: matrix = (CsrMatrix*)cusp->mat->mat;
3746: if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3747: if (!v) {
3748: if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3749: goto finalize;
3750: }
3751: n = cusp->cooPerm->size();
3752: if (isCudaMem(v)) {
3753: d_v = thrust::device_pointer_cast(v);
3754: } else {
3755: cooPerm_v = new THRUSTARRAY(n);
3756: cooPerm_v->assign(v,v+n);
3757: d_v = cooPerm_v->data();
3758: PetscLogCpuToGpu(n*sizeof(PetscScalar));
3759: }
3760: PetscLogGpuTimeBegin();
3761: if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3762: if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them */
3763: THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3764: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3765: /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3766: cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
3767: cooPerm_a is ordered. The i-th value of the sorted sequence, d_v[cooPerm[i]], belongs to the cooPerm_a[i]-th unique nonzero.
3768: */
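/* e.g. with permuted values [a,b,c,d,e,f] and cooPerm_a = [0,0,1,2,3,4],
   reduce_by_key emits [a+b,c,d,e,f], one sum per unique nonzero */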
3769: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3770: thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3771: delete cooPerm_w;
3772: } else {
3773: /* all nonzeros in d_v[] are unique entries */
3774: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3775: matrix->values->begin()));
3776: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3777: matrix->values->end()));
3778: thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
3779: }
3780: } else {
3781: if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3782: auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3783: thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3784: } else {
3785: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3786: matrix->values->begin()));
3787: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3788: matrix->values->end()));
3789: thrust::for_each(zibit,zieit,VecCUDAEquals());
3790: }
3791: }
3792: PetscLogGpuTimeEnd();
3793: finalize:
3794: delete cooPerm_v;
3795: A->offloadmask = PETSC_OFFLOAD_GPU;
3796: PetscObjectStateIncrease((PetscObject)A);
3797: /* shorter version of MatAssemblyEnd_SeqAIJ */
3798: PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);
3799: PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");
3800: PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);
3801: a->reallocs = 0;
3802: A->info.mallocs += 0;
3803: A->info.nz_unneeded = 0;
3804: A->assembled = A->was_assembled = PETSC_TRUE;
3805: A->num_ass++;
3806: return(0);
3807: }
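/* A minimal usage sketch (illustrative only, not part of this file) of driving the COO
   assembly pair from user code; the indices reuse the worked example in the preallocation
   routine below. With INSERT_VALUES, values for repeated (i,j) pairs are reduced (summed)
   on the device, as the branch above shows.

     Mat         A;
     PetscInt    coo_i[] = {3,3,1,4,1,4};        // row indices; (1,2) appears twice
     PetscInt    coo_j[] = {3,2,2,5,2,6};        // column indices
     PetscScalar v[]     = {1.,2.,3.,4.,5.,6.};  // one value per (i,j) pair
     MatCreate(PETSC_COMM_SELF,&A);
     MatSetSizes(A,5,7,5,7);                     // must cover the largest indices used
     MatSetType(A,MATSEQAIJCUSPARSE);
     MatSetPreallocationCOO(A,6,coo_i,coo_j);    // builds the CSR pattern and cooPerm
     MatSetValuesCOO(A,v,INSERT_VALUES);         // duplicates such as (1,2) are summed
*/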
3809: PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3810: {
3811: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3812: PetscErrorCode ierr;
3816: if (!cusp) return(0);
3817: if (destroy) {
3818: MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);
3819: delete cusp->csr2csc_i;
3820: cusp->csr2csc_i = NULL;
3821: }
3822: A->transupdated = PETSC_FALSE;
3823: return(0);
3824: }
3826: #include <thrust/binary_search.h>
3827: PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
3828: {
3829: PetscErrorCode ierr;
3830: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3831: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3832: PetscInt cooPerm_n, nzr = 0;
3833: cudaError_t cerr;
3836: PetscLayoutSetUp(A->rmap);
3837: PetscLayoutSetUp(A->cmap);
3838: cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
3839: if (n != cooPerm_n) {
3840: delete cusp->cooPerm;
3841: delete cusp->cooPerm_a;
3842: cusp->cooPerm = NULL;
3843: cusp->cooPerm_a = NULL;
3844: }
3845: if (n) {
3846: THRUSTINTARRAY d_i(n);
3847: THRUSTINTARRAY d_j(n);
3848: THRUSTINTARRAY ii(A->rmap->n);
3850: if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
3851: if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
3853: PetscLogCpuToGpu(2.*n*sizeof(PetscInt));
3854: d_i.assign(coo_i,coo_i+n);
3855: d_j.assign(coo_j,coo_j+n);
3857: /* Ex.
3858: n = 6
3859: coo_i = [3,3,1,4,1,4]
3860: coo_j = [3,2,2,5,2,6]
3861: */
3862: auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
3863: auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
3865: PetscLogGpuTimeBegin();
3866: thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
3867: thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
3868: *cusp->cooPerm_a = d_i; /* copy the sorted array */
3869: THRUSTINTARRAY w = d_j;
3871: /*
3872: d_i = [1,1,3,3,4,4]
3873: d_j = [2,2,2,3,5,6]
3874: cooPerm = [2,4,1,0,3,5]
3875: */
3876: auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
3878: /*
3879: d_i = [1,3,3,4,4,x]
3880: d_j = [2,2,3,5,6,x]
3881: nekey points one past the last unique (i,j) pair (the x slots),
3882: while ekey still marks the end of all n original entries
3883: */
3884: if (nekey == ekey) { /* all entries are unique */
3885: delete cusp->cooPerm_a;
3886: cusp->cooPerm_a = NULL;
3887: } else { /* Stefano: I couldn't come up with a more elegant algorithm */
3888: /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
3889: adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
3890: adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
3891: (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
3892: w[0] = 0;
3893: thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
3894: thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
3895: }
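/* cooPerm_a[k] now gives, for the k-th entry of the sorted COO list, the index of the
   unique nonzero it contributes to; MatSetValuesCOO_SeqAIJCUSPARSE() above uses it as
   the key array for reduce_by_key */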
3896: thrust::counting_iterator<PetscInt> search_begin(0);
3897: thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search each row index in [0,6) = {0,1,2,3,4,5} in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6 */
3898: search_begin, search_begin + A->rmap->n, /* return in ii[] the index of the last position in d_i[] where each value could be inserted without violating the ordering */
3899: ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be prepended later */
3900: PetscLogGpuTimeEnd();
3902: MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);
3903: a->singlemalloc = PETSC_FALSE;
3904: a->free_a = PETSC_TRUE;
3905: a->free_ij = PETSC_TRUE;
3906: PetscMalloc1(A->rmap->n+1,&a->i);
3907: a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
3908: cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3909: a->nz = a->maxnz = a->i[A->rmap->n];
3910: a->rmax = 0;
3911: PetscMalloc1(a->nz,&a->a);
3912: PetscMalloc1(a->nz,&a->j);
3913: cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
3914: if (!a->ilen) { PetscMalloc1(A->rmap->n,&a->ilen); }
3915: if (!a->imax) { PetscMalloc1(A->rmap->n,&a->imax); }
3916: for (PetscInt i = 0; i < A->rmap->n; i++) {
3917: const PetscInt nnzr = a->i[i+1] - a->i[i];
3918: nzr += (PetscInt)!!(nnzr);
3919: a->ilen[i] = a->imax[i] = nnzr;
3920: a->rmax = PetscMax(a->rmax,nnzr);
3921: }
3922: a->nonzerorowcnt = nzr;
3923: A->preallocated = PETSC_TRUE;
3924: PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));
3925: MatMarkDiagonal_SeqAIJ(A);
3926: } else {
3927: MatSeqAIJSetPreallocation(A,0,NULL);
3928: }
3929: MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);
3931: /* We want to allocate the CUSPARSE struct for matvec now.
3932: The code is so convoluted now that I prefer to copy zeros */
3933: PetscArrayzero(a->a,a->nz);
3934: MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);
3935: A->offloadmask = PETSC_OFFLOAD_CPU;
3936: A->nonzerostate++;
3937: MatSeqAIJCUSPARSECopyToGPU(A);
3938: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);
3940: A->assembled = PETSC_FALSE;
3941: A->was_assembled = PETSC_FALSE;
3942: return(0);
3943: }
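/* A self-contained sketch (illustrative only) of the duplicate-detection trick above,
   written with stock thrust functors instead of IJDiff/IJSum: adjacent differences flag
   every change in i or j of the sorted keys, and an inclusive scan turns the flags into
   the index of the unique nonzero each entry belongs to.

     #include <thrust/device_vector.h>
     #include <thrust/adjacent_difference.h>
     #include <thrust/transform.h>
     #include <thrust/scan.h>
     #include <thrust/functional.h>

     int main()
     {
       int i_[] = {1,1,3,3,4,4}, j_[] = {2,2,2,3,5,6};   // sorted COO keys
       thrust::device_vector<int> i(i_,i_+6), j(j_,j_+6);
       thrust::adjacent_difference(i.begin(),i.end(),i.begin(),thrust::not_equal_to<int>()); // [1,0,1,0,1,0]
       thrust::adjacent_difference(j.begin(),j.end(),j.begin(),thrust::not_equal_to<int>()); // [2,0,0,1,1,1]
       i[0] = j[0] = 0;                                  // the first entry starts unique nonzero 0
       thrust::transform(i.begin(),i.end(),j.begin(),i.begin(),thrust::logical_or<int>());   // [0,0,1,1,1,1]
       thrust::inclusive_scan(i.begin(),i.end(),i.begin());                                  // [0,0,1,2,3,4]
       return 0;
     }
*/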
3945: /*@C
3946: MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3948: Not collective
3950: Input Parameters:
3951: + A - the matrix
3952: - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
3954: Output Parameters:
3955: + i - the CSR row pointers
3956: - j - the CSR column indices
3958: Level: developer
3960: Notes:
3961: When compressed is true, the CSR structure does not contain empty rows
3963: .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3964: @*/
3965: PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
3966: {
3967: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3968: CsrMatrix *csr;
3969: PetscErrorCode ierr;
3970: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3974: if (!i || !j) return(0);
3976: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3977: MatSeqAIJCUSPARSECopyToGPU(A);
3978: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3979: csr = (CsrMatrix*)cusp->mat->mat;
3980: if (i) {
3981: if (!compressed && a->compressedrow.use) { /* need full row offset */
3982: if (!cusp->rowoffsets_gpu) {
3983: cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
3984: cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3985: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
3986: }
3987: *i = cusp->rowoffsets_gpu->data().get();
3988: } else *i = csr->row_offsets->data().get();
3989: }
3990: if (j) *j = csr->column_indices->data().get();
3991: return(0);
3992: }
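/* A minimal sketch (hypothetical kernel and array names) of consuming the device CSR
   indices returned above, e.g. counting nonzeros per row in a custom CUDA kernel; d_nnz
   is assumed to be a device array of length m = A->rmap->n.

     __global__ void CountRowNnz(int m, const int *i, int *nnz)
     {
       int r = blockIdx.x*blockDim.x + threadIdx.x;
       if (r < m) nnz[r] = i[r+1] - i[r];   // row pointers already live on the device
     }

     const int *i,*j;
     MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&i,&j);   // PETSC_FALSE: full (uncompressed) row offsets
     CountRowNnz<<<(m+255)/256,256>>>(m,i,d_nnz);
     MatSeqAIJCUSPARSERestoreIJ(A,PETSC_FALSE,&i,&j);
*/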
3994: /*@C
3995: MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
3997: Not collective
3999: Input Parameters:
4000: + A - the matrix
4001: - compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4003: Output Parameters:
4004: + i - the CSR row pointers
4005: - j - the CSR column indices
4007: Level: developer
4009: .seealso: MatSeqAIJCUSPARSEGetIJ()
4010: @*/
4011: PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4012: {
4016: if (i) *i = NULL;
4017: if (j) *j = NULL;
4018: return(0);
4019: }
4021: /*@C
4022: MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4024: Not Collective
4026: Input Parameter:
4027: . A - a MATSEQAIJCUSPARSE matrix
4029: Output Parameter:
4030: . a - pointer to the device data
4032: Level: developer
4034: Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host
4036: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4037: @*/
4038: PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4039: {
4040: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4041: CsrMatrix *csr;
4042: PetscErrorCode ierr;
4048: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4049: MatSeqAIJCUSPARSECopyToGPU(A);
4050: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4051: csr = (CsrMatrix*)cusp->mat->mat;
4052: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4053: *a = csr->values->data().get();
4054: return(0);
4055: }
4057: /*@C
4058: MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4060: Not Collective
4062: Input Parameter:
4063: . A - a MATSEQAIJCUSPARSE matrix
4065: Output Parameter:
4066: . a - pointer to the device data
4068: Level: developer
4070: .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4071: @*/
4072: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4073: {
4078: *a = NULL;
4079: return(0);
4080: }
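/* A usage sketch (variable names are assumptions): the read-only pointer pairs naturally
   with thrust algorithms, e.g. summing all stored values on the device, where nnz is the
   number of stored nonzeros (((Mat_SeqAIJ*)A->data)->nz).

     const PetscScalar *av;
     MatSeqAIJCUSPARSEGetArrayRead(A,&av);
     thrust::device_ptr<const PetscScalar> d = thrust::device_pointer_cast(av);
     PetscScalar sum = thrust::reduce(d,d+nnz,(PetscScalar)0.,thrust::plus<PetscScalar>());
     MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
*/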
4082: /*@C
4083: MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4085: Not Collective
4087: Input Parameter:
4088: . A - a MATSEQAIJCUSPARSE matrix
4090: Output Parameter:
4091: . a - pointer to the device data
4093: Level: developer
4095: Notes: may trigger a host-to-device copy if the up-to-date matrix data is on the host
4097: .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4098: @*/
4099: PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4100: {
4101: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4102: CsrMatrix *csr;
4103: PetscErrorCode ierr;
4109: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4110: MatSeqAIJCUSPARSECopyToGPU(A);
4111: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4112: csr = (CsrMatrix*)cusp->mat->mat;
4113: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4114: *a = csr->values->data().get();
4115: A->offloadmask = PETSC_OFFLOAD_GPU;
4116: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4117: return(0);
4118: }
4119: /*@C
4120: MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4122: Not Collective
4124: Input Parameter:
4125: . A - a MATSEQAIJCUSPARSE matrix
4127: Output Parameter:
4128: . a - pointer to the device data
4130: Level: developer
4132: .seealso: MatSeqAIJCUSPARSEGetArray()
4133: @*/
4134: PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4135: {
4142: PetscObjectStateIncrease((PetscObject)A);
4143: *a = NULL;
4144: return(0);
4145: }
4147: /*@C
4148: MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4150: Not Collective
4152: Input Parameter:
4153: . A - a MATSEQAIJCUSPARSE matrix
4155: Output Parameter:
4156: . a - pointer to the device data
4158: Level: developer
4160: Notes: does not trigger host-to-device copies; it flags the device data as valid, so the caller is expected to overwrite all stored values
4162: .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4163: @*/
4164: PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4165: {
4166: Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4167: CsrMatrix *csr;
4168: PetscErrorCode ierr;
4174: if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4175: if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4176: csr = (CsrMatrix*)cusp->mat->mat;
4177: if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4178: *a = csr->values->data().get();
4179: A->offloadmask = PETSC_OFFLOAD_GPU;
4180: MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);
4181: return(0);
4182: }
4184: /*@C
4185: MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4187: Not Collective
4189: Input Parameter:
4190: . A - a MATSEQAIJCUSPARSE matrix
4192: Output Parameter:
4193: . a - pointer to the device data
4195: Level: developer
4197: .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4198: @*/
4199: PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4200: {
4207: PetscObjectStateIncrease((PetscObject)A);
4208: *a = NULL;
4209: return(0);
4210: }
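/* A sketch contrasting the write accessor (illustrative only, nnz as above): no
   host-to-device copy is performed, so the caller overwrites every stored value.

     PetscScalar *av;
     MatSeqAIJCUSPARSEGetArrayWrite(A,&av);
     thrust::device_ptr<PetscScalar> d = thrust::device_pointer_cast(av);
     thrust::fill(d,d+nnz,(PetscScalar)1.);   // overwrite all entries on the device
     MatSeqAIJCUSPARSERestoreArrayWrite(A,&av);
*/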
4212: struct IJCompare4
4213: {
4214: __host__ __device__
4215: inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4216: {
4217: if (t1.get<0>() < t2.get<0>()) return true;
4218: if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4219: return false;
4220: }
4221: };
4223: struct Shift
4224: {
4225: int _shift;
4227: Shift(int shift) : _shift(shift) {}
4228: __host__ __device__
4229: inline int operator() (const int &c)
4230: {
4231: return c + _shift;
4232: }
4233: };
4235: /* merges two SeqAIJCUSPARSE matrices side by side, C = [A,B]: each row of C is the corresponding row of A followed by that of B ([A';B']' in MATLAB notation) */
4236: PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4237: {
4238: PetscErrorCode ierr;
4239: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4240: Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4241: Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4242: CsrMatrix *Acsr,*Bcsr,*Ccsr;
4243: PetscInt Annz,Bnnz;
4244: cusparseStatus_t stat;
4245: PetscInt i,m,n,zero = 0;
4246: cudaError_t cerr;
4254: if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
4255: if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4256: if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4257: if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4258: if (reuse == MAT_INITIAL_MATRIX) {
4259: m = A->rmap->n;
4260: n = A->cmap->n + B->cmap->n;
4261: MatCreate(PETSC_COMM_SELF,C);
4262: MatSetSizes(*C,m,n,m,n);
4263: MatSetType(*C,MATSEQAIJCUSPARSE);
4264: c = (Mat_SeqAIJ*)(*C)->data;
4265: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4266: Cmat = new Mat_SeqAIJCUSPARSEMultStruct;
4267: Ccsr = new CsrMatrix;
4268: Cmat->cprowIndices = NULL;
4269: c->compressedrow.use = PETSC_FALSE;
4270: c->compressedrow.nrows = 0;
4271: c->compressedrow.i = NULL;
4272: c->compressedrow.rindex = NULL;
4273: Ccusp->workVector = NULL;
4274: Ccusp->nrows = m;
4275: Ccusp->mat = Cmat;
4276: Ccusp->mat->mat = Ccsr;
4277: Ccsr->num_rows = m;
4278: Ccsr->num_cols = n;
4279: stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4280: stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4281: stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4282: cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4283: cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4284: cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4285: cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4286: cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4287: cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4288: MatSeqAIJCUSPARSECopyToGPU(A);
4289: MatSeqAIJCUSPARSECopyToGPU(B);
4290: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);
4291: MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);
4292: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4293: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4295: Acsr = (CsrMatrix*)Acusp->mat->mat;
4296: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4297: Annz = (PetscInt)Acsr->column_indices->size();
4298: Bnnz = (PetscInt)Bcsr->column_indices->size();
4299: c->nz = Annz + Bnnz;
4300: Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4301: Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4302: Ccsr->values = new THRUSTARRAY(c->nz);
4303: Ccsr->num_entries = c->nz;
4304: Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4305: if (c->nz) {
4306: auto Acoo = new THRUSTINTARRAY32(Annz);
4307: auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4308: auto Ccoo = new THRUSTINTARRAY32(c->nz);
4309: THRUSTINTARRAY32 *Aroff,*Broff;
4311: if (a->compressedrow.use) { /* need full row offset */
4312: if (!Acusp->rowoffsets_gpu) {
4313: Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4314: Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4315: PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));
4316: }
4317: Aroff = Acusp->rowoffsets_gpu;
4318: } else Aroff = Acsr->row_offsets;
4319: if (b->compressedrow.use) { /* need full row offset */
4320: if (!Bcusp->rowoffsets_gpu) {
4321: Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4322: Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4323: PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));
4324: }
4325: Broff = Bcusp->rowoffsets_gpu;
4326: } else Broff = Bcsr->row_offsets;
4327: PetscLogGpuTimeBegin();
4328: stat = cusparseXcsr2coo(Acusp->handle,
4329: Aroff->data().get(),
4330: Annz,
4331: m,
4332: Acoo->data().get(),
4333: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4334: stat = cusparseXcsr2coo(Bcusp->handle,
4335: Broff->data().get(),
4336: Bnnz,
4337: m,
4338: Bcoo->data().get(),
4339: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4340: /* use int (1/0) rather than bool flags: issues when using bool with large matrices on SUMMIT with CUDA 10.2.89 */
4341: auto Aperm = thrust::make_constant_iterator(1);
4342: auto Bperm = thrust::make_constant_iterator(0);
4343: #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4344: auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4345: auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4346: #else
4347: /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4348: auto Bcib = Bcsr->column_indices->begin();
4349: auto Bcie = Bcsr->column_indices->end();
4350: thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4351: #endif
4352: auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4353: auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4354: auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4355: auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4356: auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4357: auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4358: auto p1 = Ccusp->cooPerm->begin();
4359: auto p2 = Ccusp->cooPerm->begin();
4360: thrust::advance(p2,Annz);
4361: PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4362: #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4363: thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4364: #endif
4365: auto cci = thrust::make_counting_iterator(zero);
4366: auto cce = thrust::make_counting_iterator(c->nz);
4367: #if 0 //thrust::partition_copy errors on SUMMIT with CUDA 11.1.0
4368: PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4369: #else
4370: auto pred = thrust::identity<int>();
4371: PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4372: PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4373: #endif
4374: stat = cusparseXcoo2csr(Ccusp->handle,
4375: Ccoo->data().get(),
4376: c->nz,
4377: m,
4378: Ccsr->row_offsets->data().get(),
4379: CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4380: PetscLogGpuTimeEnd();
4381: delete wPerm;
4382: delete Acoo;
4383: delete Bcoo;
4384: delete Ccoo;
4385: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4386: stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4387: Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4388: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4389: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4390: #endif
4391: if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4392: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4393: Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4394: CsrMatrix *CcsrT = new CsrMatrix;
4395: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4396: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4398: (*C)->form_explicit_transpose = PETSC_TRUE;
4399: (*C)->transupdated = PETSC_TRUE;
4400: Ccusp->rowoffsets_gpu = NULL;
4401: CmatT->cprowIndices = NULL;
4402: CmatT->mat = CcsrT;
4403: CcsrT->num_rows = n;
4404: CcsrT->num_cols = m;
4405: CcsrT->num_entries = c->nz;
4407: CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4408: CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4409: CcsrT->values = new THRUSTARRAY(c->nz);
4411: PetscLogGpuTimeBegin();
4412: auto rT = CcsrT->row_offsets->begin();
4413: if (AT) {
4414: rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4415: thrust::advance(rT,-1);
4416: }
4417: if (BT) {
4418: auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4419: auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4420: thrust::copy(titb,tite,rT);
4421: }
4422: auto cT = CcsrT->column_indices->begin();
4423: if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4424: if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4425: auto vT = CcsrT->values->begin();
4426: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4427: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4428: PetscLogGpuTimeEnd();
4430: stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4431: stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4432: stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4433: cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4434: cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4435: cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4436: cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4437: cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4438: cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4439: #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4440: stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4441: CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4442: CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4443: CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4444: #endif
4445: Ccusp->matTranspose = CmatT;
4446: }
4447: }
4449: c->singlemalloc = PETSC_FALSE;
4450: c->free_a = PETSC_TRUE;
4451: c->free_ij = PETSC_TRUE;
4452: PetscMalloc1(m+1,&c->i);
4453: PetscMalloc1(c->nz,&c->j);
4454: if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4455: THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4456: THRUSTINTARRAY jj(Ccsr->column_indices->size());
4457: ii = *Ccsr->row_offsets;
4458: jj = *Ccsr->column_indices;
4459: cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4460: cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4461: } else {
4462: cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4463: cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4464: }
4465: PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));
4466: PetscMalloc1(m,&c->ilen);
4467: PetscMalloc1(m,&c->imax);
4468: c->maxnz = c->nz;
4469: c->nonzerorowcnt = 0;
4470: c->rmax = 0;
4471: for (i = 0; i < m; i++) {
4472: const PetscInt nn = c->i[i+1] - c->i[i];
4473: c->ilen[i] = c->imax[i] = nn;
4474: c->nonzerorowcnt += (PetscInt)!!nn;
4475: c->rmax = PetscMax(c->rmax,nn);
4476: }
4477: MatMarkDiagonal_SeqAIJ(*C);
4478: PetscMalloc1(c->nz,&c->a);
4479: (*C)->nonzerostate++;
4480: PetscLayoutSetUp((*C)->rmap);
4481: PetscLayoutSetUp((*C)->cmap);
4482: Ccusp->nonzerostate = (*C)->nonzerostate;
4483: (*C)->preallocated = PETSC_TRUE;
4484: } else {
4485: if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
4486: c = (Mat_SeqAIJ*)(*C)->data;
4487: if (c->nz) {
4488: Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4489: if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4490: if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4491: if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4492: MatSeqAIJCUSPARSECopyToGPU(A);
4493: MatSeqAIJCUSPARSECopyToGPU(B);
4494: if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4495: if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4496: Acsr = (CsrMatrix*)Acusp->mat->mat;
4497: Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4498: Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4499: if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4500: if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4501: if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4502: if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4503: if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4504: auto pmid = Ccusp->cooPerm->begin();
4505: thrust::advance(pmid,Acsr->num_entries);
4506: PetscLogGpuTimeBegin();
4507: auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4508: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4509: auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4510: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4511: thrust::for_each(zibait,zieait,VecCUDAEquals());
4512: auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4513: thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4514: auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4515: thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4516: thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4517: MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);
4518: if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4519: if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4520: PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4521: CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4522: CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4523: CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4524: auto vT = CcsrT->values->begin();
4525: if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4526: if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4527: (*C)->transupdated = PETSC_TRUE;
4528: }
4529: PetscLogGpuTimeEnd();
4530: }
4531: }
4532: PetscObjectStateIncrease((PetscObject)*C);
4533: (*C)->assembled = PETSC_TRUE;
4534: (*C)->was_assembled = PETSC_FALSE;
4535: (*C)->offloadmask = PETSC_OFFLOAD_GPU;
4536: return(0);
4537: }
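/* A minimal sketch (sizes illustrative) of the merge: given A of size m x na and B of
   size m x nb, both MATSEQAIJCUSPARSE, C = [A,B] has size m x (na+nb); with
   MAT_REUSE_MATRIX only the numerical values of C are refreshed.

     Mat C;
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_INITIAL_MATRIX,&C);
     // ... modify the values of A and B ...
     MatSeqAIJCUSPARSEMergeMats(A,B,MAT_REUSE_MATRIX,&C);
*/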
4539: static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4540: {
4541: PetscErrorCode ierr;
4542: bool dmem;
4543: const PetscScalar *av;
4544: cudaError_t cerr;
4547: dmem = isCudaMem(v);
4548: MatSeqAIJCUSPARSEGetArrayRead(A,&av);
4549: if (n && idx) {
4550: THRUSTINTARRAY widx(n);
4551: widx.assign(idx,idx+n);
4552: PetscLogCpuToGpu(n*sizeof(PetscInt));
4554: THRUSTARRAY *w = NULL;
4555: thrust::device_ptr<PetscScalar> dv;
4556: if (dmem) {
4557: dv = thrust::device_pointer_cast(v);
4558: } else {
4559: w = new THRUSTARRAY(n);
4560: dv = w->data();
4561: }
4562: thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4564: auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4565: auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4566: thrust::for_each(zibit,zieit,VecCUDAEquals());
4567: if (w) {
4568: cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4569: }
4570: delete w;
4571: } else {
4572: cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4573: }
4574: if (!dmem) { PetscLogGpuToCpu(n*sizeof(PetscScalar)); } /* the cudaMemcpy above moved data from device to host */
4575: MatSeqAIJCUSPARSERestoreArrayRead(A,&av);
4576: return(0);
4577: }