Actual source code: blockinvert.h
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
15: #include <petscblaslapack.h>
17: /*
18: These are C kernels,they are contained in
19: src/mat/impls/baij/seq
20: */
22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);
27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) PetscMacroReturnStandard(\
28: MatScalar d, di = mat[0]; \
29: mat[0] = d = 1.0 / di; \
30: mat[4] *= -d; \
31: mat[8] *= -d; \
32: mat[12] *= -d; \
33: mat[1] *= d; \
34: mat[2] *= d; \
35: mat[3] *= d; \
36: mat[5] += mat[4] * mat[1] * di; \
37: mat[6] += mat[4] * mat[2] * di; \
38: mat[7] += mat[4] * mat[3] * di; \
39: mat[9] += mat[8] * mat[1] * di; \
40: mat[10] += mat[8] * mat[2] * di; \
41: mat[11] += mat[8] * mat[3] * di; \
42: mat[13] += mat[12] * mat[1] * di; \
43: mat[14] += mat[12] * mat[2] * di; \
44: mat[15] += mat[12] * mat[3] * di; \
45: di = mat[5]; \
46: mat[5] = d = 1.0 / di; \
47: mat[1] *= -d; \
48: mat[9] *= -d; \
49: mat[13] *= -d; \
50: mat[4] *= d; \
51: mat[6] *= d; \
52: mat[7] *= d; \
53: mat[0] += mat[1] * mat[4] * di; \
54: mat[2] += mat[1] * mat[6] * di; \
55: mat[3] += mat[1] * mat[7] * di; \
56: mat[8] += mat[9] * mat[4] * di; \
57: mat[10] += mat[9] * mat[6] * di; \
58: mat[11] += mat[9] * mat[7] * di; \
59: mat[12] += mat[13] * mat[4] * di; \
60: mat[14] += mat[13] * mat[6] * di; \
61: mat[15] += mat[13] * mat[7] * di; \
62: di = mat[10]; \
63: mat[10] = d = 1.0 / di; \
64: mat[2] *= -d; \
65: mat[6] *= -d; \
66: mat[14] *= -d; \
67: mat[8] *= d; \
68: mat[9] *= d; \
69: mat[11] *= d; \
70: mat[0] += mat[2] * mat[8] * di; \
71: mat[1] += mat[2] * mat[9] * di; \
72: mat[3] += mat[2] * mat[11] * di; \
73: mat[4] += mat[6] * mat[8] * di; \
74: mat[5] += mat[6] * mat[9] * di; \
75: mat[7] += mat[6] * mat[11] * di; \
76: mat[12] += mat[14] * mat[8] * di; \
77: mat[13] += mat[14] * mat[9] * di; \
78: mat[15] += mat[14] * mat[11] * di; \
79: di = mat[15]; \
80: mat[15] = d = 1.0 / di; \
81: mat[3] *= -d; \
82: mat[7] *= -d; \
83: mat[11] *= -d; \
84: mat[12] *= d; \
85: mat[13] *= d; \
86: mat[14] *= d; \
87: mat[0] += mat[3] * mat[12] * di; \
88: mat[1] += mat[3] * mat[13] * di; \
89: mat[2] += mat[3] * mat[14] * di; \
90: mat[4] += mat[7] * mat[12] * di; \
91: mat[5] += mat[7] * mat[13] * di; \
92: mat[6] += mat[7] * mat[14] * di; \
93: mat[8] += mat[11] * mat[12] * di; \
94: mat[9] += mat[11] * mat[13] * di; \
95: mat[10] += mat[11] * mat[14] * di; \
96: )
98: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
99: # if defined(PETSC_HAVE_SSE)
100: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
101: # endif
102: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
104: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
108: /*
109: A = inv(A) A_gets_inverse_A
111: A - square bs by bs array stored in column major order
112: pivots - integer work array of length bs
113: W - bs by 1 work array
114: */
115: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))
117: /* -----------------------------------------------------------------------*/
119: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
120: /*
121: Version that calls the BLAS directly
122: */
123: /*
124: A = A * B A_gets_A_times_B
126: A, B - square bs by bs arrays stored in column major order
127: W - square bs by bs work array
129: */
130: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do { \
131: PetscBLASInt _bbs; \
132: PetscScalar _one = 1.0,_zero = 0.0; \
133: PetscBLASIntCast(bs,&_bbs); \
134: PetscArraycpy((W),(A),(bs)*(bs)); \
138: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
139: } while (0)
141: /*
143: A = A - B * C A_gets_A_minus_B_times_C
145: A, B, C - square bs by bs arrays stored in column major order
146: */
147: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) do { \
148: PetscScalar _mone = -1.0,_one = 1.0; \
149: PetscBLASInt _bbs; \
150: PetscBLASIntCast(bs,&_bbs); \
154: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
155: } while (0)
157: /*
159: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
161: A, B, C - square bs by bs arrays stored in column major order
162: */
163: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) do { \
164: PetscScalar _one = 1.0; \
165: PetscBLASInt _bbs; \
166: PetscBLASIntCast(bs,&_bbs); \
170: PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
171: } while (0)
173: /*
174: v = v + A^T w v_gets_v_plus_Atranspose_times_w
176: v - array of length bs
177: A - square bs by bs array
178: w - array of length bs
179: */
180: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) do { \
181: PetscScalar _one = 1.0; \
182: PetscBLASInt _ione = 1, _bbs; \
183: PetscBLASIntCast(bs,&_bbs); \
187: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
188: } while (0)
190: /*
191: v = v - A w v_gets_v_minus_A_times_w
193: v - array of length bs
194: A - square bs by bs array
195: w - array of length bs
196: */
197: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) do { \
198: PetscScalar _mone = -1.0,_one = 1.0; \
199: PetscBLASInt _ione = 1,_bbs; \
200: PetscBLASIntCast(bs,&_bbs); \
204: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
205: } while (0)
207: /*
208: v = v - A w v_gets_v_minus_transA_times_w
210: v - array of length bs
211: A - square bs by bs array
212: w - array of length bs
213: */
214: #define PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) do { \
215: PetscScalar _mone = -1.0,_one = 1.0; \
216: PetscBLASInt _ione = 1,_bbs; \
217: PetscBLASIntCast(bs,&_bbs); \
221: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
222: } while (0)
224: /*
225: v = v + A w v_gets_v_plus_A_times_w
227: v - array of length bs
228: A - square bs by bs array
229: w - array of length bs
230: */
231: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) do { \
232: PetscScalar _one = 1.0; \
233: PetscBLASInt _ione = 1,_bbs; \
234: PetscBLASIntCast(bs,&_bbs); \
238: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
239: } while (0)
241: /*
242: v = v + A w v_gets_v_plus_Ar_times_w
244: v - array of length bs
245: A - square bs by bs array
246: w - array of length bs
247: */
248: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) do { \
249: PetscScalar _one = 1.0; \
250: PetscBLASInt _ione = 1,_bbs,_bncols; \
251: PetscBLASIntCast(bs,&_bbs); \
252: PetscBLASIntCast(ncols,&_bncols); \
256: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
257: } while (0)
259: /*
260: w <- w - A v w_gets_w_minus_Ar_times_v
262: v - array of length ncol
263: A(bs,ncols)
264: w - array of length bs
265: */
266: #define PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) do { \
267: PetscScalar _one = 1.0,_mone = -1.0; \
268: PetscBLASInt _ione = 1,_bbs,_bncols; \
269: PetscBLASIntCast(bs,&_bbs); \
270: PetscBLASIntCast(ncols,&_bncols); \
274: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
275: } while (0)
277: /*
278: w = A v w_gets_A_times_v
280: v - array of length bs
281: A - square bs by bs array
282: w - array of length bs
283: */
284: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) do { \
285: PetscScalar _zero = 0.0,_one = 1.0; \
286: PetscBLASInt _ione = 1,_bbs; \
287: PetscBLASIntCast(bs,&_bbs); \
291: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
292: } while (0)
294: /*
295: w = A' v w_gets_transA_times_v
297: v - array of length bs
298: A - square bs by bs array
299: w - array of length bs
300: */
301: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) do { \
302: PetscScalar _zero = 0.0,_one = 1.0; \
303: PetscBLASInt _ione = 1,_bbs; \
304: PetscBLASIntCast(bs,&_bbs); \
308: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
309: } while (0)
311: /*
312: z = A*x
313: */
314: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) do { \
315: PetscScalar _one = 1.0,_zero = 0.0; \
316: PetscBLASInt _ione = 1,_bbs,_bncols; \
317: PetscBLASIntCast(bs,&_bbs); \
318: PetscBLASIntCast(ncols,&_bncols); \
322: PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
323: } while (0)
325: /*
326: z = trans(A)*x
327: */
328: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) do { \
329: PetscScalar _one = 1.0; \
330: PetscBLASInt _ione = 1,_bbs,_bncols; \
331: PetscBLASIntCast(bs,&_bbs); \
332: PetscBLASIntCast(ncols,&_bncols); \
336: PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
337: } while (0)
339: #else /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
340: /*
341: Version that calls Fortran routines; can handle different precision
342: of matrix (array) and vectors
343: */
344: /*
345: These are Fortran kernels: They replace certain BLAS routines but
346: have some arguments that may be single precision,rather than double
347: These routines are provided in src/fortran/kernels/sgemv.F
348: They are pretty pitiful but get the job done. The intention is
349: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
350: code is used.
351: */
353: #if defined(PETSC_HAVE_FORTRAN_CAPS)
354: #define msgemv_ MSGEMV
355: #define msgemvp_ MSGEMVP
356: #define msgemvm_ MSGEMVM
357: #define msgemvt_ MSGEMVT
358: #define msgemmi_ MSGEMMI
359: #define msgemm_ MSGEMM
360: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
361: #define msgemv_ msgemv
362: #define msgemvp_ msgemvp
363: #define msgemvm_ msgemvm
364: #define msgemvt_ msgemvt
365: #define msgemmi_ msgemmi
366: #define msgemm_ msgemm
367: #endif
369: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
370: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
371: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
372: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
373: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
374: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
376: /*
377: A = A * B A_gets_A_times_B
379: A, B - square bs by bs arrays stored in column major order
380: W - square bs by bs work array
382: */
383: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do { \
384: PetscArraycpy((W),(A),(bs)*(bs)); \
385: msgemmi_(&bs,A,B,W); \
386: } while (0)
388: /*
390: A = A - B * C A_gets_A_minus_B_times_C
392: A, B, C - square bs by bs arrays stored in column major order
393: */
394: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) msgemm_(&(bs),(A),(B),(C))
396: /*
397: v = v - A w v_gets_v_minus_A_times_w
399: v - array of length bs
400: A - square bs by bs array
401: w - array of length bs
402: */
403: #define PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) msgemvm_(&(bs),&(bs),(A),(w),(v))
405: /*
406: v = v + A w v_gets_v_plus_A_times_w
408: v - array of length bs
409: A - square bs by bs array
410: w - array of length bs
411: */
412: #define PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) msgemvp_(&(bs),&(bs),(A),(w),(v))
414: /*
415: v = v + A w v_gets_v_plus_Ar_times_w
417: v - array of length bs
418: A - square bs by bs array
419: w - array of length bs
420: */
421: #define PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) msgemvp_(&(bs),&(ncol),(A),(v),(w))
423: /*
424: w = A v w_gets_A_times_v
426: v - array of length bs
427: A - square bs by bs array
428: w - array of length bs
429: */
430: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) msgemv_(&(bs),&(bs),(A),(v),(w))
432: /*
433: z = A*x
434: */
435: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) msgemv_(&(bs),&(ncols),(A),(x),(z))
437: /*
438: z = trans(A)*x
439: */
440: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) msgemvt_(&(bs),&(ncols),(A),(x),(z))
442: /* These do not work yet */
443: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
444: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
446: #endif /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
448: #endif /* __ILU_H */