Actual source code: agmresorthog.c
#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors
 * distributed across several processes. These processes are organized in a virtual ring.
 *
 * Reference: [1] Sidje, Roger B., "Alternatives for parallel Krylov subspace basis computation",
 *            Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 * Initial author: R. B. Sidje
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. Nuentsa Wakam, 2011
 */
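
/*
 * Overview (editorial note, inferred from the routines below and from [1]):
 * the QR factorization of the Krylov basis is computed in two stages.
 *  1. Each process performs a Householder QR of its local block of rows
 *     (first loop of KSPAGMRESRoddec); the reflectors stay in Qloc and their
 *     scalar factors in tloc.
 *  2. The small triangular factors left on each process are merged into a
 *     single upper-triangular R by Givens rotations that travel along the
 *     virtual ring of processes set up in KSPAGMRESRoddecInitNeighboor.
 * KSPAGMRESRodvec applies the resulting, implicitly stored Q to a vector by
 * replaying these rotations and reflectors in reverse order.
 */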

/*
 * Take the processes that own the vectors and organize them in a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm    comm;
  KSP_AGMRES  *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);
  /* Close the ring: interior ranks talk to rank-1/rank+1, the two ends wrap around */
  if ((rank != First) && (rank != Last)) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  } else if (rank == Last) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else { /* rank == First */
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return(0);
}
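
/*
 * Illustration (assuming a hypothetical 4-process communicator, so that
 * ranks run 0..3, First = 0 and Last = 3): the neighbor table built above is
 *
 *   rank   Ileft   Iright
 *    0       3       1
 *    1       0       2
 *    2       1       3
 *    3       2       0
 *
 * i.e. data sent to Iright travels 0 -> 1 -> 2 -> 3 -> 0 around the ring,
 * and data sent to Ileft travels in the opposite direction.
 */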

static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return(0);
}
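
/*
 * Note on the encoding used above (a reading of the code, not original
 * documentation): when make_r == 1, the routine builds the Givens rotation
 * (c, s) that annihilates the second entry of the input pair (*c, *s) and
 * packs the rotation into the single scalar *r, in a convention similar to
 * LAPACK DROTG's "z" parameter (here with an extra factor of 2 and explicit
 * signs):
 *   r = 1             if c == 0,
 *   r = sign(c)*s/2   if |s| < |c|   (so |r| < 1),
 *   r = sign(s)*2/c   otherwise      (so |r| >= 1).
 * When make_r == 0, the rotation is reconstructed from *r alone:
 *   r == 1   -> c = 0,    s = 1
 *   |r| < 1  -> s = 2*r,  c = sqrt(1 - s*s)
 *   |r| >= 1 -> c = 2/r,  s = sqrt(1 - c*c)
 * For example, r = 0.3 reconstructs to s = 0.6, c = 0.8.
 * This lets KSPAGMRESRoddec store each rotation in the matrix entry it has
 * just annihilated, and KSPAGMRESRodvec recover it later with no extra storage.
 */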

/*
 * Compute the QR factorization of the Krylov basis vectors.
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec: the number of vectors
 * Output:
 *  - agmres->Qloc: elementary reflectors of the QR of the local part of the vectors (plus the packed Givens rotations from the ring reduction)
 *  - agmres->tloc: scalar factors of the elementary reflectors
 *  - agmres->sgn:  signs used to make the diagonal of R nonnegative
 *  - agmres->Rloc: the upper-triangular factor R, broadcast to every process
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES   *agmres = (KSP_AGMRES*)ksp->data;
  MPI_Comm     comm;
  PetscScalar  *Qloc    = agmres->Qloc;
  PetscScalar  *sgn     = agmres->sgn;
  PetscScalar  *tloc    = agmres->tloc;
  PetscReal    *wbufptr = agmres->wbufptr;
  PetscMPIInt  rank     = agmres->rank;
  PetscMPIInt  First    = agmres->First;
  PetscMPIInt  Last     = agmres->Last;
  PetscBLASInt pas, len, bnloc, bpos;
  PetscInt     nloc, d, i, j, k;
  PetscInt     pos;
  PetscReal    c, s, rho, Ajj, val, tt, old;
  PetscScalar  *col;
  MPI_Status   status;
  PetscBLASInt N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp, &comm);
  PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc, &bnloc);
  if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In the QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    PetscStackCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal", BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* Annihilate the remaining entries of the stacked local R factors, one diagonal at a time, by Givens rotations passed around the ring */
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy", BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* zero out the d-th diagonal of Rloc */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j*nloc+i-1];
        s = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy", BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }

  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscBLASIntCast(pos, &bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal", BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: this should not be needed
   */
  MPI_Bcast(agmres->Rloc, N*N, MPIU_SCALAR, Last, comm);
  PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0);
  return(0);
}
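
/*
 * Storage after KSPAGMRESRoddec (an editorial summary read from the code
 * above): Q is never formed explicitly. Column j of Qloc keeps the tail of a
 * Householder reflector below its diagonal, with scalar factor tloc[j]; on
 * every process after First, the leading rows of Qloc are overwritten with
 * the packed Givens rotations of the ring reduction; sgn records the sign
 * normalization applied to the diagonal of R; and agmres->Rloc holds the
 * nvec x nvec upper-triangular factor on all processes after the broadcast.
 * KSPAGMRESRodvec below replays exactly these pieces to apply Q to a vector.
 */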

/*
 * Compute Out <-- Q * In, where Q is the orthogonal matrix implicitly built by KSPAGMRESRoddec.
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In: input vector (of size nvec)
 * Output:
 *  - Out: PETSc vector (distributed as the basis vectors)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES   *agmres = (KSP_AGMRES*)ksp->data;
  MPI_Comm     comm;
  PetscScalar  *Qloc  = agmres->Qloc;
  PetscScalar  *sgn   = agmres->sgn;
  PetscScalar  *tloc  = agmres->tloc;
  PetscMPIInt  rank   = agmres->rank;
  PetscMPIInt  First  = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar  *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscBLASIntCast(nvec, &bnvec);
  PetscObjectGetComm((PetscObject)ksp, &comm);
  pas = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j*nloc+i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] = c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d*nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
  /* Apply the local Householder reflectors */
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len = nloc - j;
      PetscBLASIntCast(len, &blen);
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      PetscStackCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return(0);
}
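
/*
 * Usage sketch (hypothetical, based on how these routines appear to fit the
 * AGMRES cycle; the actual call sites live elsewhere in the AGMRES
 * implementation):
 *
 *   KSPAGMRESRoddecInitNeighboor(ksp);          - once, to set up the virtual ring
 *   KSPAGMRESRoddec(ksp, nvec);                 - QR-factorize the nvec basis vectors
 *   KSPAGMRESRodvec(ksp, nvec, coeffs, result); - result <- Q * coeffs
 *
 * Here coeffs is an array of nvec scalars (for instance the small least-squares
 * solution of the cycle) and result is a Vec distributed like the basis
 * vectors; both names are purely illustrative.
 */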