Actual source code: pipecg2.c

  1: #include <petsc/private/kspimpl.h>

  3: /* The auxiliary functions below are merged vector operations that load vectors from memory once and use
  4:    the data multiple times by performing vector operations element-wise. These functions
  5:    only apply to sequential vectors.
  6: */
  7: /*   VecMergedDot_Private function merges the dot products for gamma, delta and dp */
  8: static PetscErrorCode VecMergedDot_Private(Vec U,Vec W,Vec R,PetscInt normtype,PetscScalar *ru, PetscScalar *wu, PetscScalar *uu)
  9: {
 10:   const PetscScalar *PETSC_RESTRICT PU, *PETSC_RESTRICT PW, *PETSC_RESTRICT PR;
 11:   PetscScalar       sumru = 0.0, sumwu = 0.0, sumuu = 0.0;
 12:   PetscInt          j, n;

 14:   VecGetArrayRead(U,(const PetscScalar**)&PU);
 15:   VecGetArrayRead(W,(const PetscScalar**)&PW);
 16:   VecGetArrayRead(R,(const PetscScalar**)&PR);
 17:   VecGetLocalSize(U,&n);

 19:   if (normtype==KSP_NORM_PRECONDITIONED) {
 20:     PetscPragmaSIMD
 21:     for (j=0; j<n; j++) {
 22:       sumwu += PW[j] * PetscConj(PU[j]);
 23:       sumru += PR[j] * PetscConj(PU[j]);
 24:       sumuu += PU[j] * PetscConj(PU[j]);
 25:     }
 26:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
 27:     PetscPragmaSIMD
 28:     for (j=0; j<n; j++) {
 29:       sumwu += PW[j] * PetscConj(PU[j]);
 30:       sumru += PR[j] * PetscConj(PU[j]);
 31:       sumuu += PR[j] * PetscConj(PR[j]);
 32:     }
 33:   } else if (normtype==KSP_NORM_NATURAL) {
 34:     PetscPragmaSIMD
 35:     for (j=0; j<n; j++) {
 36:       sumwu += PW[j] * PetscConj(PU[j]);
 37:       sumru += PR[j] * PetscConj(PU[j]);
 38:     }
 39:     sumuu = sumru;
 40:   }

 42:   *ru = sumru;
 43:   *wu = sumwu;
 44:   *uu = sumuu;

 46:   VecRestoreArrayRead(U,(const PetscScalar**)&PU);
 47:   VecRestoreArrayRead(W,(const PetscScalar**)&PW);
 48:   VecRestoreArrayRead(R,(const PetscScalar**)&PR);
 49:   return 0;
 50: }

 52: /*   VecMergedDot2_Private function merges the dot products for lambda_1 and lambda_4 */
 53: static PetscErrorCode VecMergedDot2_Private(Vec N,Vec M,Vec W,PetscScalar *wm,PetscScalar *nm)
 54: {
 55:   const PetscScalar *PETSC_RESTRICT PN, *PETSC_RESTRICT PM, *PETSC_RESTRICT PW;
 56:   PetscScalar       sumwm = 0.0, sumnm = 0.0;
 57:   PetscInt          j, n;

 59:   VecGetArrayRead(W,(const PetscScalar**)&PW);
 60:   VecGetArrayRead(N,(const PetscScalar**)&PN);
 61:   VecGetArrayRead(M,(const PetscScalar**)&PM);
 62:   VecGetLocalSize(N,&n);

 64:   PetscPragmaSIMD
 65:   for (j=0; j<n; j++) {
 66:     sumwm += PW[j] * PetscConj(PM[j]);
 67:     sumnm += PN[j] * PetscConj(PM[j]);
 68:   }

 70:   *wm = sumwm;
 71:   *nm = sumnm;

 73:   VecRestoreArrayRead(W,(const PetscScalar**)&PW);
 74:   VecRestoreArrayRead(N,(const PetscScalar**)&PN);
 75:   VecRestoreArrayRead(M,(const PetscScalar**)&PM);
 76:   return 0;
 77: }

 79: /*   VecMergedOpsShort_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration 0  */
 80: static PetscErrorCode VecMergedOpsShort_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda)
 81: {
 82:   PetscScalar       *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
 83:   PetscScalar       *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
 84:   PetscScalar       *PETSC_RESTRICT pc, *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0, *PETSC_RESTRICT ph0, *PETSC_RESTRICT pg1,*PETSC_RESTRICT ph1,*PETSC_RESTRICT ps,*PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1, *PETSC_RESTRICT pe,*PETSC_RESTRICT pf,*PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
 85:   PetscInt          j, n;

 87:   VecGetArray(vx,(PetscScalar**)&px);
 88:   VecGetArray(vr,(PetscScalar**)&pr);
 89:   VecGetArray(vz,(PetscScalar**)&pz);
 90:   VecGetArray(vw,(PetscScalar**)&pw);
 91:   VecGetArray(vp,(PetscScalar**)&pp);
 92:   VecGetArray(vq,(PetscScalar**)&pq);
 93:   VecGetArray(vc,(PetscScalar**)&pc);
 94:   VecGetArray(vd,(PetscScalar**)&pd);
 95:   VecGetArray(vg0,(PetscScalar**)&pg0);
 96:   VecGetArray(vh0,(PetscScalar**)&ph0);
 97:   VecGetArray(vg1,(PetscScalar**)&pg1);
 98:   VecGetArray(vh1,(PetscScalar**)&ph1);
 99:   VecGetArray(vs,(PetscScalar**)&ps);
100:   VecGetArray(va1,(PetscScalar**)&pa1);
101:   VecGetArray(vb1,(PetscScalar**)&pb1);
102:   VecGetArray(ve,(PetscScalar**)&pe);
103:   VecGetArray(vf,(PetscScalar**)&pf);
104:   VecGetArray(vm,(PetscScalar**)&pm);
105:   VecGetArray(vn,(PetscScalar**)&pn);
106:   VecGetArray(vu,(PetscScalar**)&pu);

108:   VecGetLocalSize(vx,&n);
109:   for (j=0; j<15; j++) lambda[j] = 0.0;

111:   if (normtype==KSP_NORM_PRECONDITIONED) {
112:     PetscPragmaSIMD
113:     for (j=0; j<n; j++) {
114:       pz[j] = pn[j];
115:       pq[j] = pm[j];
116:       ps[j] = pw[j];
117:       pp[j] = pu[j];
118:       pc[j] = pg0[j];
119:       pd[j] = ph0[j];
120:       pa1[j] = pe[j];
121:       pb1[j] = pf[j];

123:       px[j] = px[j] + alpha0 * pp[j];
124:       pr[j] = pr[j] - alpha0 * ps[j];
125:       pu[j] = pu[j] - alpha0 * pq[j];
126:       pw[j] = pw[j] - alpha0 * pz[j];
127:       pm[j] = pm[j] - alpha0 * pc[j];
128:       pn[j] = pn[j] - alpha0 * pd[j];
129:       pg0[j] = pg0[j] - alpha0 * pa1[j];
130:       ph0[j] = ph0[j] - alpha0 * pb1[j];

132:       pg1[j] = pg0[j];
133:       ph1[j] = ph0[j];

135:       pz[j] = pn[j] + beta1 * pz[j];
136:       pq[j] = pm[j] + beta1 * pq[j];
137:       ps[j] = pw[j] + beta1 * ps[j];
138:       pp[j] = pu[j] + beta1 * pp[j];
139:       pc[j] = pg0[j] + beta1 * pc[j];
140:       pd[j] = ph0[j] + beta1 * pd[j];

142:       px[j] = px[j] + alpha1 * pp[j];
143:       pr[j] = pr[j] - alpha1 * ps[j];
144:       pu[j] = pu[j] - alpha1 * pq[j];
145:       pw[j] = pw[j] - alpha1 * pz[j];
146:       pm[j] = pm[j] - alpha1 * pc[j];
147:       pn[j] = pn[j] - alpha1 * pd[j];

149:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
150:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
151:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
152:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
153:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pu[j] * PetscConj(pu[j]);
154:     }
155:     lambda[3] = PetscConj(lambda[2]);
156:     lambda[5] = PetscConj(lambda[1]);
157:     lambda[8] = PetscConj(lambda[7]);
158:     lambda[13] = PetscConj(lambda[11]);
159:     lambda[14] = PetscConj(lambda[0]);

161:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
162:     PetscPragmaSIMD
163:     for (j=0; j<n; j++) {
164:       pz[j] = pn[j];
165:       pq[j] = pm[j];
166:       ps[j] = pw[j];
167:       pp[j] = pu[j];
168:       pc[j] = pg0[j];
169:       pd[j] = ph0[j];
170:       pa1[j] = pe[j];
171:       pb1[j] = pf[j];

173:       px[j] = px[j] + alpha0 * pp[j];
174:       pr[j] = pr[j] - alpha0 * ps[j];
175:       pu[j] = pu[j] - alpha0 * pq[j];
176:       pw[j] = pw[j] - alpha0 * pz[j];
177:       pm[j] = pm[j] - alpha0 * pc[j];
178:       pn[j] = pn[j] - alpha0 * pd[j];
179:       pg0[j] = pg0[j] - alpha0 * pa1[j];
180:       ph0[j] = ph0[j] - alpha0 * pb1[j];

182:       pg1[j] = pg0[j];
183:       ph1[j] = ph0[j];

185:       pz[j] = pn[j] + beta1 * pz[j];
186:       pq[j] = pm[j] + beta1 * pq[j];
187:       ps[j] = pw[j] + beta1 * ps[j];
188:       pp[j] = pu[j] + beta1 * pp[j];
189:       pc[j] = pg0[j] + beta1 * pc[j];
190:       pd[j] = ph0[j] + beta1 * pd[j];

192:       px[j] = px[j] + alpha1 * pp[j];
193:       pr[j] = pr[j] - alpha1 * ps[j];
194:       pu[j] = pu[j] - alpha1 * pq[j];
195:       pw[j] = pw[j] - alpha1 * pz[j];
196:       pm[j] = pm[j] - alpha1 * pc[j];
197:       pn[j] = pn[j] - alpha1 * pd[j];

199:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
200:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
201:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
202:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
203:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pr[j] * PetscConj(pr[j]);
204:     }
205:     lambda[3] = PetscConj(lambda[2]);
206:     lambda[5] = PetscConj(lambda[1]);
207:     lambda[8] = PetscConj(lambda[7]);
208:     lambda[13] = PetscConj(lambda[11]);
209:     lambda[14] = PetscConj(lambda[0]);

211:   } else if (normtype==KSP_NORM_NATURAL) {
212:     PetscPragmaSIMD
213:     for (j=0; j<n; j++) {
214:       pz[j] = pn[j];
215:       pq[j] = pm[j];
216:       ps[j] = pw[j];
217:       pp[j] = pu[j];
218:       pc[j] = pg0[j];
219:       pd[j] = ph0[j];
220:       pa1[j] = pe[j];
221:       pb1[j] = pf[j];

223:       px[j] = px[j] + alpha0 * pp[j];
224:       pr[j] = pr[j] - alpha0 * ps[j];
225:       pu[j] = pu[j] - alpha0 * pq[j];
226:       pw[j] = pw[j] - alpha0 * pz[j];
227:       pm[j] = pm[j] - alpha0 * pc[j];
228:       pn[j] = pn[j] - alpha0 * pd[j];
229:       pg0[j] = pg0[j] - alpha0 * pa1[j];
230:       ph0[j] = ph0[j] - alpha0 * pb1[j];

232:       pg1[j] = pg0[j];
233:       ph1[j] = ph0[j];

235:       pz[j] = pn[j] + beta1 * pz[j];
236:       pq[j] = pm[j] + beta1 * pq[j];
237:       ps[j] = pw[j] + beta1 * ps[j];
238:       pp[j] = pu[j] + beta1 * pp[j];
239:       pc[j] = pg0[j] + beta1 * pc[j];
240:       pd[j] = ph0[j] + beta1 * pd[j];

242:       px[j] = px[j] + alpha1 * pp[j];
243:       pr[j] = pr[j] - alpha1 * ps[j];
244:       pu[j] = pu[j] - alpha1 * pq[j];
245:       pw[j] = pw[j] - alpha1 * pz[j];
246:       pm[j] = pm[j] - alpha1 * pc[j];
247:       pn[j] = pn[j] - alpha1 * pd[j];

249:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
250:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
251:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
252:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
253:       lambda[11] += pw[j] * PetscConj(pu[j]);
254:     }
255:     lambda[3] = PetscConj(lambda[2]);
256:     lambda[5] = PetscConj(lambda[1]);
257:     lambda[8] = PetscConj(lambda[7]);
258:     lambda[13] = PetscConj(lambda[11]);
259:     lambda[14] = PetscConj(lambda[0]);
260:     lambda[12] = lambda[10];

262:   }

264:   VecRestoreArray(vx,(PetscScalar**)&px);
265:   VecRestoreArray(vr,(PetscScalar**)&pr);
266:   VecRestoreArray(vz,(PetscScalar**)&pz);
267:   VecRestoreArray(vw,(PetscScalar**)&pw);
268:   VecRestoreArray(vp,(PetscScalar**)&pp);
269:   VecRestoreArray(vq,(PetscScalar**)&pq);
270:   VecRestoreArray(vc,(PetscScalar**)&pc);
271:   VecRestoreArray(vd,(PetscScalar**)&pd);
272:   VecRestoreArray(vg0,(PetscScalar**)&pg0);
273:   VecRestoreArray(vh0,(PetscScalar**)&ph0);
274:   VecRestoreArray(vg1,(PetscScalar**)&pg1);
275:   VecRestoreArray(vh1,(PetscScalar**)&ph1);
276:   VecRestoreArray(vs,(PetscScalar**)&ps);
277:   VecRestoreArray(va1,(PetscScalar**)&pa1);
278:   VecRestoreArray(vb1,(PetscScalar**)&pb1);
279:   VecRestoreArray(ve,(PetscScalar**)&pe);
280:   VecRestoreArray(vf,(PetscScalar**)&pf);
281:   VecRestoreArray(vm,(PetscScalar**)&pm);
282:   VecRestoreArray(vn,(PetscScalar**)&pn);
283:   VecRestoreArray(vu,(PetscScalar**)&pu);
284:   return 0;
285: }

287: /*   VecMergedOps_Private function merges the dot products, AXPY and SAXPY operations for all vectors for iteration > 0  */
288: static PetscErrorCode VecMergedOps_Private(Vec vx,Vec vr,Vec vz,Vec vw,Vec vp,Vec vq,Vec vc,Vec vd,Vec vg0,Vec vh0,Vec vg1,Vec vh1,Vec vs,Vec va1,Vec vb1,Vec ve,Vec vf,Vec vm,Vec vn,Vec vu,PetscInt normtype,PetscScalar beta0,PetscScalar alpha0,PetscScalar beta1,PetscScalar alpha1,PetscScalar *lambda, PetscScalar alphaold)
289: {
290:   PetscScalar       *PETSC_RESTRICT px, *PETSC_RESTRICT pr, *PETSC_RESTRICT pz, *PETSC_RESTRICT pw;
291:   PetscScalar       *PETSC_RESTRICT pp, *PETSC_RESTRICT pq;
292:   PetscScalar       *PETSC_RESTRICT pc,  *PETSC_RESTRICT pd, *PETSC_RESTRICT pg0,  *PETSC_RESTRICT ph0,  *PETSC_RESTRICT pg1, *PETSC_RESTRICT ph1,*PETSC_RESTRICT ps, *PETSC_RESTRICT pa1,*PETSC_RESTRICT pb1,*PETSC_RESTRICT pe,*PETSC_RESTRICT pf, *PETSC_RESTRICT pm,*PETSC_RESTRICT pn, *PETSC_RESTRICT pu;
293:   PetscInt          j, n;

295:   VecGetArray(vx,(PetscScalar**)&px);
296:   VecGetArray(vr,(PetscScalar**)&pr);
297:   VecGetArray(vz,(PetscScalar**)&pz);
298:   VecGetArray(vw,(PetscScalar**)&pw);
299:   VecGetArray(vp,(PetscScalar**)&pp);
300:   VecGetArray(vq,(PetscScalar**)&pq);
301:   VecGetArray(vc,(PetscScalar**)&pc);
302:   VecGetArray(vd,(PetscScalar**)&pd);
303:   VecGetArray(vg0,(PetscScalar**)&pg0);
304:   VecGetArray(vh0,(PetscScalar**)&ph0);
305:   VecGetArray(vg1,(PetscScalar**)&pg1);
306:   VecGetArray(vh1,(PetscScalar**)&ph1);
307:   VecGetArray(vs,(PetscScalar**)&ps);
308:   VecGetArray(va1,(PetscScalar**)&pa1);
309:   VecGetArray(vb1,(PetscScalar**)&pb1);
310:   VecGetArray(ve,(PetscScalar**)&pe);
311:   VecGetArray(vf,(PetscScalar**)&pf);
312:   VecGetArray(vm,(PetscScalar**)&pm);
313:   VecGetArray(vn,(PetscScalar**)&pn);
314:   VecGetArray(vu,(PetscScalar**)&pu);

316:   VecGetLocalSize(vx,&n);
317:   for (j=0; j<15; j++) lambda[j] = 0.0;

319:   if (normtype==KSP_NORM_PRECONDITIONED) {
320:     PetscPragmaSIMD
321:     for (j=0; j<n; j++) {
322:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
323:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

325:       pz[j] = pn[j] + beta0 * pz[j];
326:       pq[j] = pm[j] + beta0 * pq[j];
327:       ps[j] = pw[j] + beta0 * ps[j];
328:       pp[j] = pu[j] + beta0 * pp[j];
329:       pc[j] = pg0[j] + beta0 * pc[j];
330:       pd[j] = ph0[j] + beta0 * pd[j];
331:       pa1[j] = pe[j] + beta0 * pa1[j];
332:       pb1[j] = pf[j] + beta0 * pb1[j];

334:       px[j] = px[j] + alpha0 * pp[j];
335:       pr[j] = pr[j] - alpha0 * ps[j];
336:       pu[j] = pu[j] - alpha0 * pq[j];
337:       pw[j] = pw[j] - alpha0 * pz[j];
338:       pm[j] = pm[j] - alpha0 * pc[j];
339:       pn[j] = pn[j] - alpha0 * pd[j];
340:       pg0[j] = pg0[j] - alpha0 * pa1[j];
341:       ph0[j] = ph0[j] - alpha0 * pb1[j];

343:       pg1[j] = pg0[j];
344:       ph1[j] = ph0[j];

346:       pz[j] = pn[j] + beta1 * pz[j];
347:       pq[j] = pm[j] + beta1 * pq[j];
348:       ps[j] = pw[j] + beta1 * ps[j];
349:       pp[j] = pu[j] + beta1 * pp[j];
350:       pc[j] = pg0[j] + beta1 * pc[j];
351:       pd[j] = ph0[j] + beta1 * pd[j];

353:       px[j] = px[j] + alpha1 * pp[j];
354:       pr[j] = pr[j] - alpha1 * ps[j];
355:       pu[j] = pu[j] - alpha1 * pq[j];
356:       pw[j] = pw[j] - alpha1 * pz[j];
357:       pm[j] = pm[j] - alpha1 * pc[j];
358:       pn[j] = pn[j] - alpha1 * pd[j];

360:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
361:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
362:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
363:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
364:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pu[j] * PetscConj(pu[j]);
365:     }
366:     lambda[3] = PetscConj(lambda[2]);
367:     lambda[5] = PetscConj(lambda[1]);
368:     lambda[8] = PetscConj(lambda[7]);
369:     lambda[13] = PetscConj(lambda[11]);
370:     lambda[14] = PetscConj(lambda[0]);
371:   } else if (normtype==KSP_NORM_UNPRECONDITIONED) {
372:     PetscPragmaSIMD
373:     for (j=0; j<n; j++) {
374:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
375:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

377:       pz[j] = pn[j] + beta0 * pz[j];
378:       pq[j] = pm[j] + beta0 * pq[j];
379:       ps[j] = pw[j] + beta0 * ps[j];
380:       pp[j] = pu[j] + beta0 * pp[j];
381:       pc[j] = pg0[j] + beta0 * pc[j];
382:       pd[j] = ph0[j] + beta0 * pd[j];
383:       pa1[j] = pe[j] + beta0 * pa1[j];
384:       pb1[j] = pf[j] + beta0 * pb1[j];

386:       px[j] = px[j] + alpha0 * pp[j];
387:       pr[j] = pr[j] - alpha0 * ps[j];
388:       pu[j] = pu[j] - alpha0 * pq[j];
389:       pw[j] = pw[j] - alpha0 * pz[j];
390:       pm[j] = pm[j] - alpha0 * pc[j];
391:       pn[j] = pn[j] - alpha0 * pd[j];
392:       pg0[j] = pg0[j] - alpha0 * pa1[j];
393:       ph0[j] = ph0[j] - alpha0 * pb1[j];

395:       pg1[j] = pg0[j];
396:       ph1[j] = ph0[j];

398:       pz[j] = pn[j] + beta1 * pz[j];
399:       pq[j] = pm[j] + beta1 * pq[j];
400:       ps[j] = pw[j] + beta1 * ps[j];
401:       pp[j] = pu[j] + beta1 * pp[j];
402:       pc[j] = pg0[j] + beta1 * pc[j];
403:       pd[j] = ph0[j] + beta1 * pd[j];

405:       px[j] = px[j] + alpha1 * pp[j];
406:       pr[j] = pr[j] - alpha1 * ps[j];
407:       pu[j] = pu[j] - alpha1 * pq[j];
408:       pw[j] = pw[j] - alpha1 * pz[j];
409:       pm[j] = pm[j] - alpha1 * pc[j];
410:       pn[j] = pn[j] - alpha1 * pd[j];

412:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
413:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
414:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
415:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
416:       lambda[11] += pw[j] * PetscConj(pu[j]);  lambda[12] += pr[j] * PetscConj(pr[j]);
417:     }
418:     lambda[3] = PetscConj(lambda[2]);
419:     lambda[5] = PetscConj(lambda[1]);
420:     lambda[8] = PetscConj(lambda[7]);
421:     lambda[13] = PetscConj(lambda[11]);
422:     lambda[14] = PetscConj(lambda[0]);
423:   } else if (normtype==KSP_NORM_NATURAL) {
424:     PetscPragmaSIMD
425:     for (j=0; j<n; j++) {
426:       pa1[j] = (pg1[j] - pg0[j])/alphaold;
427:       pb1[j] = (ph1[j] - ph0[j])/alphaold;

429:       pz[j] = pn[j] + beta0 * pz[j];
430:       pq[j] = pm[j] + beta0 * pq[j];
431:       ps[j] = pw[j] + beta0 * ps[j];
432:       pp[j] = pu[j] + beta0 * pp[j];
433:       pc[j] = pg0[j] + beta0 * pc[j];
434:       pd[j] = ph0[j] + beta0 * pd[j];
435:       pa1[j] = pe[j] + beta0 * pa1[j];
436:       pb1[j] = pf[j] + beta0 * pb1[j];

438:       px[j] = px[j] + alpha0 * pp[j];
439:       pr[j] = pr[j] - alpha0 * ps[j];
440:       pu[j] = pu[j] - alpha0 * pq[j];
441:       pw[j] = pw[j] - alpha0 * pz[j];
442:       pm[j] = pm[j] - alpha0 * pc[j];
443:       pn[j] = pn[j] - alpha0 * pd[j];
444:       pg0[j] = pg0[j] - alpha0 * pa1[j];
445:       ph0[j] = ph0[j] - alpha0 * pb1[j];

447:       pg1[j] = pg0[j];
448:       ph1[j] = ph0[j];

450:       pz[j] = pn[j] + beta1 * pz[j];
451:       pq[j] = pm[j] + beta1 * pq[j];
452:       ps[j] = pw[j] + beta1 * ps[j];
453:       pp[j] = pu[j] + beta1 * pp[j];
454:       pc[j] = pg0[j] + beta1 * pc[j];
455:       pd[j] = ph0[j] + beta1 * pd[j];

457:       px[j] = px[j] + alpha1 * pp[j];
458:       pr[j] = pr[j] - alpha1 * ps[j];
459:       pu[j] = pu[j] - alpha1 * pq[j];
460:       pw[j] = pw[j] - alpha1 * pz[j];
461:       pm[j] = pm[j] - alpha1 * pc[j];
462:       pn[j] = pn[j] - alpha1 * pd[j];

464:       lambda[0] += ps[j] * PetscConj(pu[j]);   lambda[1] += pw[j] * PetscConj(pm[j]);
465:       lambda[2] += pw[j] * PetscConj(pq[j]);   lambda[4] += ps[j] * PetscConj(pq[j]);
466:       lambda[6] += pn[j] * PetscConj(pm[j]);   lambda[7] += pn[j] * PetscConj(pq[j]);
467:       lambda[9] += pz[j] * PetscConj(pq[j]);   lambda[10] += pr[j] * PetscConj(pu[j]);
468:       lambda[11] += pw[j] * PetscConj(pu[j]);
469:     }
470:     lambda[3] = PetscConj(lambda[2]);
471:     lambda[5] = PetscConj(lambda[1]);
472:     lambda[8] = PetscConj(lambda[7]);
473:     lambda[13] = PetscConj(lambda[11]);
474:     lambda[14] = PetscConj(lambda[0]);
475:     lambda[12] = lambda[10];
476:   }

478:   VecRestoreArray(vx,(PetscScalar**)&px);
479:   VecRestoreArray(vr,(PetscScalar**)&pr);
480:   VecRestoreArray(vz,(PetscScalar**)&pz);
481:   VecRestoreArray(vw,(PetscScalar**)&pw);
482:   VecRestoreArray(vp,(PetscScalar**)&pp);
483:   VecRestoreArray(vq,(PetscScalar**)&pq);
484:   VecRestoreArray(vc,(PetscScalar**)&pc);
485:   VecRestoreArray(vd,(PetscScalar**)&pd);
486:   VecRestoreArray(vg0,(PetscScalar**)&pg0);
487:   VecRestoreArray(vh0,(PetscScalar**)&ph0);
488:   VecRestoreArray(vg1,(PetscScalar**)&pg1);
489:   VecRestoreArray(vh1,(PetscScalar**)&ph1);
490:   VecRestoreArray(vs,(PetscScalar**)&ps);
491:   VecRestoreArray(va1,(PetscScalar**)&pa1);
492:   VecRestoreArray(vb1,(PetscScalar**)&pb1);
493:   VecRestoreArray(ve,(PetscScalar**)&pe);
494:   VecRestoreArray(vf,(PetscScalar**)&pf);
495:   VecRestoreArray(vm,(PetscScalar**)&pm);
496:   VecRestoreArray(vn,(PetscScalar**)&pn);
497:   VecRestoreArray(vu,(PetscScalar**)&pu);
498:   return 0;
499: }

501: /*
502:      KSPSetUp_PIPECG2 - Sets up the workspace needed by the PIPECG method.

504:       This is called once, usually automatically by KSPSolve() or KSPSetUp()
505:      but can be called directly by KSPSetUp()
506: */
507: static  PetscErrorCode KSPSetUp_PIPECG2(KSP ksp)
508: {
509:   /* get work vectors needed by PIPECG2 */
510:   KSPSetWorkVecs(ksp,20);
511:   return 0;
512: }

514: /*
515:  KSPSolve_PIPECG2 - This routine actually applies the PIPECG2 method
516: */
517: static PetscErrorCode  KSPSolve_PIPECG2(KSP ksp)
518: {
519:   PetscInt       i,n;
520:   PetscScalar    alpha[2],beta[2],gamma[2],delta[2],lambda[15];
521:   PetscScalar    dps = 0.0,alphaold=0.0;
522:   PetscReal      dp = 0.0;
523:   Vec            X,B,Z,P,W,Q,U,M,N,R,S,C,D,E,F,G[2],H[2],A1,B1;
524:   Mat            Amat,Pmat;
525:   PetscBool      diagonalscale;
526:   MPI_Comm       pcomm;
527:   MPI_Request    req;
528:   MPI_Status     stat;

530:   pcomm = PetscObjectComm((PetscObject)ksp);
531:   PCGetDiagonalScale(ksp->pc,&diagonalscale);

534:   X = ksp->vec_sol;
535:   B = ksp->vec_rhs;
536:   M = ksp->work[0];
537:   Z = ksp->work[1];
538:   P = ksp->work[2];
539:   N = ksp->work[3];
540:   W = ksp->work[4];
541:   Q = ksp->work[5];
542:   U = ksp->work[6];
543:   R = ksp->work[7];
544:   S = ksp->work[8];
545:   C = ksp->work[9];
546:   D = ksp->work[10];
547:   E = ksp->work[11];
548:   F = ksp->work[12];
549:   G[0]  = ksp->work[13];
550:   H[0] = ksp->work[14];
551:   G[1]  = ksp->work[15];
552:   H[1] = ksp->work[16];
553:   A1 = ksp->work[17];
554:   B1 = ksp->work[18];

556:   PetscMemzero(alpha,2*sizeof(PetscScalar));
557:   PetscMemzero(beta,2*sizeof(PetscScalar));
558:   PetscMemzero(gamma,2*sizeof(PetscScalar));
559:   PetscMemzero(delta,2*sizeof(PetscScalar));
560:   PetscMemzero(lambda,15*sizeof(PetscScalar));

562:   VecGetLocalSize(B,&n);
563:   PCGetOperators(ksp->pc,&Amat,&Pmat);

565:   ksp->its = 0;
566:   if (!ksp->guess_zero) {
567:     KSP_MatMult(ksp,Amat,X,R);             /*  r <- b - Ax  */
568:     VecAYPX(R,-1.0,B);
569:   } else {
570:     VecCopy(B,R);                          /*  r <- b (x is 0) */
571:   }

573:   KSP_PCApply(ksp,R,U);                    /*  u <- Br  */
574:   KSP_MatMult(ksp,Amat,U,W);               /*  w <- Au  */

576:   VecMergedDot_Private(U,W,R,ksp->normtype,&gamma[0],&delta[0],&dps);     /*  gamma  <- r'*u , delta <- w'*u , dp <- u'*u or r'*r or r'*u depending on ksp_norm_type  */
577:   lambda[10]= gamma[0];
578:   lambda[11]= delta[0];
579:   lambda[12] = dps;

581: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
582:   MPI_Iallreduce(MPI_IN_PLACE,&lambda[10],3,MPIU_SCALAR,MPIU_SUM,pcomm,&req);
583: #else
584:   MPIU_Allreduce(MPI_IN_PLACE,&lambda[10],3,MPIU_SCALAR,MPIU_SUM,pcomm);
585:   req  = MPI_REQUEST_NULL;
586: #endif

588:   KSP_PCApply(ksp,W,M);                    /*  m <- Bw  */
589:   KSP_MatMult(ksp,Amat,M,N);               /*  n <- Am  */

591:   KSP_PCApply(ksp,N,G[0]);                 /*  g <- Bn  */
592:   KSP_MatMult(ksp,Amat,G[0],H[0]);         /*  h <- Ag  */

594:   KSP_PCApply(ksp,H[0],E);                 /*  e <- Bh  */
595:   KSP_MatMult(ksp,Amat,E,F);               /*  f <- Ae  */

597:   MPI_Wait(&req,&stat);

599:   gamma[0] = lambda[10];
600:   delta[0] = lambda[11];
601:   dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));

603:   VecMergedDot2_Private(N,M,W,&lambda[1],&lambda[6]);     /*  lambda_1 <- w'*m , lambda_4 <- n'*m  */
604:   MPIU_Allreduce(MPI_IN_PLACE,&lambda[1],1,MPIU_SCALAR,MPIU_SUM,pcomm);
605:   MPIU_Allreduce(MPI_IN_PLACE,&lambda[6],1,MPIU_SCALAR,MPIU_SUM,pcomm);

607:   lambda[5] = PetscConj(lambda[1]);
608:   lambda[13] = PetscConj(lambda[11]);

610:   KSPLogResidualHistory(ksp,dp);
611:   KSPMonitor(ksp,0,dp);
612:   ksp->rnorm = dp;

614:   (*ksp->converged)(ksp,0,dp,&ksp->reason,ksp->cnvP); /* test for convergence */
615:   if (ksp->reason) return 0;

617:   for (i=2; i<ksp->max_it; i+=2) {
618:     if (i == 2) {
619:       beta[0] = 0;
620:       alpha[0] = gamma[0]/delta[0];

622:       gamma[1] = gamma[0] - alpha[0] * lambda[13] - alpha[0] * delta[0] + alpha[0] * alpha[0] * lambda[1];
623:       delta[1] = delta[0] - alpha[0] * lambda[1] - alpha[0] * lambda[5] + alpha[0]*alpha[0]*lambda[6];

625:       beta[1]  = gamma[1] / gamma[0];
626:       alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);

628:       VecMergedOpsShort_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda);
629:     } else {
630:       beta[0]  = gamma[1] / gamma[0];
631:       alpha[0] = gamma[1] / (delta[1] - beta[0] / alpha[1] * gamma[1]);

633:       gamma[0] = gamma[1];
634:       delta[0] = delta[1];

636:       gamma[1] = gamma[0] - alpha[0]*(lambda[13] + beta[0]*lambda[14]) - alpha[0] *(delta[0] + beta[0] * lambda[0]) + alpha[0] * alpha[0] * (lambda[1] + beta[0] * lambda[2] + beta[0] * lambda[3] + beta[0] * beta[0] * lambda[4]);

638:       delta[1] = delta[0] - alpha[0] * (lambda[1] + beta[0]* lambda[2]) - alpha[0] *(lambda[5] + beta[0] * lambda[3]) + alpha[0]*alpha[0] * (lambda[6] + beta[0] * lambda[7] + beta[0] *lambda[8] + beta[0] * beta[0] * lambda[9]);

640:       beta[1]  = gamma[1] / gamma[0];
641:       alpha[1] = gamma[1] / (delta[1] - beta[1] / alpha[0] * gamma[1]);

643:       VecMergedOps_Private(X,R,Z,W,P,Q,C,D,G[0],H[0],G[1],H[1],S,A1,B1,E,F,M,N,U,ksp->normtype,beta[0],alpha[0],beta[1],alpha[1],lambda,alphaold);
644:     }

646:     gamma[0] = gamma[1];
647:     delta[0] = delta[1];

649: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
650:     MPI_Iallreduce(MPI_IN_PLACE,lambda,15,MPIU_SCALAR,MPIU_SUM,pcomm,&req);
651: #else
652:     MPIU_Allreduce(MPI_IN_PLACE,lambda,15,MPIU_SCALAR,MPIU_SUM,pcomm);
653:     req  = MPI_REQUEST_NULL;
654: #endif

656:     KSP_PCApply(ksp,N,G[0]);                       /*  g <- Bn  */
657:     KSP_MatMult(ksp,Amat,G[0],H[0]);               /*  h <- Ag  */

659:     KSP_PCApply(ksp,H[0],E);               /*  e <- Bh  */
660:     KSP_MatMult(ksp,Amat,E,F);             /*  f <- Ae */

662:     MPI_Wait(&req,&stat);

664:     gamma[1] = lambda[10];
665:     delta[1] = lambda[11];
666:     dp = PetscSqrtReal(PetscAbsScalar(lambda[12]));

668:     alphaold = alpha[1];
669:     ksp->its = i;

671:     if (i > 0) {
672:       if (ksp->normtype == KSP_NORM_NONE) dp = 0.0;
673:       ksp->rnorm = dp;
674:       KSPLogResidualHistory(ksp,dp);
675:       KSPMonitor(ksp,i,dp);
676:       (*ksp->converged)(ksp,i,dp,&ksp->reason,ksp->cnvP);
677:       if (ksp->reason) break;
678:     }
679:   }

681:   if (i >= ksp->max_it) ksp->reason = KSP_DIVERGED_ITS;
682:   return 0;
683: }

685: /*MC
686:    KSPPIPECG2 - Pipelined conjugate gradient method with a single non-blocking allreduce per two iterations.

688:    This method has only a single non-blocking reduction per two iterations, compared to 2 blocking for standard CG.  The
689:    non-blocking reduction is overlapped by two matrix-vector products and two preconditioner applications.

691:    Level: intermediate

693:    Notes:
694:    MPI configuration may be necessary for reductions to make asynchronous progress, which is important for performance of pipelined methods.
695:    See the FAQ on the PETSc website for details.

697:    Contributed by:
698:    Manasi Tiwari, Computational and Data Sciences, Indian Institute of Science, Bangalore

700:    Reference:
701:    Manasi Tiwari and Sathish Vadhiyar, "Pipelined Conjugate Gradient Methods for Distributed Memory Systems",
702:    Submitted to International Conference on High Performance Computing, Data and Analytics 2020.

704: .seealso: KSPCreate(), KSPSetType(), KSPCG, KSPPIPECG
705: M*/
706: PETSC_EXTERN PetscErrorCode KSPCreate_PIPECG2(KSP ksp)
707: {
708:   KSPSetSupportedNorm(ksp,KSP_NORM_UNPRECONDITIONED,PC_LEFT,2);
709:   KSPSetSupportedNorm(ksp,KSP_NORM_PRECONDITIONED,PC_LEFT,2);
710:   KSPSetSupportedNorm(ksp,KSP_NORM_NATURAL,PC_LEFT,2);
711:   KSPSetSupportedNorm(ksp,KSP_NORM_NONE,PC_LEFT,1);

713:   ksp->ops->setup          = KSPSetUp_PIPECG2;
714:   ksp->ops->solve          = KSPSolve_PIPECG2;
715:   ksp->ops->destroy        = KSPDestroyDefault;
716:   ksp->ops->view           = NULL;
717:   ksp->ops->setfromoptions = NULL;
718:   ksp->ops->buildsolution  = KSPBuildSolutionDefault;
719:   ksp->ops->buildresidual  = KSPBuildResidualDefault;
720:   return 0;
721: }