Actual source code: landaucu.cu

  1: /*
  2:   Implements the Landau kernel
  3: */
  4: #include <petscconf.h>
  5: #include <petsc/private/dmpleximpl.h>
  6: #include <petsclandau.h>
  7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
  8: #include <../src/mat/impls/aij/seq/aij.h>
  9: #include <petscmat.h>
 10: #include <petscdevice.h>

 12: #include "../land_tensors.h"
 13: #include <petscaijdevice.h>

 15: #define CHECK_LAUNCH_ERROR()                                                             \
 16: do {                                                                                     \
 17:   /* Check synchronous errors, i.e. pre-launch */                                        \
 18:   cudaError_t err = cudaGetLastError();                                                  \
 19:   if (cudaSuccess != err) {                                                              \
 20:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 21:   }                                                                                      \
 22:   /* Check asynchronous errors, i.e. kernel failed (ULF) */                              \
 23:   err = cudaDeviceSynchronize();                                                         \
 24:   if (cudaSuccess != err) {                                                              \
 25:     SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
 26:   }                                                                                      \
 27:  } while (0)
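
/* Usage sketch (illustrative): every kernel launch later in this file is followed
   immediately by this macro, e.g.

     landau_mass<<<num_cells_tot,dimBlock>>>(...);
     CHECK_LAUNCH_ERROR(); // catches bad launch configurations and in-kernel failures

   Because the macro calls cudaDeviceSynchronize(), it blocks until the kernel has
   completed, which also makes the surrounding PetscLogGpuTime* intervals meaningful. */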

 29: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE],
 30:                                                     PetscInt Nf[], PetscInt Nq, PetscInt grid)
 31: {
 32:   P4estVertexMaps h_maps;
 33:   cudaError_t     cerr;
 35:   h_maps.num_elements = maps[grid].num_elements;
 36:   h_maps.num_face = maps[grid].num_face;
 37:   h_maps.num_reduced = maps[grid].num_reduced;
 38:   h_maps.deviceType = maps[grid].deviceType;
 39:   h_maps.Nf = Nf[grid];
 40:   h_maps.numgrids = maps[grid].numgrids;
 41:   h_maps.Nq = Nq;
 42:   cerr = cudaMalloc((void **)&h_maps.c_maps,                    maps[grid].num_reduced  * sizeof *pointMaps);CHKERRCUDA(cerr);
 43:   cerr = cudaMemcpy(          h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced  * sizeof *pointMaps, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 44:   cerr = cudaMalloc((void **)&h_maps.gIdx,                 maps[grid].num_elements * sizeof *maps[grid].gIdx);CHKERRCUDA(cerr);
 45:   cerr = cudaMemcpy(          h_maps.gIdx, maps[grid].gIdx,maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 46:   cerr = cudaMalloc((void **)&maps[grid].d_self,            sizeof(P4estVertexMaps));CHKERRCUDA(cerr);
 47:   cerr = cudaMemcpy(          maps[grid].d_self,   &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 48:   return(0);
 49: }
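
/* Note on the mirroring pattern above: a host-side P4estVertexMaps (h_maps) is filled
   with device pointers obtained from cudaMalloc/cudaMemcpy, and then the whole struct
   is copied to the device at maps[grid].d_self. LandauCUDADestroyMatMaps below reverses
   this: it copies the struct back to the host to recover the device pointers before
   freeing them. */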

 51: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
 52: {
 53:   cudaError_t     cerr;
 55:   for (PetscInt grid=0;grid<num_grids;grid++) {
 56:     P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
 57:     cerr = cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
 58:     cerr = cudaFree(h_maps.c_maps);CHKERRCUDA(cerr);
 59:     cerr = cudaFree(h_maps.gIdx);CHKERRCUDA(cerr);
 60:     cerr = cudaFree(d_maps);CHKERRCUDA(cerr);
 61:   }
 62:   return(0);
 63: }

 65: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[],
 66:                                        PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[],
 67:                                        PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
 68: {
 69:   PetscErrorCode  ierr;
 70:   PetscTabulation *Tf;
 71:   PetscReal       *BB,*DD;
 72:   PetscInt        dim,Nb=Nq,szf=sizeof(PetscReal),szs=sizeof(PetscScalar),szi=sizeof(PetscInt);
 73:   PetscInt        h_ip_offset[LANDAU_MAX_GRIDS+1],h_ipf_offset[LANDAU_MAX_GRIDS+1],h_elem_offset[LANDAU_MAX_GRIDS+1],nip,IPfdf_sz,Nf;
 74:   PetscDS         prob;
 75:   cudaError_t     cerr;

 78:   DMGetDimension(plex, &dim);
 79:   DMGetDS(plex, &prob);
 80:   if (LANDAU_DIM != dim) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "dim %D != LANDAU_DIM %d",dim,LANDAU_DIM);
 81:   PetscDSGetTabulation(prob, &Tf);
 82:   BB   = Tf[0]->T[0]; DD = Tf[0]->T[1];
 83:   Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
 84:   nip = 0;
 85:   IPfdf_sz = 0;
 86:   for (PetscInt grid=0 ; grid<num_grids ; grid++) {
 87:     PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
 88:     h_elem_offset[grid+1] = h_elem_offset[grid] + a_numCells[grid];
 89:     nip += a_numCells[grid]*Nq;
 90:     h_ip_offset[grid+1] = nip;
 91:     IPfdf_sz += Nq*nfloc*a_numCells[grid];
 92:     h_ipf_offset[grid+1] = IPfdf_sz;
 93:   }
 94:   Nf = a_species_offset[num_grids];
 95:   {
 96:     cerr = cudaMalloc((void **)&SData_d->B,      Nq*Nb*szf);CHKERRCUDA(cerr);     // kernel input
 97:     cerr = cudaMemcpy(          SData_d->B, BB,  Nq*Nb*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 98:     cerr = cudaMalloc((void **)&SData_d->D,      Nq*Nb*dim*szf);CHKERRCUDA(cerr); // kernel input
 99:     cerr = cudaMemcpy(          SData_d->D, DD,  Nq*Nb*dim*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

101:     cerr = cudaMalloc((void **)&SData_d->alpha, Nf*szf);CHKERRCUDA(cerr); // kernel input
102:     cerr = cudaMalloc((void **)&SData_d->beta,  Nf*szf);CHKERRCUDA(cerr); // kernel input
103:     cerr = cudaMalloc((void **)&SData_d->invMass,  Nf*szf);CHKERRCUDA(cerr); // kernel input

105:     cerr = cudaMemcpy(SData_d->alpha,  nu_alpha, Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
106:     cerr = cudaMemcpy(SData_d->beta,   nu_beta,  Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
107:     cerr = cudaMemcpy(SData_d->invMass,a_invMass,Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

109:     // collect geometry
110:     cerr = cudaMalloc((void **)&SData_d->invJ,   nip*dim*dim*szf);CHKERRCUDA(cerr); // kernel input
111:     cerr = cudaMemcpy(SData_d->invJ,   a_invJ,   nip*dim*dim*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
112:     cerr = cudaMalloc((void **)&SData_d->x,      nip*szf);CHKERRCUDA(cerr);     // kernel input
113:     cerr = cudaMemcpy(          SData_d->x, a_x, nip*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
114:     cerr = cudaMalloc((void **)&SData_d->y,      nip*szf);CHKERRCUDA(cerr); // kernel input
115:     cerr = cudaMemcpy(          SData_d->y, a_y, nip*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
116: #if LANDAU_DIM==3
117:     cerr = cudaMalloc((void **)&SData_d->z,      nip*szf);CHKERRCUDA(cerr); // kernel input
118:     cerr = cudaMemcpy(          SData_d->z, a_z, nip*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
119: #endif
120:     cerr = cudaMalloc((void **)&SData_d->w,      nip*szf);CHKERRCUDA(cerr); // kernel input
121:     cerr = cudaMemcpy(          SData_d->w, a_w, nip*szf,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

123:     cerr = cudaMalloc((void **)&SData_d->NCells,              num_grids*szi);CHKERRCUDA(cerr);
124:     cerr = cudaMemcpy(          SData_d->NCells, a_numCells,  num_grids*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
125:     cerr = cudaMalloc((void **)&SData_d->species_offset,                   (num_grids+1)*szi);CHKERRCUDA(cerr);
126:     cerr = cudaMemcpy(          SData_d->species_offset, a_species_offset, (num_grids+1)*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
127:     cerr = cudaMalloc((void **)&SData_d->mat_offset,                      (num_grids+1)*szi);CHKERRCUDA(cerr);
128:     cerr = cudaMemcpy(          SData_d->mat_offset, a_mat_offset,       (num_grids+1)*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
129:     cerr = cudaMalloc((void **)&SData_d->ip_offset,                       (num_grids+1)*szi);CHKERRCUDA(cerr);
130:     cerr = cudaMemcpy(          SData_d->ip_offset, h_ip_offset,          (num_grids+1)*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
131:     cerr = cudaMalloc((void **)&SData_d->ipf_offset,                     (num_grids+1)*szi);CHKERRCUDA(cerr);
132:     cerr = cudaMemcpy(          SData_d->ipf_offset, h_ipf_offset,     (num_grids+1)*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
133:     cerr = cudaMalloc((void **)&SData_d->elem_offset,                     (num_grids+1)*szi);CHKERRCUDA(cerr);
134:     cerr = cudaMemcpy(          SData_d->elem_offset, h_elem_offset,     (num_grids+1)*szi,   cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
135:     // allocate space for dynamic data once
136:     cerr = cudaMalloc((void **)&SData_d->Eq_m,       Nf*szf);CHKERRCUDA(cerr);
137:     cerr = cudaMalloc((void **)&SData_d->f,      nip*Nf*szs);CHKERRCUDA(cerr);
138:     cerr = cudaMalloc((void **)&SData_d->dfdx,   nip*Nf*szs);CHKERRCUDA(cerr);
139:     cerr = cudaMalloc((void **)&SData_d->dfdy,   nip*Nf*szs);CHKERRCUDA(cerr);
140: #if LANDAU_DIM==3
141:     cerr = cudaMalloc((void **)&SData_d->dfdz,   nip*Nf*szs);CHKERRCUDA(cerr);     // kernel input
142: #endif
143:     cerr = cudaMalloc((void**)&SData_d->maps, num_grids*sizeof(P4estVertexMaps*));CHKERRCUDA(cerr);
144:   }
145:   return(0);
146: }
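
/* The offset tables built above are what the kernels use to locate per-grid data.
   A small worked example with hypothetical sizes (two grids, Nq = 4,
   a_numCells = {4, 2}, local species per grid = {2, 1}):

     elem_offset = {0, 4, 6}     // global cell index -> grid
     ip_offset   = {0, 16, 24}   // global integration-point index per grid
     ipf_offset  = {0, 32, 40}   // (integration point) x (local species) index per grid

   so grid 1 starts at cell 4, integration point 16, and f/df entry 32. */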

148: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
149: {
150:   cudaError_t     cerr;

153:   if (SData_d->alpha) {
154:     cerr = cudaFree(SData_d->alpha);CHKERRCUDA(cerr);
155:     SData_d->alpha = NULL;
156:     cerr = cudaFree(SData_d->beta);CHKERRCUDA(cerr);
157:     cerr = cudaFree(SData_d->invMass);CHKERRCUDA(cerr);
158:     cerr = cudaFree(SData_d->B);CHKERRCUDA(cerr);
159:     cerr = cudaFree(SData_d->D);CHKERRCUDA(cerr);
160:     cerr = cudaFree(SData_d->invJ);CHKERRCUDA(cerr);
161: #if LANDAU_DIM==3
162:     cerr = cudaFree(SData_d->z);CHKERRCUDA(cerr);
163: #endif
164:     cerr = cudaFree(SData_d->x);CHKERRCUDA(cerr);
165:     cerr = cudaFree(SData_d->y);CHKERRCUDA(cerr);
166:     cerr = cudaFree(SData_d->w);CHKERRCUDA(cerr);
167:     // dynamic data
168:     cerr = cudaFree(SData_d->Eq_m);CHKERRCUDA(cerr);
169:     cerr = cudaFree(SData_d->f);CHKERRCUDA(cerr);
170:     cerr = cudaFree(SData_d->dfdx);CHKERRCUDA(cerr);
171:     cerr = cudaFree(SData_d->dfdy);CHKERRCUDA(cerr);
172: #if LANDAU_DIM==3
173:     cerr = cudaFree(SData_d->dfdz);CHKERRCUDA(cerr);
174: #endif
175:     cerr = cudaFree(SData_d->NCells);CHKERRCUDA(cerr);
176:     cerr = cudaFree(SData_d->species_offset);CHKERRCUDA(cerr);
177:     cerr = cudaFree(SData_d->mat_offset);CHKERRCUDA(cerr);
178:     cerr = cudaFree(SData_d->ip_offset);CHKERRCUDA(cerr);
179:     cerr = cudaFree(SData_d->ipf_offset);CHKERRCUDA(cerr);
180:     cerr = cudaFree(SData_d->elem_offset);CHKERRCUDA(cerr);
181:     cerr = cudaFree(SData_d->maps);CHKERRCUDA(cerr);
182:   }
183:   return(0);
184: }
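
/* The single guard on SData_d->alpha above assumes the buffers are only ever allocated
   together by LandauCUDAStaticDataSet(), so one non-NULL pointer implies all of them
   are valid and can be freed. */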

186: // The GPU kernel that forms f and df/dx at the integration points
187: //
188: __global__
189: void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscReal d_invJ[],
190:                      const PetscReal * const BB, const PetscReal * const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[],
191:                      PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
192: #if LANDAU_DIM==3
193:                      PetscReal d_dfdz[],
194: #endif
195:                      const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
196: {
197:   const PetscInt    Nq = blockDim.y, g_cell = blockIdx.x;
198:   const PetscInt    myQi = threadIdx.y;
199:   const PetscReal   *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
200:   PetscInt          grid = 0, f,d,b,e,q;
201:   while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
202:   {
203:     const PetscInt         moffset = d_mat_offset[grid], nip_loc = d_numCells[grid]*Nq, Nfloc = d_species_offset[grid+1] - d_species_offset[grid], elem = g_cell -  d_elem_offset[grid];
204:     const PetscInt         IP_idx = d_ip_offset[grid], IPf_vertex_idx = d_ipf_offset[grid];
205:     const PetscInt         ipidx_g = myQi + elem*Nq, ipidx = IP_idx + ipidx_g;
206:     const PetscReal *const invJj = &d_invJ[ipidx*dim*dim];
207:     PetscReal              u_x[LANDAU_MAX_SPECIES][LANDAU_DIM];
208:     const PetscScalar      *coef;
209:     PetscScalar            coef_buff[LANDAU_MAX_SPECIES*LANDAU_MAX_NQ];
210:     if (!d_maps) {
211:       coef = &d_vertex_f[elem*Nb*Nfloc + IPf_vertex_idx]; // closure and IP indexing are the same
212:     } else {
213:       coef = coef_buff;
214:       for (f = 0; f < Nfloc ; ++f) {
215:         LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][f][0];
216:         for (b = 0; b < Nb; ++b) {
217:           PetscInt idx = Idxs[b];
218:           if (idx >= 0) {
219:             coef_buff[f*Nb+b] = d_vertex_f[idx+moffset];
220:           } else {
221:             idx = -idx - 1;
222:             coef_buff[f*Nb+b] = 0;
223:             for (q = 0; q < d_maps[grid]->num_face; q++) {
224:               PetscInt  id    = d_maps[grid]->c_maps[idx][q].gid;
225:               PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
226:               coef_buff[f*Nb+b] += scale*d_vertex_f[id+moffset];
227:             }
228:           }
229:         }
230:       }
231:     }

233:     /* get f and df */
234:     for (f = threadIdx.x; f < Nfloc; f += blockDim.x) {
235:       PetscReal      refSpaceDer[LANDAU_DIM];
236:       const PetscInt idx = IPf_vertex_idx + f*nip_loc + ipidx_g;
237:       d_f[idx] = 0.0;
238:       for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
239:       for (b = 0; b < Nb; ++b) {
240:         const PetscInt    cidx = b;
241:         d_f[idx] += Bq[cidx]*PetscRealPart(coef[f*Nb+cidx]);
242:         for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*PetscRealPart(coef[f*Nb+cidx]);
243:       }
244:       for (d = 0; d < dim; ++d) {
245:         for (e = 0, u_x[f][d] = 0.0; e < dim; ++e) {
246:           u_x[f][d] += invJj[e*dim+d]*refSpaceDer[e];
247:         }
248:       }
249:     }
250:     for (f = threadIdx.x; f < Nfloc ; f += blockDim.x) {
251:       const PetscInt idx = IPf_vertex_idx + f*nip_loc + ipidx_g;
252:       d_dfdx[idx] = u_x[f][0];
253:       d_dfdy[idx] = u_x[f][1];
254: #if LANDAU_DIM==3
255:       d_dfdz[idx] = u_x[f][2];
256: #endif
257:     }
258:   }
259: }
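
/* Thread mapping used by landau_form_fdf (as launched from LandauCUDAJacobian below):
   one block per cell over all grids (gridDim.x = num_cells_tot), threadIdx.y is the
   quadrature point (blockDim.y = Nq), and threadIdx.x strides over the grid-local
   species. Outputs are stored as d_f[ipf_offset[grid] + f*nip_loc + ip], the same
   layout read back by jac_kernel. */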

261: __device__ void
262: jac_kernel(const PetscInt myQi, const PetscInt jpidx, PetscInt nip_global, const PetscInt Nq, const PetscInt grid,
263:            const PetscInt dim,  const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[],
264:            const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
265:            const PetscReal * const BB, const PetscReal * const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
266:            PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
267:            PetscReal s_scale[][LANDAU_MAX_Q_FACE],
268:            PetscInt  s_idx[][LANDAU_MAX_Q_FACE],
269:            PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
270:            PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
271:            PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
272:            PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
273:            PetscReal s_nu_alpha[],
274:            PetscReal s_nu_beta[],
275:            PetscReal s_invMass[],
276:            PetscReal s_f[],
277:            PetscReal s_dfx[],
278:            PetscReal s_dfy[],
279:            PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
280: #if LANDAU_DIM==3
281:            const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
282: #endif
283:            const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
284: {
285:   int             delta,d,f,g,d2,dp,d3,fieldA,ipidx_b;
286:   PetscReal       gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
287: #if LANDAU_DIM==2
288:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
289: #else
290:   const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
291: #endif
292:   const PetscInt  moffset = d_mat_offset[grid], Nfloc = d_species_offset[grid+1]-d_species_offset[grid], g_cell = blockIdx.x, elem = g_cell - d_elem_offset[grid];
293:   const PetscInt  f_off = d_species_offset[grid], Nb=Nq;

295:   // create g2 & g3
296:   for (f=threadIdx.x; f<Nfloc; f+=blockDim.x) {
297:     for (d=0;d<dim;d++) { // clear accumulation data D & K
298:       s_gg2[d][myQi][f] = 0;
299:       for (d2=0;d2<dim;d2++) s_gg3[d][d2][myQi][f] = 0;
300:     }
301:   }
302:   for (d2 = 0; d2 < dim; d2++) {
303:     gg2_temp[d2] = 0;
304:     for (d3 = 0; d3 < dim; d3++) {
305:       gg3_temp[d2][d3] = 0;
306:     }
307:   }
308:   if (threadIdx.y == 0) {
309:     // copy species into shared memory
310:     for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
311:       s_nu_alpha[fieldA] = nu_alpha[fieldA];
312:       s_nu_beta[fieldA] = nu_beta[fieldA];
313:       s_invMass[fieldA] = invMass[fieldA];
314:     }
315:   }
316:   __syncthreads();
317:   // inner integral, collect gg2/3
318:   for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
319:     const PetscInt ipidx = ipidx_b + threadIdx.x;
320:     PetscInt       f_off_r,grid_r,Nfloc_r,nip_loc_r,ipidx_g,fieldB,IPf_idx_r;
321:     __syncthreads();
322:     if (ipidx < nip_global) {
323:       grid_r = 0;
324:       while (ipidx >= d_ip_offset[grid_r+1]) grid_r++; // yuck search for grid
325:       f_off_r = d_species_offset[grid_r];
326:       ipidx_g = ipidx - d_ip_offset[grid_r];
327:       nip_loc_r = d_numCells[grid_r]*Nq;
328:       Nfloc_r = d_species_offset[grid_r+1] - d_species_offset[grid_r];
329:       IPf_idx_r = d_ipf_offset[grid_r];
330:       for (fieldB = threadIdx.y; fieldB < Nfloc_r ; fieldB += blockDim.y) {
331:         const PetscInt idx = IPf_idx_r + fieldB*nip_loc_r + ipidx_g;
332:         s_f  [fieldB*blockDim.x + threadIdx.x] =    d_f[idx]; // all vector threads get a copy of the data (Peng: why?)
333:         s_dfx[fieldB*blockDim.x + threadIdx.x] = d_dfdx[idx];
334:         s_dfy[fieldB*blockDim.x + threadIdx.x] = d_dfdy[idx];
335: #if LANDAU_DIM==3
336:         s_dfz[fieldB*blockDim.x + threadIdx.x] = d_dfdz[idx];
337: #endif
338:       }
339:     }
340:     __syncthreads();
341:     if (ipidx < nip_global) {
342:       const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
343:       PetscReal       temp1[3] = {0, 0, 0}, temp2 = 0;
344: #if LANDAU_DIM==2
345:       PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
346:       LandauTensor2D(vj, x, y, Ud, Uk, mask);
347: #else
348:       PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2]-z) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
349:       LandauTensor3D(vj, x, y, z, U, mask);
350: #endif
351:       for (fieldB = 0; fieldB < Nfloc_r; fieldB++) {
352:         temp1[0] += s_dfx[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
353:         temp1[1] += s_dfy[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
354: #if LANDAU_DIM==3
355:         temp1[2] += s_dfz[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
356: #endif
357:         temp2    += s_f  [fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r];
358:       }
359:       temp1[0] *= wi;
360:       temp1[1] *= wi;
361: #if LANDAU_DIM==3
362:       temp1[2] *= wi;
363: #endif
364:       temp2    *= wi;
365: #if LANDAU_DIM==2
366:       for (d2 = 0; d2 < 2; d2++) {
367:         for (d3 = 0; d3 < 2; ++d3) {
368:           /* K = U * grad(f): g2=e: i,A */
369:           gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
370:           /* D = -U * (I \kron (fx)): g3=f: i,j,A */
371:           gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
372:         }
373:       }
374: #else
375:       for (d2 = 0; d2 < 3; ++d2) {
376:         for (d3 = 0; d3 < 3; ++d3) {
377:           /* K = U * grad(f): g2 = e: i,A */
378:           gg2_temp[d2] += U[d2][d3]*temp1[d3];
379:           /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
380:           gg3_temp[d2][d3] += U[d2][d3]*temp2;
381:         }
382:       }
383: #endif
384:     }
385:   } /* IPs */

387:   /* reduce gg temp sums across threads */
388:   for (delta = blockDim.x/2; delta > 0; delta /= 2) {
389:     for (d2 = 0; d2 < dim; d2++) {
390:       gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
391:       for (d3 = 0; d3 < dim; d3++) {
392:         gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
393:       }
394:     }
395:   }
396:   // add alpha and put in gg2/3
397:   for (fieldA = threadIdx.x; fieldA < Nfloc; fieldA += blockDim.x) {
398:     for (d2 = 0; d2 < dim; d2++) {
399:       s_gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA+f_off];
400:       for (d3 = 0; d3 < dim; d3++) {
401:         s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA+f_off]*s_invMass[fieldA+f_off];
402:       }
403:     }
404:   }
405:   __syncthreads();
406:   /* add electric field term once per IP */
407:   for (fieldA = threadIdx.x ; fieldA < Nfloc ; fieldA += blockDim.x) {
408:     s_gg2[dim-1][myQi][fieldA] += Eq_m[fieldA+f_off];
409:   }
410:   __syncthreads();
411:   /* Jacobian transform - g2 */
412:   for (fieldA = threadIdx.x ; fieldA < Nfloc ; fieldA += blockDim.x) {
413:     PetscReal wj = ww[jpidx];
414:     for (d = 0; d < dim; ++d) {
415:       s_g2[d][myQi][fieldA] = 0.0;
416:       for (d2 = 0; d2 < dim; ++d2) {
417:         s_g2[d][myQi][fieldA] += invJj[d*dim+d2]*s_gg2[d2][myQi][fieldA];
418:         s_g3[d][d2][myQi][fieldA] = 0.0;
419:         for (d3 = 0; d3 < dim; ++d3) {
420:           for (dp = 0; dp < dim; ++dp) {
421:             s_g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*s_gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
422:           }
423:         }
424:         s_g3[d][d2][myQi][fieldA] *= wj;
425:       }
426:       s_g2[d][myQi][fieldA] *= wj;
427:     }
428:   }
429:   __syncthreads();  // Synchronize (ensure all the data is available) and sum IP matrices
430:   /* FE matrix construction */
431:   {
432:     int fieldA,d,qj,d2,q,idx,totDim=Nb*Nfloc;
433:     /* assemble */
434:     for (fieldA = 0; fieldA < Nfloc; fieldA++) {
435:       for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
436:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
437:           PetscScalar t = 0;
438:           for (qj = 0 ; qj < Nq ; qj++) {
439:             const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
440:             for (d = 0; d < dim; ++d) {
441:               t += DIq[f*dim+d]*s_g2[d][qj][fieldA]*BJq[g];
442:               for (d2 = 0; d2 < dim; ++d2) {
443:                 t += DIq[f*dim + d]*s_g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
444:               }
445:             }
446:           }
447:           if (elemMat) {
448:             const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
449:             elemMat[fOff] += t; // ????
450:           } else s_fieldMats[f][g] = t;
451:         }
452:       }
453:       if (s_fieldMats) {
454:         PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
455:         PetscInt    nr,nc;
456:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][fieldA][0];
457:         __syncthreads();
458:         if (threadIdx.y == 0) {
459:           for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
460:             idx = Idxs[f];
461:             if (idx >= 0) {
462:               s_idx[f][0] = idx + moffset;
463:               s_scale[f][0] = 1.;
464:             } else {
465:               idx = -idx - 1;
466:               for (q = 0; q < d_maps[grid]->num_face; q++) {
467:                 s_idx[f][q]   = d_maps[grid]->c_maps[idx][q].gid + moffset;
468:                 s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
469:               }
470:             }
471:           }
472:         }
473:         __syncthreads();
474:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
475:           idx = Idxs[f];
476:           if (idx >= 0) {
477:             nr = 1;
478:           } else {
479:             nr = d_maps[grid]->num_face;
480:           }
481:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
482:             idx = Idxs[g];
483:             if (idx >= 0) {
484:               nc = 1;
485:             } else {
486:               nc = d_maps[grid]->num_face;
487:             }
488:             for (q = 0; q < nr; q++) {
489:               for (d = 0; d < nc; d++) {
490:                 vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
491:               }
492:             }
493:             MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
494:           }
495:         }
496:         __syncthreads();
497:       }
498:     }
499:   }
500: }
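
/* Structure of jac_kernel: each y-slice of the block (fixed threadIdx.y) handles one
   quadrature point j of its cell and
   (1) sweeps all nip_global integration points in blockDim.x-sized chunks, staging
       f/df in shared memory and accumulating gg2/gg3 with the Landau tensors,
   (2) reduces the per-thread partial sums across the x-dimension with __shfl_xor_sync,
   (3) applies nu_alpha/invMass, adds the electric-field term, and maps to physical
       derivatives with the inverse Jacobian and quadrature weight, and
   (4) assembles either into elemMat (first-call path) or directly into the global
       matrix through the maps and MatSetValuesDevice (GPU assembly). */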

502: //
503: // The CUDA Landau kernel
504: //
505: __global__
506: void __launch_bounds__(256,4)
507:   landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscReal invJj[],
508:                   const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
509:                   const PetscReal * const BB, const PetscReal * const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[],
510:                   PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
511: #if LANDAU_DIM==3
512:                   const PetscReal zz[], PetscReal d_dfdz[],
513: #endif
514:                   const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
515: {
516:   extern __shared__ PetscReal smem[];
517:   int size = 0;
518:   PetscReal (*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]              =
519:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
520:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
521:   PetscReal (*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]  =
522:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
523:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
524:   PetscReal (*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]             =
525:     (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES])             &smem[size];
526:   size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
527:   PetscReal (*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
528:     (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
529:   size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
530:   PetscReal *s_nu_alpha = &smem[size];
531:   size += LANDAU_MAX_SPECIES;
532:   PetscReal *s_nu_beta  = &smem[size];
533:   size += LANDAU_MAX_SPECIES;
534:   PetscReal *s_invMass  = &smem[size];
535:   size += LANDAU_MAX_SPECIES;
536:   PetscReal *s_f        = &smem[size];
537:   size += blockDim.x*LANDAU_MAX_SPECIES;
538:   PetscReal *s_dfx      = &smem[size];
539:   size += blockDim.x*LANDAU_MAX_SPECIES;
540:   PetscReal *s_dfy      = &smem[size];
541:   size += blockDim.x*LANDAU_MAX_SPECIES;
542: #if LANDAU_DIM==3
543:   PetscReal *s_dfz      = &smem[size];
544:   size += blockDim.x*LANDAU_MAX_SPECIES;
545: #endif
546:   PetscScalar (*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
547:   PetscReal   (*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
548:   PetscInt    (*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
549:   PetscInt    Nq = blockDim.y, g_cell = blockIdx.x, grid = 0;
550:   PetscScalar *elemMat = NULL; /* my output */

552:   while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
553:   {
554:     const PetscInt   Nfloc = d_species_offset[grid+1]-d_species_offset[grid], totDim = Nfloc*Nq, elem = g_cell - d_elem_offset[grid];
555:     const PetscInt   IP_idx = d_ip_offset[grid];
556:     const PetscInt   myQi = threadIdx.y;
557:     const PetscInt   jpidx = IP_idx + myQi + elem * Nq;
558:     const PetscReal  *invJ = &invJj[jpidx*dim*dim];
559:     if (d_elem_mats) {
560:       elemMat = d_elem_mats; // start at the beginning
561:       for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
562:         PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
563:         elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
564:       }
565:       elemMat += elem*totDim*totDim; // index into local matrix & zero out
566:       for (int i = threadIdx.x + threadIdx.y*blockDim.x; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
567:     }
568:     __syncthreads();
569:     if (d_maps) {
570:       // reuse the space for fieldMats
571:       s_fieldMats = (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size];
572:       size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
573:       s_scale =  (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
574:       size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
575:       s_idx = (PetscInt (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
576:       size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
577:     } else {
578:       s_fieldMats = NULL;
579:     }
580:     __syncthreads();
581:     jac_kernel(myQi, jpidx, nip_global, Nq, grid, dim, xx, yy, ww,
582:                invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
583:                elemMat, d_maps, d_mat,
584:                *s_fieldMats, *s_scale, *s_idx,
585:                *s_g2, *s_g3, *s_gg2, *s_gg3,
586:                s_nu_alpha, s_nu_beta, s_invMass,
587:                s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
588: #if LANDAU_DIM==3
589:                zz, s_dfz, d_dfdz,
590: #endif
591:                d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
592:   }
593: }
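
/* Launch sketch for landau_jacobian, mirroring the host code in LandauCUDAJacobian
   below: the dynamic shared memory request must match the carving-up done at the top
   of this kernel, which the host-side count `ii` (in PetscReal units) reproduces:

     landau_jacobian<<<num_cells_tot, dimBlock, ii*szf>>>(...);

   When ii*szf exceeds the default 48 KiB (49152 bytes), cudaFuncSetAttribute is used
   to opt in to a larger per-block dynamic shared memory limit (98304 bytes below). */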

595: __global__
596: void __launch_bounds__(256,4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscReal d_w[], const PetscReal * const BB, const PetscReal * const DD,
597:                                           PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift,
598:                                           const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
599: {
600:   const PetscInt         Nq = blockDim.y, g_cell = blockIdx.x;
601:   __shared__ PetscScalar s_fieldMats[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
602:   __shared__ PetscInt    s_idx[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
603:   __shared__ PetscReal   s_scale[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
604:   int                    tid = threadIdx.x + threadIdx.y*blockDim.x;
605:   PetscScalar            *elemMat = NULL; /* my output */
606:   int                    fieldA,d,qj,q,idx,f,g, grid = 0;

608:   while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
609:   {
610:     const PetscInt moffset = d_mat_offset[grid], Nfloc = d_species_offset[grid+1]-d_species_offset[grid], totDim = Nfloc*Nq, elem = g_cell-d_elem_offset[grid];
611:     const PetscInt IP_idx = d_ip_offset[grid];
612:     if (d_elem_mats) {
613:       elemMat = d_elem_mats; // start at the beginning
614:       for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
615:         PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
616:         elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
617:       }
618:       elemMat += elem*totDim*totDim;
619:       for (int i = tid; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
620:     }
621:     __syncthreads();
622:     /* FE mass matrix construction */
623:     for (fieldA = 0; fieldA < Nfloc; fieldA++) {
624:       PetscScalar            vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
625:       PetscInt               nr,nc;
626:       for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
627:         for (g = threadIdx.x; g < Nb; g += blockDim.x) {
628:           PetscScalar t = 0;
629:           for (qj = 0 ; qj < Nq ; qj++) {
630:             const PetscReal *BJq = &BB[qj*Nb];
631:             const PetscInt jpidx = IP_idx + qj + elem * Nq;
632:             if (dim==2) {
633:               t += BJq[f] * d_w[jpidx]*shift * BJq[g] * 2. * PETSC_PI;
634:             } else {
635:               t += BJq[f] * d_w[jpidx]*shift * BJq[g];
636:             }
637:           }
638:           if (elemMat) {
639:             const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
640:             elemMat[fOff] += t; // ????
641:           } else s_fieldMats[f][g] = t;
642:         }
643:       }
644:       if (!elemMat) {
645:         const LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][fieldA][0];
646:         __syncthreads();
647:         if (threadIdx.y == 0) {
648:           for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
649:             idx = Idxs[f];
650:             if (idx >= 0) {
651:               s_idx[f][0] = idx + moffset;
652:               s_scale[f][0] = 1.;
653:             } else {
654:               idx = -idx - 1;
655:               for (q = 0; q < d_maps[grid]->num_face; q++) {
656:                 s_idx[f][q]   = d_maps[grid]->c_maps[idx][q].gid + moffset;
657:                 s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
658:               }
659:             }
660:           }
661:         }
662:         __syncthreads();
663:         for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
664:           idx = Idxs[f];
665:           if (idx >= 0) {
666:             nr = 1;
667:           } else {
668:             nr = d_maps[grid]->num_face;
669:           }
670:           for (g = threadIdx.x; g < Nb; g += blockDim.x) {
671:             idx = Idxs[g];
672:             if (idx >= 0) {
673:               nc = 1;
674:             } else {
675:               nc = d_maps[grid]->num_face;
676:             }
677:             for (q = 0; q < nr; q++) {
678:               for (d = 0; d < nc; d++) {
679:                 vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
680:               }
681:             }
682:             MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
683:           }
684:         }
685:       }
686:       __syncthreads();
687:     }
688:   }
689: }
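
/* landau_mass follows the same block-per-cell layout as landau_jacobian but only needs
   the quadrature weights: t = sum_q B_f(q) * w_q * shift * B_g(q), with an extra factor
   of 2*pi in the dim==2 branch (presumably the azimuthal factor for the axisymmetric
   2D coordinates). As in jac_kernel, the result goes either into d_elem_mats (first
   call) or straight into the matrix via the maps and MatSetValuesDevice. */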

691: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[],
692:                                   const PetscInt N, const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscInt num_sub_blocks, const PetscReal shift,
693:                                   const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
694: {
695:   PetscErrorCode    ierr;
696:   cudaError_t       cerr;
697:   PetscInt          Nb=Nq,dim,nip_global,num_cells_tot,elem_mat_size_tot;
698:   PetscInt          *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
699:   PetscInt          szf=sizeof(PetscReal),szs=sizeof(PetscScalar),Nftot=a_species_offset[num_grids];
700:   PetscReal         *d_BB=NULL,*d_DD=NULL,*d_invJj=NULL,*d_nu_alpha=NULL,*d_nu_beta=NULL,*d_invMass=NULL,*d_Eq_m=NULL,*d_x=NULL,*d_y=NULL,*d_w=NULL;
701:   PetscScalar       *d_elem_mats=NULL,*d_vertex_f=NULL;
702:   PetscReal         *d_f=NULL,*d_dfdx=NULL,*d_dfdy=NULL;
703: #if LANDAU_DIM==3
704:   PetscReal         *d_dfdz=NULL, *d_z = NULL;
705: #endif
706:   LandauCtx         *ctx;
707:   PetscSplitCSRDataStructure d_mat=NULL;
708:   P4estVertexMaps   **d_maps,*maps[LANDAU_MAX_GRIDS];
709:   int               nnn = 256/Nq; // machine dependent
710:   PetscContainer    container;
712:   PetscLogEventBegin(events[3],0,0,0,0);
713:   while (nnn & nnn - 1) nnn = nnn & nnn - 1; // round nnn down to a power of two
714:   if (nnn>4) nnn = 4; // 16 debug
715:   DMGetApplicationContext(plex[0], &ctx);
716:   if (!ctx) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
717:   DMGetDimension(plex[0], &dim);
718:   if (dim!=LANDAU_DIM) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %D != dim %d",LANDAU_DIM,dim);
719:   if (ctx->gpu_assembly) {
720:     PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
721:     if (container) { // not present on the first call
722:       static int init = 0; // hack. just do every time, or put in setup (but that is in base class code), or add init_maps flag
723:       if (!init++) {
724:         P4estVertexMaps   *h_maps=NULL;
725:         PetscContainerGetPointer(container, (void **) &h_maps);
726:         for (PetscInt grid=0 ; grid<num_grids ; grid++) {
727:           if (h_maps[grid].d_self) {
728:             maps[grid] = h_maps[grid].d_self;
729:           } else {
730:             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
731:           }
732:         }
733:         cerr = cudaMemcpy(SData_d->maps, maps, num_grids*sizeof(P4estVertexMaps*), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
734:       }
735:       d_maps = (P4estVertexMaps**)SData_d->maps;
736:       // this does the setup the first time called
737:       MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
738:     } else {
739:       d_maps = NULL;
740:     }
741:   } else {
742:     container = NULL;
743:     d_maps = NULL;
744:   }
745:   PetscLogEventEnd(events[3],0,0,0,0);
746:   {
747:     PetscInt elem_mat_size = 0;
748:     nip_global = num_cells_tot = 0;
749:     for (PetscInt grid=0 ; grid<num_grids ; grid++) {
750:       PetscInt Nfloc = a_species_offset[grid+1] - a_species_offset[grid], totDim = Nfloc*Nb;
751:       nip_global += a_numCells[grid]*Nq;
752:       num_cells_tot += a_numCells[grid]; // is in d_elem_offset
753:       elem_mat_size += a_numCells[grid]*totDim*totDim; // could be in an offset
754:     }
755:     elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
756:   }
757:   if (elem_mat_size_tot) {
758:     cerr = cudaMalloc((void **)&d_elem_mats, elem_mat_size_tot*szs);CHKERRCUDA(cerr); // kernel output - first call is on CPU
759:   } else d_elem_mats = NULL;
760:   // create data
761:   d_BB = (PetscReal*)SData_d->B;
762:   d_DD = (PetscReal*)SData_d->D;
763:   if (a_elem_closure || a_xarray) {  // form f and df
764:     PetscLogEventBegin(events[1],0,0,0,0);
765:     cerr = cudaMemcpy(SData_d->Eq_m, a_Eq_m,   Nftot*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
766:     d_invJj = (PetscReal*)SData_d->invJ;
767:     d_nu_alpha = (PetscReal*)SData_d->alpha;
768:     d_nu_beta = (PetscReal*)SData_d->beta;
769:     d_invMass = (PetscReal*)SData_d->invMass;
770:     d_x = (PetscReal*)SData_d->x;
771:     d_y = (PetscReal*)SData_d->y;
772:     d_w = (PetscReal*)SData_d->w;
773:     d_Eq_m = (PetscReal*)SData_d->Eq_m;
774:     d_dfdx = (PetscReal*)SData_d->dfdx;
775:     d_dfdy = (PetscReal*)SData_d->dfdy;
776: #if LANDAU_DIM==3
777:     d_dfdz = (PetscReal*)SData_d->dfdz;
778:     d_z = (PetscReal*)SData_d->z;
779: #endif
780:     d_f    = (PetscReal*)SData_d->f;
781:     // get a d_vertex_f
782:     if (a_elem_closure) {
783:       PetscInt closure_sz = 0; // the closure size is not available on the host, so recompute it
784:       for (PetscInt grid=0 ; grid<num_grids ; grid++) {
785:         PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
786:         closure_sz += Nq*nfloc*a_numCells[grid];
787:       }
788:       cerr = cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));CHKERRCUDA(cerr);
789:       cerr = cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz*sizeof(*a_elem_closure), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
790:     } else {
791:       d_vertex_f = (PetscScalar*)a_xarray;
792:     }
793:     PetscLogEventEnd(events[1],0,0,0,0);
794:   } else {
795:     d_w = (PetscReal*)SData_d->w;
796:   }
797:   //
798:   d_numCells = (PetscInt*)SData_d->NCells; // redundant -- remove
799:   d_species_offset = (PetscInt*)SData_d->species_offset;
800:   d_mat_offset = (PetscInt*)SData_d->mat_offset;
801:   d_ip_offset = (PetscInt*)SData_d->ip_offset;
802:   d_ipf_offset = (PetscInt*)SData_d->ipf_offset;
803:   d_elem_offset = (PetscInt*)SData_d->elem_offset;
804:   if (a_elem_closure || a_xarray) {  // form f and df
805:     dim3 dimBlockFDF(nnn>Nftot ? Nftot : nnn, Nq), dimBlock((nip_global>nnn) ? nnn : nip_global , Nq);
806:     PetscLogEventBegin(events[8],0,0,0,0);
807:     PetscLogGpuTimeBegin();
808:     PetscInfo2(plex[0], "Form F and dF/dx vectors: nip_global=%D num_grids=%D\n",nip_global,num_grids);
809:     landau_form_fdf<<<num_cells_tot,dimBlockFDF>>>(dim, Nb, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
810: #if LANDAU_DIM==3
811:                                                    d_dfdz,
812: #endif
813:                                                    d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
814:     CHECK_LAUNCH_ERROR();
815:     PetscLogGpuFlops(nip_global*(PetscLogDouble)(2*Nb*(1+dim)));
816:     if (a_elem_closure) {
817:       cerr = cudaFree(d_vertex_f);CHKERRCUDA(cerr);
818:       d_vertex_f = NULL;
819:     }
820:     PetscLogGpuTimeEnd();
821:     PetscLogEventEnd(events[8],0,0,0,0);
822:     // Jacobian
823:     PetscLogEventBegin(events[4],0,0,0,0);
824:     PetscLogGpuTimeBegin();
825:     PetscLogGpuFlops(nip_global*(PetscLogDouble)(a_elem_closure ? (nip_global*(11*Nftot+ 4*dim*dim) + 6*Nftot*dim*dim*dim + 10*Nftot*dim*dim + 4*Nftot*dim + Nb*Nftot*Nb*Nq*dim*dim*5) : Nb*Nftot*Nb*Nq*4));
826:     PetscInt ii = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES + LANDAU_MAX_NQ*LANDAU_MAX_NQ + 2*LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
827:     if (ii*szf >= 49152) {
828:       cerr = cudaFuncSetAttribute(landau_jacobian,
829:                                   cudaFuncAttributeMaxDynamicSharedMemorySize,
830:                                   98304);CHKERRCUDA(cerr);
831:     }
832:     PetscInfo3(plex[0], "Jacobian shared memory size: %D bytes, d_elem_mats=%p d_maps=%p\n",ii,d_elem_mats,d_maps);
833:     landau_jacobian<<<num_cells_tot,dimBlock,ii*szf>>>(nip_global, dim, Nb, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m,
834:                                                      d_BB, d_DD, d_x, d_y, d_w,
835:                                                      d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
836: #if LANDAU_DIM==3
837:                                                      d_z, d_dfdz,
838: #endif
839:                                                      d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
840:     CHECK_LAUNCH_ERROR(); // has sync
841:     PetscLogGpuTimeEnd();
842:     PetscLogEventEnd(events[4],0,0,0,0);
843:   } else { // mass
844:     dim3 dimBlock(nnn,Nq);
845:     PetscInfo4(plex[0], "Mass d_maps = %p. Nq=%d, vector size %d num_cells_tot=%d\n",d_maps,Nq,nnn,num_cells_tot);
846:     PetscLogEventBegin(events[4],0,0,0,0);
847:     PetscLogGpuTimeBegin();
848:     landau_mass<<<num_cells_tot,dimBlock>>>(dim, Nb, d_w, d_BB, d_DD, d_elem_mats,
849:                                             d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
850:     CHECK_LAUNCH_ERROR(); // has sync
851:     PetscLogGpuTimeEnd();
852:     PetscLogEventEnd(events[4],0,0,0,0);
853:   }
854:   // First-time assembly (with or without GPU assembly enabled): copy the element matrices to the host and assemble there
855:   if (d_elem_mats) {
856:     for (PetscInt grid=0, elem_mats_idx = 0 ; grid<num_grids ; grid++) {  // elem_mats_idx += totDim*totDim*a_numCells[grid];
857:       const PetscInt Nfloc = a_species_offset[grid+1]-a_species_offset[grid], totDim = Nfloc*Nq;
858:       PetscScalar   *elemMats = NULL, *elMat;
859:       PetscSection  section, globalSection;
860:       PetscInt      cStart,cEnd,ej;
861:       PetscLogEventBegin(events[5],0,0,0,0);
862:       DMPlexGetHeightStratum(plex[grid],0,&cStart,&cEnd);
863:       DMGetLocalSection(plex[grid], &section);
864:       DMGetGlobalSection(plex[grid], &globalSection);
865:       PetscMalloc1(totDim*totDim*a_numCells[grid],&elemMats);
866:       cerr = cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim*totDim*a_numCells[grid]*sizeof(*elemMats), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
867:       PetscLogEventEnd(events[5],0,0,0,0);
868:       PetscLogEventBegin(events[6],0,0,0,0);
869:       for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
870:         DMPlexMatSetClosure(plex[grid], section, globalSection, subJ[grid], ej, elMat, ADD_VALUES);
871:         if (ej==-1) {
872:           int d,f;
873:           PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
874:           for (d = 0; d < totDim; ++d) {
875:             for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e",  PetscRealPart(elMat[d*totDim + f]));
876:             PetscPrintf(PETSC_COMM_SELF,"\n");
877:           }
878:           //exit(14);
879:         }
880:       }
881:       PetscFree(elemMats);
882:       elem_mats_idx += totDim*totDim*a_numCells[grid]; // this can be a stored offset?
883:       //
884:       if (!container) {   // move nest matrix to global JacP
885:         PetscInt          moffset = a_mat_offset[grid], nloc, nzl, colbuf[1024], row;
886:         const PetscInt    *cols;
887:         const PetscScalar *vals;
888:         Mat               B = subJ[grid];
889:         MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
890:         MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
891:         MatGetSize(B, &nloc, NULL);
892:         if (nloc != a_mat_offset[grid+1] - moffset) SETERRQ2(PetscObjectComm((PetscObject) B), PETSC_ERR_PLIB, "nloc %D != mat_offset[grid+1] - moffset = %D",nloc, a_mat_offset[grid+1] - moffset);
893:         for (int i=0 ; i<nloc ; i++) {
894:           MatGetRow(B,i,&nzl,&cols,&vals);
895:           if (nzl>1024) SETERRQ1(PetscObjectComm((PetscObject) B), PETSC_ERR_PLIB, "Row too big: %D",nzl);
896:           for (int j=0; j<nzl; j++) colbuf[j] = cols[j] + moffset;
897:           row = i + moffset;
898:           MatSetValues(JacP,1,&row,nzl,colbuf,vals,ADD_VALUES);
899:           MatRestoreRow(B,i,&nzl,&cols,&vals);
900:         }
901:         MatDestroy(&subJ[grid]);
902:       } else exit(34);
903:       PetscLogEventEnd(events[6],0,0,0,0);
904:     } // grids
905:     cerr = cudaFree(d_elem_mats);CHKERRCUDA(cerr);
906:     if (ctx->gpu_assembly) {
907:       // transition to use of maps for VecGetClosure
908:       if (!(a_elem_closure || a_xarray)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "transition without Jacobian");
909:     }
910:   }

912:   return(0);
913: }