Actual source code: vechip2.hip.cpp

  1: /*
  2:    Implements the sequential hip vectors.
  3: */

  5: #define PETSC_SKIP_SPINLOCK

  7: #include <petscconf.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 10: #include <petsc/private/hipvecimpl.h>

 12: #include <hip/hip_runtime.h>
 13: #include <thrust/device_ptr.h>
 14: #include <thrust/transform.h>
 15: #include <thrust/functional.h>
 16: #include <thrust/reduce.h>

 18: #if defined(PETSC_USE_COMPLEX)
 19:  /* SPOCK compilation issues, need to unroll division and multiplication with complex numbers */
 20: struct PetscDivideComplex
 21: {
 22:   __host__ __device__
 23:   PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
 24:   {
 25:     PetscReal lx = PetscRealPart(lhs);
 26:     PetscReal ly = PetscImaginaryPart(lhs);
 27:     PetscReal rx = PetscRealPart(rhs);
 28:     PetscReal ry = PetscImaginaryPart(rhs);
 29:     PetscReal n  = rx*rx + ry*ry;
 30:     return PetscComplex((lx*rx + ly*ry)/n,(rx*ly - lx*ry)/n);
 31:   }
 32: };

 34: struct PetscMultiplyComplex
 35: {
 36:   __host__ __device__
 37:   PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
 38:   {
 39:     PetscReal lx = PetscRealPart(lhs);
 40:     PetscReal ly = PetscImaginaryPart(lhs);
 41:     PetscReal rx = PetscRealPart(rhs);
 42:     PetscReal ry = PetscImaginaryPart(rhs);
 43:     return PetscComplex(lx*rx-ly*ry,ly*rx+lx*ry);
 44:   }
 45: };
 46: #endif

 48: /*
 49:     Allocates space for the vector array on the GPU if it does not exist.
 50:     Does NOT change the PetscHIPFlag for the vector
 51:     Does NOT zero the HIP array

 53:  */
 54: PetscErrorCode VecHIPAllocateCheck(Vec v)
 55: {
 57:   hipError_t     err;
 58:   Vec_HIP        *vechip;
 59:   PetscBool      option_set;

 62:   if (!v->spptr) {
 63:     PetscReal pinned_memory_min;
 64:     PetscCalloc(sizeof(Vec_HIP),&v->spptr);
 65:     vechip = (Vec_HIP*)v->spptr;
 66:     err = hipMalloc((void**)&vechip->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRHIP(err);
 67:     vechip->GPUarray = vechip->GPUarray_allocated;
 68:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 69:       if (v->data && ((Vec_Seq*)v->data)->array) {
 70:         v->offloadmask = PETSC_OFFLOAD_CPU;
 71:       } else {
 72:         v->offloadmask = PETSC_OFFLOAD_GPU;
 73:       }
 74:     }
 75:     pinned_memory_min = 0;

 77:     /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
 78:        Note: This same code duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
 79:     PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECHIP Options","Vec");
 80:     PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
 81:     if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
 82:     PetscOptionsEnd();
 83:   }
 84:   return(0);
 85: }

 87: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
 88: PetscErrorCode VecHIPCopyToGPU(Vec v)
 89: {
 91:   hipError_t     err;
 92:   Vec_HIP        *vechip;
 93:   PetscScalar    *varray;

 97:   VecHIPAllocateCheck(v);
 98:   if (v->offloadmask == PETSC_OFFLOAD_CPU) {
 99:     PetscLogEventBegin(VEC_HIPCopyToGPU,v,0,0,0);
100:     vechip         = (Vec_HIP*)v->spptr;
101:     varray         = vechip->GPUarray;
102:     err            = hipMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),hipMemcpyHostToDevice);CHKERRHIP(err);
103:     PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
104:     PetscLogEventEnd(VEC_HIPCopyToGPU,v,0,0,0);
105:     v->offloadmask = PETSC_OFFLOAD_BOTH;
106:   }
107:   return(0);
108: }

110: /*
111:      VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
112: */
113: PetscErrorCode VecHIPCopyFromGPU(Vec v)
114: {
116:   hipError_t     err;
117:   Vec_HIP        *vechip;
118:   PetscScalar    *varray;

122:   VecHIPAllocateCheckHost(v);
123:   if (v->offloadmask == PETSC_OFFLOAD_GPU) {
124:     PetscLogEventBegin(VEC_HIPCopyFromGPU,v,0,0,0);
125:     vechip         = (Vec_HIP*)v->spptr;
126:     varray         = vechip->GPUarray;
127:     err            = hipMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
128:     PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
129:     PetscLogEventEnd(VEC_HIPCopyFromGPU,v,0,0,0);
130:     v->offloadmask = PETSC_OFFLOAD_BOTH;
131:   }
132:   return(0);
133: }

135: /*MC
136:    VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

138:    Options Database Keys:
139: . -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

141:   Level: beginner

143: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
144: M*/

146: PetscErrorCode VecAYPX_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
147: {
148:   const PetscScalar *xarray;
149:   PetscScalar       *yarray;
150:   PetscErrorCode    ierr;
151:   PetscBLASInt      one = 1,bn = 0;
152:   PetscScalar       sone = 1.0;
153:   hipblasHandle_t   hipblasv2handle;
154:   hipblasStatus_t   hberr;
155:   hipError_t        err;

158:   PetscHIPBLASGetHandle(&hipblasv2handle);
159:   PetscBLASIntCast(yin->map->n,&bn);
160:   VecHIPGetArrayRead(xin,&xarray);
161:   VecHIPGetArray(yin,&yarray);
162:   PetscLogGpuTimeBegin();
163:   if (alpha == (PetscScalar)0.0) {
164:     err = hipMemcpy(yarray,xarray,bn*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
165:   } else if (alpha == (PetscScalar)1.0) {
166:     hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
167:     PetscLogGpuFlops(1.0*yin->map->n);
168:   } else {
169:     hberr = hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);CHKERRHIPBLAS(hberr);
170:     hberr = hipblasXaxpy(hipblasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
171:     PetscLogGpuFlops(2.0*yin->map->n);
172:   }
173:   PetscLogGpuTimeEnd();
174:   VecHIPRestoreArrayRead(xin,&xarray);
175:   VecHIPRestoreArray(yin,&yarray);
176:   PetscLogCpuToGpuScalar(sizeof(PetscScalar));
177:   return(0);
178: }

180: PetscErrorCode VecAXPY_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
181: {
182:   const PetscScalar *xarray;
183:   PetscScalar       *yarray;
184:   PetscErrorCode    ierr;
185:   PetscBLASInt      one = 1,bn = 0;
186:   hipblasHandle_t   hipblasv2handle;
187:   hipblasStatus_t   hberr;
188:   PetscBool         xiship;

191:   if (alpha == (PetscScalar)0.0) return(0);
192:   PetscHIPBLASGetHandle(&hipblasv2handle);
193:   PetscObjectTypeCompareAny((PetscObject)xin,&xiship,VECSEQHIP,VECMPIHIP,"");
194:   if (xiship) {
195:     PetscBLASIntCast(yin->map->n,&bn);
196:     VecHIPGetArrayRead(xin,&xarray);
197:     VecHIPGetArray(yin,&yarray);
198:     PetscLogGpuTimeBegin();
199:     hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
200:     PetscLogGpuTimeEnd();
201:     VecHIPRestoreArrayRead(xin,&xarray);
202:     VecHIPRestoreArray(yin,&yarray);
203:     PetscLogGpuFlops(2.0*yin->map->n);
204:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
205:   } else {
206:     VecAXPY_Seq(yin,alpha,xin);
207:   }
208:   return(0);
209: }

211: PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
212: {
213:   PetscInt                              n = xin->map->n;
214:   const PetscScalar                     *xarray=NULL,*yarray=NULL;
215:   PetscScalar                           *warray=NULL;
216:   thrust::device_ptr<const PetscScalar> xptr,yptr;
217:   thrust::device_ptr<PetscScalar>       wptr;
218:   PetscErrorCode                        ierr;

221:   if (xin->boundtocpu || yin->boundtocpu) {
222:     VecPointwiseDivide_Seq(win,xin,yin);
223:     return(0);
224:   }
225:   VecHIPGetArrayWrite(win,&warray);
226:   VecHIPGetArrayRead(xin,&xarray);
227:   VecHIPGetArrayRead(yin,&yarray);
228:   PetscLogGpuTimeBegin();
229:   try {
230:     wptr = thrust::device_pointer_cast(warray);
231:     xptr = thrust::device_pointer_cast(xarray);
232:     yptr = thrust::device_pointer_cast(yarray);
233: #if defined(PETSC_USE_COMPLEX)
234:     thrust::transform(xptr,xptr+n,yptr,wptr,PetscDivideComplex());
235: #else
236:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
237: #endif
238:   } catch (char *ex) {
239:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
240:   }
241:   PetscLogGpuTimeEnd();
242:   PetscLogGpuFlops(n);
243:   VecHIPRestoreArrayRead(xin,&xarray);
244:   VecHIPRestoreArrayRead(yin,&yarray);
245:   VecHIPRestoreArrayWrite(win,&warray);
246:   return(0);
247: }

249: PetscErrorCode VecWAXPY_SeqHIP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
250: {
251:   const PetscScalar *xarray=NULL,*yarray=NULL;
252:   PetscScalar       *warray=NULL;
253:   PetscErrorCode    ierr;
254:   PetscBLASInt      one = 1,bn = 0;
255:   hipblasHandle_t   hipblasv2handle;
256:   hipblasStatus_t   hberr;
257:   hipError_t        err;

260:   PetscHIPBLASGetHandle(&hipblasv2handle);
261:   PetscBLASIntCast(win->map->n,&bn);
262:   if (alpha == (PetscScalar)0.0) {
263:     VecCopy_SeqHIP(yin,win);
264:   } else {
265:     VecHIPGetArrayRead(xin,&xarray);
266:     VecHIPGetArrayRead(yin,&yarray);
267:     VecHIPGetArrayWrite(win,&warray);
268:     PetscLogGpuTimeBegin();
269:     err = hipMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
270:     hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRHIPBLAS(hberr);
271:     PetscLogGpuTimeEnd();
272:     PetscLogGpuFlops(2*win->map->n);
273:     VecHIPRestoreArrayRead(xin,&xarray);
274:     VecHIPRestoreArrayRead(yin,&yarray);
275:     VecHIPRestoreArrayWrite(win,&warray);
276:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
277:   }
278:   return(0);
279: }

281: PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
282: {
283:   PetscErrorCode    ierr;
284:   PetscInt          n = xin->map->n,j;
285:   PetscScalar       *xarray;
286:   const PetscScalar *yarray;
287:   PetscBLASInt      one = 1,bn = 0;
288:   hipblasHandle_t   hipblasv2handle;
289:   hipblasStatus_t   cberr;

292:   PetscLogGpuFlops(nv*2.0*n);
293:   PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
294:   PetscHIPBLASGetHandle(&hipblasv2handle);
295:   PetscBLASIntCast(n,&bn);
296:   VecHIPGetArray(xin,&xarray);
297:   PetscLogGpuTimeBegin();
298:   for (j=0; j<nv; j++) {
299:     VecHIPGetArrayRead(y[j],&yarray);
300:     cberr = hipblasXaxpy(hipblasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRHIPBLAS(cberr);
301:     VecHIPRestoreArrayRead(y[j],&yarray);
302:   }
303:   PetscLogGpuTimeEnd();
304:   VecHIPRestoreArray(xin,&xarray);
305:   return(0);
306: }

308: PetscErrorCode VecDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
309: {
310:   const PetscScalar *xarray,*yarray;
311:   PetscErrorCode    ierr;
312:   PetscBLASInt      one = 1,bn = 0;
313:   hipblasHandle_t   hipblasv2handle;
314:   hipblasStatus_t   cerr;

317:   PetscHIPBLASGetHandle(&hipblasv2handle);
318:   PetscBLASIntCast(yin->map->n,&bn);
319:   VecHIPGetArrayRead(xin,&xarray);
320:   VecHIPGetArrayRead(yin,&yarray);
321:   /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
322:   PetscLogGpuTimeBegin();
323:   cerr = hipblasXdot(hipblasv2handle,bn,yarray,one,xarray,one,z);CHKERRHIPBLAS(cerr);
324:   PetscLogGpuTimeEnd();
325:   if (xin->map->n >0) {
326:     PetscLogGpuFlops(2.0*xin->map->n-1);
327:   }
328:   PetscLogGpuToCpu(sizeof(PetscScalar));
329:   VecHIPRestoreArrayRead(xin,&xarray);
330:   VecHIPRestoreArrayRead(yin,&yarray);
331:   return(0);
332: }

334: //
335: // HIP kernels for MDot to follow
336: //

338: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
339: #define MDOT_WORKGROUP_SIZE 128
340: #define MDOT_WORKGROUP_NUM  128

342: #if !defined(PETSC_USE_COMPLEX)
343: // M = 2:
344: __global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
345:                                         PetscInt size, PetscScalar *group_results)
346: {
347:   __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
348:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
349:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
350:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
351:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

353:   PetscScalar entry_x    = 0;
354:   PetscScalar group_sum0 = 0;
355:   PetscScalar group_sum1 = 0;
356:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
357:     entry_x     = x[i];   // load only once from global memory!
358:     group_sum0 += entry_x * y0[i];
359:     group_sum1 += entry_x * y1[i];
360:   }
361:   tmp_buffer[threadIdx.x]                       = group_sum0;
362:   tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

364:   // parallel reduction
365:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
366:     __syncthreads();
367:     if (threadIdx.x < stride) {
368:       tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
369:       tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
370:     }
371:   }

373:   // write result of group to group_results
374:   if (threadIdx.x == 0) {
375:     group_results[blockIdx.x]             = tmp_buffer[0];
376:     group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
377:   }
378: }

380: // M = 3:
381: __global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
382:                                         PetscInt size, PetscScalar *group_results)
383: {
384:   __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
385:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
386:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
387:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
388:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

390:   PetscScalar entry_x    = 0;
391:   PetscScalar group_sum0 = 0;
392:   PetscScalar group_sum1 = 0;
393:   PetscScalar group_sum2 = 0;
394:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
395:     entry_x     = x[i];   // load only once from global memory!
396:     group_sum0 += entry_x * y0[i];
397:     group_sum1 += entry_x * y1[i];
398:     group_sum2 += entry_x * y2[i];
399:   }
400:   tmp_buffer[threadIdx.x]                           = group_sum0;
401:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
402:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

404:   // parallel reduction
405:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
406:     __syncthreads();
407:     if (threadIdx.x < stride) {
408:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
409:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
410:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
411:     }
412:   }

414:   // write result of group to group_results
415:   if (threadIdx.x == 0) {
416:     group_results[blockIdx.x                ] = tmp_buffer[0];
417:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
418:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
419:   }
420: }

422: // M = 4:
423: __global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
424:                                         PetscInt size, PetscScalar *group_results)
425: {
426:   __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
427:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
428:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
429:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
430:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

432:   PetscScalar entry_x    = 0;
433:   PetscScalar group_sum0 = 0;
434:   PetscScalar group_sum1 = 0;
435:   PetscScalar group_sum2 = 0;
436:   PetscScalar group_sum3 = 0;
437:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
438:     entry_x     = x[i];   // load only once from global memory!
439:     group_sum0 += entry_x * y0[i];
440:     group_sum1 += entry_x * y1[i];
441:     group_sum2 += entry_x * y2[i];
442:     group_sum3 += entry_x * y3[i];
443:   }
444:   tmp_buffer[threadIdx.x]                           = group_sum0;
445:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
446:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
447:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

449:   // parallel reduction
450:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
451:     __syncthreads();
452:     if (threadIdx.x < stride) {
453:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
454:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
455:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
456:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
457:     }
458:   }

460:   // write result of group to group_results
461:   if (threadIdx.x == 0) {
462:     group_results[blockIdx.x                ] = tmp_buffer[0];
463:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
464:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
465:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
466:   }
467: }

469: // M = 8:
470: __global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
471:                                           const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
472:                                           PetscInt size, PetscScalar *group_results)
473: {
474:   __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
475:   PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
476:   entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group;  // for very small vectors, a group should still do some work
477:   PetscInt vec_start_index = blockIdx.x * entries_per_group;
478:   PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

480:   PetscScalar entry_x    = 0;
481:   PetscScalar group_sum0 = 0;
482:   PetscScalar group_sum1 = 0;
483:   PetscScalar group_sum2 = 0;
484:   PetscScalar group_sum3 = 0;
485:   PetscScalar group_sum4 = 0;
486:   PetscScalar group_sum5 = 0;
487:   PetscScalar group_sum6 = 0;
488:   PetscScalar group_sum7 = 0;
489:   for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
490:     entry_x     = x[i];   // load only once from global memory!
491:     group_sum0 += entry_x * y0[i];
492:     group_sum1 += entry_x * y1[i];
493:     group_sum2 += entry_x * y2[i];
494:     group_sum3 += entry_x * y3[i];
495:     group_sum4 += entry_x * y4[i];
496:     group_sum5 += entry_x * y5[i];
497:     group_sum6 += entry_x * y6[i];
498:     group_sum7 += entry_x * y7[i];
499:   }
500:   tmp_buffer[threadIdx.x]                           = group_sum0;
501:   tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
502:   tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
503:   tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
504:   tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
505:   tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
506:   tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
507:   tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

509:   // parallel reduction
510:   for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
511:     __syncthreads();
512:     if (threadIdx.x < stride) {
513:       tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
514:       tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
515:       tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
516:       tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
517:       tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
518:       tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
519:       tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
520:       tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
521:     }
522:   }

524:   // write result of group to group_results
525:   if (threadIdx.x == 0) {
526:     group_results[blockIdx.x                ] = tmp_buffer[0];
527:     group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
528:     group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
529:     group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
530:     group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
531:     group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
532:     group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
533:     group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
534:   }
535: }
536: #endif /* !defined(PETSC_USE_COMPLEX) */

538: PetscErrorCode VecMDot_SeqHIP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
539: {
540:   PetscErrorCode    ierr;
541:   PetscInt          i,n = xin->map->n,current_y_index = 0;
542:   const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
543: #if !defined(PETSC_USE_COMPLEX)
544:   PetscInt          nv1 = ((nv % 4) == 1) ? nv-1: nv,j;
545:   PetscScalar       *group_results_gpu,*group_results_cpu;
546:   hipError_t        hip_ierr;
547: #endif
548:   PetscBLASInt      one = 1,bn = 0;
549:   hipblasHandle_t   hipblasv2handle;
550:   hipblasStatus_t   hberr;

553:   PetscHIPBLASGetHandle(&hipblasv2handle);
554:   PetscBLASIntCast(xin->map->n,&bn);
555:   if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqHIP not positive.");
556:   /* Handle the case of local size zero first */
557:   if (!xin->map->n) {
558:     for (i=0; i<nv; ++i) z[i] = 0;
559:     return(0);
560:   }

562: #if !defined(PETSC_USE_COMPLEX)
563:   // allocate scratchpad memory for the results of individual work groups:
564:   hip_hipMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRHIP(hip_ierr);
565:   PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
566: #endif
567:   VecHIPGetArrayRead(xin,&xptr);
568:   PetscLogGpuTimeBegin();

570:   while (current_y_index < nv)
571:   {
572:     switch (nv - current_y_index) {

574:       case 7:
575:       case 6:
576:       case 5:
577:       case 4:
578:         VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
579:         VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
580:         VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
581:         VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
582: #if defined(PETSC_USE_COMPLEX)
583:         hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
584:         hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
585:         hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
586:         hberr = hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRHIPBLAS(hberr);
587: #else
588:         hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
589: #endif
590:         VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
591:         VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
592:         VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
593:         VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
594:         current_y_index += 4;
595:         break;

597:       case 3:
598:         VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
599:         VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
600:         VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);

602: #if defined(PETSC_USE_COMPLEX)
603:         hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
604:         hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
605:         hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
606: #else
607:         hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
608: #endif
609:         VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
610:         VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
611:         VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
612:         current_y_index += 3;
613:         break;

615:       case 2:
616:         VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
617:         VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
618: #if defined(PETSC_USE_COMPLEX)
619:         hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
620:         hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
621: #else
622:         hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
623: #endif
624:         VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
625:         VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
626:         current_y_index += 2;
627:         break;

629:       case 1:
630:         VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
631:         hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
632:         VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
633:         current_y_index += 1;
634:         break;

636:       default: // 8 or more vectors left
637:         VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
638:         VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
639:         VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
640:         VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
641:         VecHIPGetArrayRead(yin[current_y_index+4],&y4ptr);
642:         VecHIPGetArrayRead(yin[current_y_index+5],&y5ptr);
643:         VecHIPGetArrayRead(yin[current_y_index+6],&y6ptr);
644:         VecHIPGetArrayRead(yin[current_y_index+7],&y7ptr);
645: #if defined(PETSC_USE_COMPLEX)
646:         hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
647:         hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
648:         hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
649:         hberr = hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRHIPBLAS(hberr);
650:         hberr = hipblasXdot(hipblasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRHIPBLAS(hberr);
651:         hberr = hipblasXdot(hipblasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRHIPBLAS(hberr);
652:         hberr = hipblasXdot(hipblasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRHIPBLAS(hberr);
653:         hberr = hipblasXdot(hipblasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRHIPBLAS(hberr);
654: #else
655:         hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
656: #endif
657:         VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
658:         VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
659:         VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
660:         VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
661:         VecHIPRestoreArrayRead(yin[current_y_index+4],&y4ptr);
662:         VecHIPRestoreArrayRead(yin[current_y_index+5],&y5ptr);
663:         VecHIPRestoreArrayRead(yin[current_y_index+6],&y6ptr);
664:         VecHIPRestoreArrayRead(yin[current_y_index+7],&y7ptr);
665:         current_y_index += 8;
666:         break;
667:     }
668:   }
669:   PetscLogGpuTimeEnd();
670:   VecHIPRestoreArrayRead(xin,&xptr);

672: #if defined(PETSC_USE_COMPLEX)
673:   PetscLogGpuToCpu(nv*sizeof(PetscScalar));
674: #else
675:   // copy results to CPU
676:   hip_hipMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,hipMemcpyDeviceToHost);CHKERRHIP(hip_ierr);

678:   // sum group results into z
679:   for (j=0; j<nv1; ++j) {
680:     z[j] = 0;
681:     for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
682:   }
683:   PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
684:   hip_hipFree(group_results_gpu);CHKERRHIP(hip_ierr);
685:   PetscFree(group_results_cpu);
686:   PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
687: #endif
688:   PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
689:   return(0);
690: }

692: #undef MDOT_WORKGROUP_SIZE
693: #undef MDOT_WORKGROUP_NUM

695: PetscErrorCode VecSet_SeqHIP(Vec xin,PetscScalar alpha)
696: {
697:   PetscInt                        n = xin->map->n;
698:   PetscScalar                     *xarray = NULL;
699:   thrust::device_ptr<PetscScalar> xptr;
700:   PetscErrorCode                  ierr;
701:   hipError_t                      err;

704:   VecHIPGetArrayWrite(xin,&xarray);
705:   PetscLogGpuTimeBegin();
706:   if (alpha == (PetscScalar)0.0) {
707:     err = hipMemset(xarray,0,n*sizeof(PetscScalar));CHKERRHIP(err);
708:   } else {
709:     try {
710:       xptr = thrust::device_pointer_cast(xarray);
711:       thrust::fill(xptr,xptr+n,alpha);
712:     } catch (char *ex) {
713:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
714:     }
715:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
716:   }
717:   PetscLogGpuTimeEnd();
718:   VecHIPRestoreArrayWrite(xin,&xarray);
719:   return(0);
720: }

722: struct PetscScalarReciprocal
723: {
724:   __host__ __device__
725:   PetscScalar operator()(const PetscScalar& s)
726:   {
727: #if defined(PETSC_USE_COMPLEX)
728:     /* SPOCK compilation issue, need to unroll division */
729:     PetscReal sx = PetscRealPart(s);
730:     PetscReal sy = PetscImaginaryPart(s);
731:     PetscReal n  = sx*sx + sy*sy;
732:     return n != 0.0 ? PetscComplex(sx/n,-sy/n) : 0.0;
733: #else
734:     return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
735: #endif
736:   }
737: };

739: PetscErrorCode VecReciprocal_SeqHIP(Vec v)
740: {
742:   PetscInt       n;
743:   PetscScalar    *x;

746:   VecGetLocalSize(v,&n);
747:   VecHIPGetArray(v,&x);
748:   PetscLogGpuTimeBegin();
749:   try {
750:     auto xptr = thrust::device_pointer_cast(x);
751:     thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
752:   } catch (char *ex) {
753:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
754:   }
755:   PetscLogGpuTimeEnd();
756:   VecHIPRestoreArray(v,&x);
757:   return(0);
758: }

760: PetscErrorCode VecScale_SeqHIP(Vec xin,PetscScalar alpha)
761: {
762:   PetscScalar     *xarray;
763:   PetscErrorCode  ierr;
764:   PetscBLASInt    one = 1,bn = 0;
765:   hipblasHandle_t hipblasv2handle;
766:   hipblasStatus_t hberr;

769:   if (alpha == (PetscScalar)0.0) {
770:     VecSet_SeqHIP(xin,alpha);
771:   } else if (alpha != (PetscScalar)1.0) {
772:     PetscHIPBLASGetHandle(&hipblasv2handle);
773:     PetscBLASIntCast(xin->map->n,&bn);
774:     VecHIPGetArray(xin,&xarray);
775:     PetscLogGpuTimeBegin();
776:     hberr = hipblasXscal(hipblasv2handle,bn,&alpha,xarray,one);CHKERRHIPBLAS(hberr);
777:     VecHIPRestoreArray(xin,&xarray);
778:     PetscLogGpuTimeEnd();
779:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
780:     PetscLogGpuFlops(xin->map->n);
781:   }
782:   return(0);
783: }

785: PetscErrorCode VecTDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
786: {
787:   const PetscScalar *xarray,*yarray;
788:   PetscErrorCode    ierr;
789:   PetscBLASInt      one = 1,bn = 0;
790:   hipblasHandle_t   hipblasv2handle;
791:   hipblasStatus_t   cerr;

794:   PetscHIPBLASGetHandle(&hipblasv2handle);
795:   PetscBLASIntCast(xin->map->n,&bn);
796:   VecHIPGetArrayRead(xin,&xarray);
797:   VecHIPGetArrayRead(yin,&yarray);
798:   PetscLogGpuTimeBegin();
799:   cerr = hipblasXdotu(hipblasv2handle,bn,xarray,one,yarray,one,z);CHKERRHIPBLAS(cerr);
800:   PetscLogGpuTimeEnd();
801:   if (xin->map->n > 0) {
802:     PetscLogGpuFlops(2.0*xin->map->n-1);
803:   }
804:   PetscLogGpuToCpu(sizeof(PetscScalar));
805:   VecHIPRestoreArrayRead(yin,&yarray);
806:   VecHIPRestoreArrayRead(xin,&xarray);
807:   return(0);
808: }

810: PetscErrorCode VecCopy_SeqHIP(Vec xin,Vec yin)
811: {
812:   const PetscScalar *xarray;
813:   PetscScalar       *yarray;
814:   PetscErrorCode    ierr;
815:   hipError_t        err;

818:   if (xin != yin) {
819:     if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
820:       PetscBool yiship;

822:       PetscObjectTypeCompareAny((PetscObject)yin,&yiship,VECSEQHIP,VECMPIHIP,"");
823:       VecHIPGetArrayRead(xin,&xarray);
824:       if (yiship) {
825:         VecHIPGetArrayWrite(yin,&yarray);
826:       } else {
827:         VecGetArrayWrite(yin,&yarray);
828:       }
829:       PetscLogGpuTimeBegin();
830:       if (yiship) {
831:         err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
832:       } else {
833:         err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
834:       }
835:       PetscLogGpuTimeEnd();
836:       VecHIPRestoreArrayRead(xin,&xarray);
837:       if (yiship) {
838:         VecHIPRestoreArrayWrite(yin,&yarray);
839:       } else {
840:         VecRestoreArrayWrite(yin,&yarray);
841:       }
842:     } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
843:       /* copy in CPU if we are on the CPU */
844:       VecCopy_SeqHIP_Private(xin,yin);
845:     } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
846:       /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
847:       if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
848:         /* copy in CPU */
849:         VecCopy_SeqHIP_Private(xin,yin);
850:       } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
851:         /* copy in GPU */
852:         VecHIPGetArrayRead(xin,&xarray);
853:         VecHIPGetArrayWrite(yin,&yarray);
854:         PetscLogGpuTimeBegin();
855:         err  = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
856:         PetscLogGpuTimeEnd();
857:         VecHIPRestoreArrayRead(xin,&xarray);
858:         VecHIPRestoreArrayWrite(yin,&yarray);
859:       } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
860:         /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck
861:            default to copy in GPU (this is an arbitrary choice) */
862:         VecHIPGetArrayRead(xin,&xarray);
863:         VecHIPGetArrayWrite(yin,&yarray);
864:         PetscLogGpuTimeBegin();
865:         err  = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
866:         PetscLogGpuTimeEnd();
867:         VecHIPRestoreArrayRead(xin,&xarray);
868:         VecHIPRestoreArrayWrite(yin,&yarray);
869:       } else {
870:         VecCopy_SeqHIP_Private(xin,yin);
871:       }
872:     }
873:   }
874:   return(0);
875: }

877: PetscErrorCode VecSwap_SeqHIP(Vec xin,Vec yin)
878: {
879:   PetscErrorCode  ierr;
880:   PetscBLASInt    one = 1,bn;
881:   PetscScalar     *xarray,*yarray;
882:   hipblasHandle_t hipblasv2handle;
883:   hipblasStatus_t hberr;

886:   PetscHIPBLASGetHandle(&hipblasv2handle);
887:   PetscBLASIntCast(xin->map->n,&bn);
888:   if (xin != yin) {
889:     VecHIPGetArray(xin,&xarray);
890:     VecHIPGetArray(yin,&yarray);
891:     PetscLogGpuTimeBegin();
892:     hberr = hipblasXswap(hipblasv2handle,bn,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
893:     PetscLogGpuTimeEnd();
894:     VecHIPRestoreArray(xin,&xarray);
895:     VecHIPRestoreArray(yin,&yarray);
896:   }
897:   return(0);
898: }

900: PetscErrorCode VecAXPBY_SeqHIP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
901: {
902:   PetscErrorCode    ierr;
903:   PetscScalar       a = alpha,b = beta;
904:   const PetscScalar *xarray;
905:   PetscScalar       *yarray;
906:   PetscBLASInt      one = 1, bn = 0;
907:   hipblasHandle_t   hipblasv2handle;
908:   hipblasStatus_t   hberr;
909:   hipError_t        err;

912:   PetscHIPBLASGetHandle(&hipblasv2handle);
913:   PetscBLASIntCast(yin->map->n,&bn);
914:   if (a == (PetscScalar)0.0) {
915:     VecScale_SeqHIP(yin,beta);
916:   } else if (b == (PetscScalar)1.0) {
917:     VecAXPY_SeqHIP(yin,alpha,xin);
918:   } else if (a == (PetscScalar)1.0) {
919:     VecAYPX_SeqHIP(yin,beta,xin);
920:   } else if (b == (PetscScalar)0.0) {
921:     VecHIPGetArrayRead(xin,&xarray);
922:     VecHIPGetArray(yin,&yarray);
923:     PetscLogGpuTimeBegin();
924:     err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
925:     hberr = hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);CHKERRHIPBLAS(hberr);
926:     PetscLogGpuTimeEnd();
927:     PetscLogGpuFlops(xin->map->n);
928:     PetscLogCpuToGpuScalar(sizeof(PetscScalar));
929:     VecHIPRestoreArrayRead(xin,&xarray);
930:     VecHIPRestoreArray(yin,&yarray);
931:   } else {
932:     VecHIPGetArrayRead(xin,&xarray);
933:     VecHIPGetArray(yin,&yarray);
934:     PetscLogGpuTimeBegin();
935:     hberr = hipblasXscal(hipblasv2handle,bn,&beta,yarray,one);CHKERRHIPBLAS(hberr);
936:     hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
937:     VecHIPRestoreArrayRead(xin,&xarray);
938:     VecHIPRestoreArray(yin,&yarray);
939:     PetscLogGpuTimeEnd();
940:     PetscLogGpuFlops(3.0*xin->map->n);
941:     PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
942:   }
943:   return(0);
944: }

946: PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
947: {
949:   PetscInt       n = zin->map->n;

952:   if (gamma == (PetscScalar)1.0) {
953:     /* z = ax + b*y + z */
954:     VecAXPY_SeqHIP(zin,alpha,xin);
955:     VecAXPY_SeqHIP(zin,beta,yin);
956:     PetscLogGpuFlops(4.0*n);
957:   } else {
958:     /* z = a*x + b*y + c*z */
959:     VecScale_SeqHIP(zin,gamma);
960:     VecAXPY_SeqHIP(zin,alpha,xin);
961:     VecAXPY_SeqHIP(zin,beta,yin);
962:     PetscLogGpuFlops(5.0*n);
963:   }
964:   return(0);
965: }

967: PetscErrorCode VecPointwiseMult_SeqHIP(Vec win,Vec xin,Vec yin)
968: {
969:   PetscInt                              n = win->map->n;
970:   const PetscScalar                     *xarray,*yarray;
971:   PetscScalar                           *warray;
972:   thrust::device_ptr<const PetscScalar> xptr,yptr;
973:   thrust::device_ptr<PetscScalar>       wptr;
974:   PetscErrorCode                        ierr;

977:   if (xin->boundtocpu || yin->boundtocpu) {
978:     VecPointwiseMult_Seq(win,xin,yin);
979:     return(0);
980:   }
981:   VecHIPGetArrayRead(xin,&xarray);
982:   VecHIPGetArrayRead(yin,&yarray);
983:   VecHIPGetArrayWrite(win,&warray);
984:   PetscLogGpuTimeBegin();
985:   try {
986:     wptr = thrust::device_pointer_cast(warray);
987:     xptr = thrust::device_pointer_cast(xarray);
988:     yptr = thrust::device_pointer_cast(yarray);
989: #if defined(PETSC_USE_COMPLEX)
990:     thrust::transform(xptr,xptr+n,yptr,wptr,PetscMultiplyComplex());
991: #else
992:     thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
993: #endif
994:   } catch (char *ex) {
995:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
996:   }
997:   PetscLogGpuTimeEnd();
998:   VecHIPRestoreArrayRead(xin,&xarray);
999:   VecHIPRestoreArrayRead(yin,&yarray);
1000:   VecHIPRestoreArrayWrite(win,&warray);
1001:   PetscLogGpuFlops(n);
1002:   return(0);
1003: }

1005: /* should do infinity norm in hip */

1007: PetscErrorCode VecNorm_SeqHIP(Vec xin,NormType type,PetscReal *z)
1008: {
1009:   PetscErrorCode    ierr;
1010:   PetscInt          n = xin->map->n;
1011:   PetscBLASInt      one = 1, bn = 0;
1012:   const PetscScalar *xarray;
1013:   hipblasHandle_t   hipblasv2handle;
1014:   hipblasStatus_t   hberr;
1015:   hipError_t        err;

1018:   PetscHIPBLASGetHandle(&hipblasv2handle);
1019:   PetscBLASIntCast(n,&bn);
1020:   if (type == NORM_2 || type == NORM_FROBENIUS) {
1021:     VecHIPGetArrayRead(xin,&xarray);
1022:     PetscLogGpuTimeBegin();
1023:     hberr = hipblasXnrm2(hipblasv2handle,bn,xarray,one,z);CHKERRHIPBLAS(hberr);
1024:     PetscLogGpuTimeEnd();
1025:     VecHIPRestoreArrayRead(xin,&xarray);
1026:     PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1027:   } else if (type == NORM_INFINITY) {
1028:     int  i;
1029:     VecHIPGetArrayRead(xin,&xarray);
1030:     PetscLogGpuTimeBegin();
1031:     hberr = hipblasIXamax(hipblasv2handle,bn,xarray,one,&i);CHKERRHIPBLAS(hberr);
1032:     PetscLogGpuTimeEnd();
1033:     if (bn) {
1034:       PetscScalar zs;
1035:       err = hipMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
1036:       *z = PetscAbsScalar(zs);
1037:     } else *z = 0.0;
1038:     VecHIPRestoreArrayRead(xin,&xarray);
1039:   } else if (type == NORM_1) {
1040:     VecHIPGetArrayRead(xin,&xarray);
1041:     PetscLogGpuTimeBegin();
1042:     hberr = hipblasXasum(hipblasv2handle,bn,xarray,one,z);CHKERRHIPBLAS(hberr);
1043:     VecHIPRestoreArrayRead(xin,&xarray);
1044:     PetscLogGpuTimeEnd();
1045:     PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1046:   } else if (type == NORM_1_AND_2) {
1047:     VecNorm_SeqHIP(xin,NORM_1,z);
1048:     VecNorm_SeqHIP(xin,NORM_2,z+1);
1049:   }
1050:   PetscLogGpuToCpu(sizeof(PetscReal));
1051:   return(0);
1052: }

1054: PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1055: {

1059:   VecDot_SeqHIP(s,t,dp);
1060:   VecDot_SeqHIP(t,t,nm);
1061:   return(0);
1062: }

1064: PetscErrorCode VecDestroy_SeqHIP(Vec v)
1065: {
1067:   hipError_t    err;

1070:   if (v->spptr) {
1071:     if (((Vec_HIP*)v->spptr)->GPUarray_allocated) {
1072:       err = hipFree(((Vec_HIP*)v->spptr)->GPUarray_allocated);CHKERRHIP(err);
1073:       ((Vec_HIP*)v->spptr)->GPUarray_allocated = NULL;
1074:     }
1075:     if (((Vec_HIP*)v->spptr)->stream) {
1076:       err = hipStreamDestroy(((Vec_HIP*)v->spptr)->stream);CHKERRHIP(err);
1077:     }
1078:   }
1079:   VecDestroy_SeqHIP_Private(v);
1080:   PetscFree(v->spptr);
1081:   return(0);
1082: }

1084: #if defined(PETSC_USE_COMPLEX)
1085: /* SPOCK compilation issue, need to do conjugation ourselves */
1086: struct conjugate
1087: {
1088:   __host__ __device__
1089:     PetscScalar operator()(const PetscScalar& x)
1090:     {
1091:       return PetscScalar(PetscRealPart(x),-PetscImaginaryPart(x));
1092:     }
1093: };
1094: #endif

1096: PetscErrorCode VecConjugate_SeqHIP(Vec xin)
1097: {
1098: #if defined(PETSC_USE_COMPLEX)
1099:   PetscScalar                     *xarray;
1100:   PetscErrorCode                  ierr;
1101:   PetscInt                        n = xin->map->n;
1102:   thrust::device_ptr<PetscScalar> xptr;

1105:   VecHIPGetArray(xin,&xarray);
1106:   PetscLogGpuTimeBegin();
1107:   try {
1108:     xptr = thrust::device_pointer_cast(xarray);
1109:     thrust::transform(xptr,xptr+n,xptr,conjugate());
1110:   } catch (char *ex) {
1111:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1112:   }
1113:   PetscLogGpuTimeEnd();
1114:   VecHIPRestoreArray(xin,&xarray);
1115: #else
1117: #endif
1118:   return(0);
1119: }

1121: PETSC_STATIC_INLINE PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
1122: {
1124:   hipError_t     err;
1125:   PetscBool      wisseqhip;

1131:   PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
1132:   if (w->data && wisseqhip) {
1133:     if (((Vec_Seq*)w->data)->array_allocated) {
1134:       if (w->pinned_memory) {
1135:         PetscMallocSetHIPHost();
1136:       }
1137:       PetscFree(((Vec_Seq*)w->data)->array_allocated);
1138:       if (w->pinned_memory) {
1139:         PetscMallocResetHIPHost();
1140:         w->pinned_memory = PETSC_FALSE;
1141:       }
1142:     }
1143:     ((Vec_Seq*)w->data)->array = NULL;
1144:     ((Vec_Seq*)w->data)->unplacedarray = NULL;
1145:   }
1146:   if (w->spptr && wisseqhip) {
1147:     if (((Vec_HIP*)w->spptr)->GPUarray) {
1148:       err = hipFree(((Vec_HIP*)w->spptr)->GPUarray);CHKERRHIP(err);
1149:       ((Vec_HIP*)w->spptr)->GPUarray = NULL;
1150:     }
1151:     if (((Vec_HIP*)v->spptr)->stream) {
1152:       err = hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);CHKERRHIP(err);
1153:     }
1154:     PetscFree(w->spptr);
1155:   }

1157:   if (v->petscnative && wisseqhip) {
1158:     PetscFree(w->data);
1159:     w->data = v->data;
1160:     w->offloadmask = v->offloadmask;
1161:     w->pinned_memory = v->pinned_memory;
1162:     w->spptr = v->spptr;
1163:     PetscObjectStateIncrease((PetscObject)w);
1164:   } else {
1165:     if (read) {
1166:       VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1167:     } else {
1168:       VecGetArray(v,&((Vec_Seq*)w->data)->array);
1169:     }
1170:     w->offloadmask = PETSC_OFFLOAD_CPU;
1171:     if (wisseqhip) {
1172:       VecHIPAllocateCheck(w);
1173:     }
1174:   }
1175:   return(0);
1176: }

1178: PETSC_STATIC_INLINE PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
1179: {
1181:   hipError_t     err;
1182:   PetscBool      wisseqhip;

1188:   PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
1189:   if (v->petscnative && wisseqhip) {
1190:     v->data = w->data;
1191:     v->offloadmask = w->offloadmask;
1192:     v->pinned_memory = w->pinned_memory;
1193:     v->spptr = w->spptr;
1194:     w->data = 0;
1195:     w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1196:     w->spptr = 0;
1197:   } else {
1198:     if (read) {
1199:       VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
1200:     } else {
1201:       VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1202:     }
1203:     if ((Vec_HIP*)w->spptr && wisseqhip) {
1204:       err = hipFree(((Vec_HIP*)w->spptr)->GPUarray);CHKERRHIP(err);
1205:       ((Vec_HIP*)w->spptr)->GPUarray = NULL;
1206:       if (((Vec_HIP*)v->spptr)->stream) {
1207:         err = hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);CHKERRHIP(err);
1208:       }
1209:       PetscFree(w->spptr);
1210:     }
1211:   }
1212:   return(0);
1213: }

1215: PetscErrorCode VecGetLocalVector_SeqHIP(Vec v,Vec w)
1216: {

1220:   VecGetLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
1221:   return(0);
1222: }

1224: PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v,Vec w)
1225: {

1229:   VecGetLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
1230:   return(0);
1231: }

1233: PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v,Vec w)
1234: {

1238:   VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
1239:   return(0);
1240: }

1242: PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v,Vec w)
1243: {

1247:   VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
1248:   return(0);
1249: }

1251: struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
1252: {
1253:   __host__ __device__
1254:   PetscReal operator()(const PetscScalar& x) {
1255:     return PetscRealPart(x);
1256:   }
1257: };

1259: struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1260: {
1261:   __host__ __device__
1262:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
1263:     return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
1264:   }
1265: };

1267: struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1268: {
1269:   __host__ __device__
1270:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1271:     return x < y ? y : x;
1272:   }
1273: };

1275: struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1276: {
1277:   __host__ __device__
1278:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1279:     return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1280:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1281:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1282:   }
1283: };

1285: struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
1286: {
1287:   __host__ __device__
1288:   PetscReal operator()(const PetscReal& x, const PetscReal& y) {
1289:     return x < y ? x : y;
1290:   }
1291: };

1293: struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
1294: {
1295:   __host__ __device__
1296:   thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
1297:     return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
1298:            (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
1299:            (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
1300:   }
1301: };

1303: PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
1304: {
1305:   PetscErrorCode                        ierr;
1306:   PetscInt                              n = v->map->n;
1307:   const PetscScalar                     *av;
1308:   thrust::device_ptr<const PetscScalar> avpt;

1312:   if (!n) {
1313:     *m = PETSC_MIN_REAL;
1314:     if (p) *p = -1;
1315:     return(0);
1316:   }
1317:   VecHIPGetArrayRead(v,&av);
1318:   avpt = thrust::device_pointer_cast(av);
1319:   PetscLogGpuTimeBegin();
1320:   if (p) {
1321:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
1322:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1323:     try {
1324: #if defined(PETSC_USE_COMPLEX)
1325:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
1326: #else
1327:       res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
1328: #endif
1329:     } catch (char *ex) {
1330:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1331:     }
1332:     *m = res.get<0>();
1333:     *p = res.get<1>();
1334:   } else {
1335:     try {
1336: #if defined(PETSC_USE_COMPLEX)
1337:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
1338: #else
1339:       *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
1340: #endif
1341:     } catch (char *ex) {
1342:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1343:     }
1344:   }
1345:   PetscLogGpuTimeEnd();
1346:   VecHIPRestoreArrayRead(v,&av);
1347:   return(0);
1348: }

1350: PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
1351: {
1352:   PetscErrorCode                        ierr;
1353:   PetscInt                              n = v->map->n;
1354:   const PetscScalar                     *av;
1355:   thrust::device_ptr<const PetscScalar> avpt;

1359:   if (!n) {
1360:     *m = PETSC_MAX_REAL;
1361:     if (p) *p = -1;
1362:     return(0);
1363:   }
1364:   VecHIPGetArrayRead(v,&av);
1365:   avpt = thrust::device_pointer_cast(av);
1366:   PetscLogGpuTimeBegin();
1367:   if (p) {
1368:     thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
1369:     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
1370:     try {
1371: #if defined(PETSC_USE_COMPLEX)
1372:       res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
1373: #else
1374:       res = thrust::reduce(zibit,zibit+n,res,petscmini());
1375: #endif
1376:     } catch (char *ex) {
1377:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1378:     }
1379:     *m = res.get<0>();
1380:     *p = res.get<1>();
1381:   } else {
1382:     try {
1383: #if defined(PETSC_USE_COMPLEX)
1384:       *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
1385: #else
1386:       *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
1387: #endif
1388:     } catch (char *ex) {
1389:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1390:     }
1391:   }
1392:   PetscLogGpuTimeEnd();
1393:   VecHIPRestoreArrayRead(v,&av);
1394:   return(0);
1395: }

1397: PetscErrorCode VecSum_SeqHIP(Vec v,PetscScalar *sum)
1398: {
1399:   PetscErrorCode                        ierr;
1400:   PetscInt                              n = v->map->n;
1401:   const PetscScalar                     *a;
1402:   thrust::device_ptr<const PetscScalar> dptr;

1406:   VecHIPGetArrayRead(v,&a);
1407:   dptr = thrust::device_pointer_cast(a);
1408:   PetscLogGpuTimeBegin();
1409:   try {
1410:     *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
1411:   } catch (char *ex) {
1412:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1413:   }
1414:   PetscLogGpuTimeEnd();
1415:   VecHIPRestoreArrayRead(v,&a);
1416:   return(0);
1417: }

1419: struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
1420: {
1421:   const PetscScalar shift_;
1422:   petscshift(PetscScalar shift) : shift_(shift){}
1423:   __host__ __device__
1424:   PetscScalar operator()(PetscScalar x) {return x + shift_;}
1425: };

1427: PetscErrorCode VecShift_SeqHIP(Vec v,PetscScalar shift)
1428: {
1429:   PetscErrorCode                        ierr;
1430:   PetscInt                              n = v->map->n;
1431:   PetscScalar                           *a;
1432:   thrust::device_ptr<PetscScalar>       dptr;

1436:   VecHIPGetArray(v,&a);
1437:   dptr = thrust::device_pointer_cast(a);
1438:   PetscLogGpuTimeBegin();
1439:   try {
1440:     thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
1441:   } catch (char *ex) {
1442:     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1443:   }
1444:   PetscLogGpuTimeEnd();
1445:   VecHIPRestoreArray(v,&a);
1446:   return(0);
1447: }