Actual source code: fnutil.c

slepc-3.18.2 2023-01-26
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Utility subroutines common to several impls
 12: */

 14: #include <slepc/private/fnimpl.h>
 15: #include <slepcblaslapack.h>

 17: /*
 18:    Compute the square root of an upper quasi-triangular matrix T,
 19:    using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T).
 20:  */
 21: static PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld)
 22: {
 23:   PetscScalar  one=1.0,mone=-1.0;
 24:   PetscReal    scal;
 25:   PetscBLASInt i,j,si,sj,r,ione=1,info;
 26: #if !defined(PETSC_USE_COMPLEX)
 27:   PetscReal    alpha,theta,mu,mu2;
 28: #endif

 30:   for (j=0;j<n;j++) {
 31: #if defined(PETSC_USE_COMPLEX)
 32:     sj = 1;
 33:     T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]);
 34: #else
 35:     sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2;
 36:     if (sj==1) {
 38:       T[j+j*ld] = PetscSqrtReal(T[j+j*ld]);
 39:     } else {
 40:       /* square root of 2x2 block */
 41:       theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0;
 42:       mu    = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0;
 43:       mu2   = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld];
 44:       mu    = PetscSqrtReal(mu2);
 45:       if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0);
 46:       else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2)));
 47:       T[j+j*ld]       /= 2.0*alpha;
 48:       T[j+1+(j+1)*ld] /= 2.0*alpha;
 49:       T[j+(j+1)*ld]   /= 2.0*alpha;
 50:       T[j+1+j*ld]     /= 2.0*alpha;
 51:       T[j+j*ld]       += alpha-theta/(2.0*alpha);
 52:       T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha);
 53:     }
 54: #endif
 55:     for (i=j-1;i>=0;i--) {
 56: #if defined(PETSC_USE_COMPLEX)
 57:       si = 1;
 58: #else
 59:       si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2;
 60:       if (si==2) i--;
 61: #endif
 62:       /* solve Sylvester equation of order si x sj */
 63:       r = j-i-si;
 64:       if (r) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld));
 65:       PetscCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info));
 66:       SlepcCheckLapackInfo("trsyl",info);
 68:     }
 69:     if (sj==2) j++;
 70:   }
 71:   return 0;
 72: }

 74: #define BLOCKSIZE 64

 76: /*
 77:    Schur method for the square root of an upper quasi-triangular matrix T.
 78:    T is overwritten with sqrtm(T).
 79:    If firstonly then only the first column of T will contain relevant values.
 80:  */
 81: PetscErrorCode FNSqrtmSchur(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly)
 82: {
 83:   PetscBLASInt   i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE;
 84:   PetscScalar    *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0;
 85:   PetscInt       m,nblk;
 86:   PetscReal      scal;
 87: #if defined(PETSC_USE_COMPLEX)
 88:   PetscReal      *rwork;
 89: #else
 90:   PetscReal      *wi;
 91: #endif

 93:   m     = n;
 94:   nblk  = (m+bs-1)/bs;
 95:   lwork = 5*n;
 96:   k     = firstonly? 1: n;

 98:   /* compute Schur decomposition A*Q = Q*T */
 99: #if !defined(PETSC_USE_COMPLEX)
100:   PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
101:   PetscCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info));
102: #else
103:   PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
104:   PetscCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info));
105: #endif
106:   SlepcCheckLapackInfo("gees",info);

108:   /* determine block sizes and positions, to avoid cutting 2x2 blocks */
109:   j = 0;
110:   p[j] = 0;
111:   do {
112:     s[j] = PetscMin(bs,n-p[j]);
113: #if !defined(PETSC_USE_COMPLEX)
114:     if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++;
115: #endif
116:     if (p[j]+s[j]==n) break;
117:     j++;
118:     p[j] = p[j-1]+s[j-1];
119:   } while (1);
120:   nblk = j+1;

122:   for (j=0;j<nblk;j++) {
123:     /* evaluate f(T_jj) */
124:     SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld);
125:     for (i=j-1;i>=0;i--) {
126:       /* solve Sylvester equation for block (i,j) */
127:       r = p[j]-p[i]-s[i];
128:       if (r) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld));
129:       PetscCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info));
130:       SlepcCheckLapackInfo("trsyl",info);
132:     }
133:   }

135:   /* backtransform B = Q*T*Q' */
136:   PetscCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld));
137:   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld));

139:   /* flop count: Schur decomposition, triangular square root, and backtransform */
140:   PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k);

142: #if !defined(PETSC_USE_COMPLEX)
143:   PetscFree7(wr,wi,W,Q,work,s,p);
144: #else
145:   PetscFree7(wr,rwork,W,Q,work,s,p);
146: #endif
147:   return 0;
148: }

150: #define DBMAXIT 25

152: /*
153:    Computes the principal square root of the matrix T using the product form
154:    of the Denman-Beavers iteration.
155:    T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
156:  */
157: PetscErrorCode FNSqrtmDenmanBeavers(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
158: {
159:   PetscScalar        *Told,*M=NULL,*invM,*work,work1,prod,alpha;
160:   PetscScalar        szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25;
161:   PetscReal          tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,rwork[1];
162:   PetscBLASInt       N,i,it,*piv=NULL,info,query=-1,lwork;
163:   const PetscBLASInt one=1;
164:   PetscBool          converged=PETSC_FALSE,scale;
165:   unsigned int       ftz;

167:   N = n*n;
168:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
169:   scale = PetscDefined(USE_REAL_SINGLE)? PETSC_FALSE: PETSC_TRUE;
170:   SlepcSetFlushToZero(&ftz);

172:   /* query work size */
173:   PetscCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info));
174:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
175:   PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM);
176:   PetscArraycpy(M,T,n*n);

178:   if (inv) {  /* start recurrence with I instead of A */
179:     PetscArrayzero(T,n*n);
180:     for (i=0;i<n;i++) T[i+i*ld] += 1.0;
181:   }

183:   for (it=0;it<DBMAXIT && !converged;it++) {

185:     if (scale) {  /* g = (abs(det(M)))^(-1/(2*n)) */
186:       PetscArraycpy(invM,M,n*n);
187:       PetscCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
188:       SlepcCheckLapackInfo("getrf",info);
189:       prod = invM[0];
190:       for (i=1;i<n;i++) prod *= invM[i+i*ld];
191:       detM = PetscAbsScalar(prod);
192:       g = (detM>PETSC_MAX_REAL)? 0.5: PetscPowReal(detM,-1.0/(2.0*n));
193:       alpha = g;
194:       PetscCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one));
195:       alpha = g*g;
196:       PetscCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one));
197:       PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);
198:     }

200:     PetscArraycpy(Told,T,n*n);
201:     PetscArraycpy(invM,M,n*n);

203:     PetscCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
204:     SlepcCheckLapackInfo("getrf",info);
205:     PetscCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info));
206:     SlepcCheckLapackInfo("getri",info);
207:     PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

209:     for (i=0;i<n;i++) invM[i+i*ld] += 1.0;
210:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld));
211:     for (i=0;i<n;i++) invM[i+i*ld] -= 1.0;

213:     PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one));
214:     PetscCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one));
215:     for (i=0;i<n;i++) M[i+i*ld] -= 0.5;
216:     PetscLogFlops(2.0*n*n*n+2.0*n*n);

218:     Mres = LAPACKlange_("F",&n,&n,M,&n,rwork);
219:     for (i=0;i<n;i++) M[i+i*ld] += 1.0;

221:     if (scale) {
222:       /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
223:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one));
224:       fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork);
225:       fnormT = LAPACKlange_("F",&n,&n,T,&n,rwork);
226:       PetscLogFlops(7.0*n*n);
227:       reldiff = fnormdiff/fnormT;
228:       PetscInfo(fn,"it: %" PetscBLASInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)(tol*g));
229:       if (reldiff<1e-2) scale = PETSC_FALSE;  /* Switch off scaling */
230:     }

232:     if (Mres<=tol) converged = PETSC_TRUE;
233:   }

236:   PetscFree5(work,piv,Told,M,invM);
237:   SlepcResetFlushToZero(&ftz);
238:   return 0;
239: }

241: #define NSMAXIT 50

243: /*
244:    Computes the principal square root of the matrix A using the Newton-Schulz iteration.
245:    T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
246:  */
247: PetscErrorCode FNSqrtmNewtonSchulz(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
248: {
249:   PetscScalar    *Y=A,*Yold,*Z,*Zold,*M;
250:   PetscScalar    szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
251:   PetscReal      sqrtnrm,tol,Yres=0.0,nrm,rwork[1],done=1.0;
252:   PetscBLASInt   info,i,it,N,one=1,zero=0;
253:   PetscBool      converged=PETSC_FALSE;
254:   unsigned int   ftz;

256:   N = n*n;
257:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
258:   SlepcSetFlushToZero(&ftz);

260:   PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M);

262:   /* scale */
263:   PetscArraycpy(Z,A,N);
264:   for (i=0;i<n;i++) Z[i+i*ld] -= 1.0;
265:   nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork);
266:   sqrtnrm = PetscSqrtReal(nrm);
267:   PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,A,&N,&info));
268:   SlepcCheckLapackInfo("lascl",info);
269:   tol *= nrm;
270:   PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
271:   PetscLogFlops(2.0*n*n);

273:   /* Z = I */
274:   PetscArrayzero(Z,N);
275:   for (i=0;i<n;i++) Z[i+i*ld] = 1.0;

277:   for (it=0;it<NSMAXIT && !converged;it++) {
278:     /* Yold = Y, Zold = Z */
279:     PetscArraycpy(Yold,Y,N);
280:     PetscArraycpy(Zold,Z,N);

282:     /* M = (3*I-Zold*Yold) */
283:     PetscArrayzero(M,N);
284:     for (i=0;i<n;i++) M[i+i*ld] = sthree;
285:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld));

287:     /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */
288:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld));
289:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld));

291:     /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
292:     PetscCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one));
293:     Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork);
295:     if (Yres<=tol) converged = PETSC_TRUE;
296:     PetscInfo(fn,"it: %" PetscBLASInt_FMT " res: %g\n",it,(double)Yres);

298:     PetscLogFlops(6.0*n*n*n+2.0*n*n);
299:   }


303:   /* undo scaling */
304:   if (inv) {
305:     PetscArraycpy(A,Z,N);
306:     PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&sqrtnrm,&done,&N,&one,A,&N,&info));
307:   } else PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&done,&sqrtnrm,&N,&one,A,&N,&info));
308:   SlepcCheckLapackInfo("lascl",info);

310:   PetscFree4(Yold,Z,Zold,M);
311:   SlepcResetFlushToZero(&ftz);
312:   return 0;
313: }

315: #if defined(PETSC_HAVE_CUDA)
316: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
317: #include <slepccublas.h>

319: /*
320:  * Matrix square root by Newton-Schulz iteration. CUDA version.
321:  * Computes the principal square root of the matrix A using the
322:  * Newton-Schulz iteration. A is overwritten with sqrtm(A).
323:  */
324: PetscErrorCode FNSqrtmNewtonSchulz_CUDA(FN fn,PetscBLASInt n,PetscScalar *d_A,PetscBLASInt ld,PetscBool inv)
325: {
326:   PetscScalar        *d_Yold,*d_Z,*d_Zold,*d_M,alpha;
327:   PetscReal          nrm,sqrtnrm,tol,Yres=0.0;
328:   const PetscScalar  szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
329:   PetscInt           it;
330:   PetscBLASInt       N;
331:   const PetscBLASInt one=1;
332:   PetscBool          converged=PETSC_FALSE;
333:   cublasHandle_t     cublasv2handle;

335:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
336:   PetscCUBLASGetHandle(&cublasv2handle);
337:   N = n*n;
338:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

340:   cudaMalloc((void **)&d_Yold,sizeof(PetscScalar)*N);
341:   cudaMalloc((void **)&d_Z,sizeof(PetscScalar)*N);
342:   cudaMalloc((void **)&d_Zold,sizeof(PetscScalar)*N);
343:   cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);

345:   PetscLogGpuTimeBegin();

347:   /* Z = I; */
348:   cudaMemset(d_Z,0,sizeof(PetscScalar)*N);
349:   set_diagonal(n,d_Z,ld,sone);

351:   /* scale */
352:   cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Z,one);
353:   cublasXnrm2(cublasv2handle,N,d_Z,one,&nrm);
354:   sqrtnrm = PetscSqrtReal(nrm);
355:   alpha = 1.0/nrm;
356:   cublasXscal(cublasv2handle,N,&alpha,d_A,one);
357:   tol *= nrm;
358:   PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
359:   PetscLogGpuFlops(2.0*n*n);

361:   /* Z = I; */
362:   cudaMemset(d_Z,0,sizeof(PetscScalar)*N);
363:   set_diagonal(n,d_Z,ld,sone);

365:   for (it=0;it<NSMAXIT && !converged;it++) {
366:     /* Yold = Y, Zold = Z */
367:     cudaMemcpy(d_Yold,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
368:     cudaMemcpy(d_Zold,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);

370:     /* M = (3*I - Zold*Yold) */
371:     cudaMemset(d_M,0,sizeof(PetscScalar)*N);
372:     set_diagonal(n,d_M,ld,sthree);
373:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&smone,d_Zold,ld,d_Yold,ld,&sone,d_M,ld);

375:     /* Y = (1/2) * Yold * M, Z = (1/2) * M * Zold */
376:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Yold,ld,d_M,ld,&szero,d_A,ld);
377:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_M,ld,d_Zold,ld,&szero,d_Z,ld);

379:     /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
380:     cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Yold,one);
381:     cublasXnrm2(cublasv2handle,N,d_Yold,one,&Yres);
383:     if (Yres<=tol) converged = PETSC_TRUE;
384:     PetscInfo(fn,"it: %" PetscInt_FMT " res: %g\n",it,(double)Yres);

386:     PetscLogGpuFlops(6.0*n*n*n+2.0*n*n);
387:   }


391:   /* undo scaling */
392:   if (inv) {
393:     alpha = 1.0/sqrtnrm;
394:     cublasXscal(cublasv2handle,N,&alpha,d_Z,one);
395:     cudaMemcpy(d_A,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
396:   } else {
397:     alpha = sqrtnrm;
398:     cublasXscal(cublasv2handle,N,&alpha,d_A,one);
399:   }

401:   PetscLogGpuTimeEnd();
402:   cudaFree(d_Yold);
403:   cudaFree(d_Z);
404:   cudaFree(d_Zold);
405:   cudaFree(d_M);
406:   return 0;
407: }

409: #if defined(PETSC_HAVE_MAGMA)
410: #include <slepcmagma.h>

412: /*
413:  * Matrix square root by product form of Denman-Beavers iteration. CUDA version.
414:  * Computes the principal square root of the matrix T using the product form
415:  * of the Denman-Beavers iteration. T is overwritten with sqrtm(T).
416:  */
417: PetscErrorCode FNSqrtmDenmanBeavers_CUDAm(FN fn,PetscBLASInt n,PetscScalar *d_T,PetscBLASInt ld,PetscBool inv)
418: {
419:   PetscScalar    *d_Told,*d_M,*d_invM,*d_work,prod,szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sneg_pfive=-0.5,sp25=0.25,alpha;
420:   PetscReal      tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT;
421:   PetscInt       it,lwork,nb;
422:   PetscBLASInt   N,one=1,*piv=NULL;
423:   PetscBool      converged=PETSC_FALSE,scale;
424:   cublasHandle_t cublasv2handle;

426:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
427:   PetscCUBLASGetHandle(&cublasv2handle);
428:   SlepcMagmaInit();
429:   N = n*n;
430:   scale = PetscDefined(USE_REAL_SINGLE)? PETSC_FALSE: PETSC_TRUE;
431:   tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

433:   /* query work size */
434:   nb = magma_get_xgetri_nb(n);
435:   lwork = nb*n;
436:   PetscMalloc1(n,&piv);
437:   cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);
438:   cudaMalloc((void **)&d_Told,sizeof(PetscScalar)*N);
439:   cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);
440:   cudaMalloc((void **)&d_invM,sizeof(PetscScalar)*N);

442:   PetscLogGpuTimeBegin();
443:   cudaMemcpy(d_M,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
444:   if (inv) {  /* start recurrence with I instead of A */
445:     cudaMemset(d_T,0,sizeof(PetscScalar)*N);
446:     set_diagonal(n,d_T,ld,1.0);
447:   }

449:   for (it=0;it<DBMAXIT && !converged;it++) {

451:     if (scale) { /* g = (abs(det(M)))^(-1/(2*n)); */
452:       cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
453:       PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv);
454:       mult_diagonal(n,d_invM,ld,&prod);
455:       detM = PetscAbsScalar(prod);
456:       g = (detM>PETSC_MAX_REAL)? 0.5: PetscPowReal(detM,-1.0/(2.0*n));
457:       alpha = g;
458:       cublasXscal(cublasv2handle,N,&alpha,d_T,one);
459:       alpha = g*g;
460:       cublasXscal(cublasv2handle,N,&alpha,d_M,one);
461:       PetscLogGpuFlops(2.0*n*n*n/3.0+2.0*n*n);
462:     }

464:     cudaMemcpy(d_Told,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
465:     cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);

467:     PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv);
468:     PetscCallMAGMA(magma_xgetri_gpu,n,d_invM,ld,piv,d_work,lwork);
469:     PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

471:     shift_diagonal(n,d_invM,ld,sone);
472:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Told,ld,d_invM,ld,&szero,d_T,ld);
473:     shift_diagonal(n,d_invM,ld,smone);

475:     cublasXaxpy(cublasv2handle,N,&sone,d_invM,one,d_M,one);
476:     cublasXscal(cublasv2handle,N,&sp25,d_M,one);
477:     shift_diagonal(n,d_M,ld,sneg_pfive);
478:     PetscLogGpuFlops(2.0*n*n*n+2.0*n*n);

480:     cublasXnrm2(cublasv2handle,N,d_M,one,&Mres);
481:     shift_diagonal(n,d_M,ld,sone);

483:     if (scale) {
484:       /* reldiff = norm(T - Told,'fro')/norm(T,'fro'); */
485:       cublasXaxpy(cublasv2handle,N,&smone,d_T,one,d_Told,one);
486:       cublasXnrm2(cublasv2handle,N,d_Told,one,&fnormdiff);
487:       cublasXnrm2(cublasv2handle,N,d_T,one,&fnormT);
488:       PetscLogGpuFlops(7.0*n*n);
489:       reldiff = fnormdiff/fnormT;
490:       PetscInfo(fn,"it: %" PetscInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);
491:       if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch to no scaling. */
492:     }

494:     PetscInfo(fn,"it: %" PetscInt_FMT " Mres: %g\n",it,(double)Mres);
495:     if (Mres<=tol) converged = PETSC_TRUE;
496:   }

499:   PetscLogGpuTimeEnd();
500:   PetscFree(piv);
501:   cudaFree(d_work);
502:   cudaFree(d_Told);
503:   cudaFree(d_M);
504:   cudaFree(d_invM);
505:   return 0;
506: }
507: #endif /* PETSC_HAVE_MAGMA */

509: #endif /* PETSC_HAVE_CUDA */

511: #define ITMAX 5
512: #define SWAP(a,b,t) {t=a;a=b;b=t;}

514: /*
515:    Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
516: */
517: static PetscErrorCode SlepcNormEst1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
518: {
519:   PetscScalar    *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
520:   PetscReal      est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
521:   PetscBLASInt   i,j,t=2,it=0,ind[2],est_j=0,m1;

523:   X = work;
524:   Y = work + 2*n;
525:   Z = work + 4*n;
526:   S = work + 6*n;
527:   S_old = work + 8*n;
528:   zvals = (PetscReal*)(work + 10*n);

530:   for (i=0;i<n;i++) {  /* X has columns of unit 1-norm */
531:     X[i] = 1.0/n;
532:     PetscRandomGetValue(rand,&val);
533:     if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
534:     else X[i+n] = 1.0/n;
535:   }
536:   for (i=0;i<t*n;i++) S[i] = 0.0;
537:   ind[0] = 0; ind[1] = 0;
538:   est_old = 0;
539:   while (1) {
540:     it++;
541:     for (j=0;j<m;j++) {  /* Y = A^m*X */
542:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
543:       if (j<m-1) SWAP(X,Y,aux);
544:     }
545:     for (j=0;j<t;j++) {  /* vals[j] = norm(Y(:,j),1) */
546:       vals[j] = 0.0;
547:       for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
548:     }
549:     if (vals[0]<vals[1]) {
550:       SWAP(vals[0],vals[1],raux);
551:       m1 = 1;
552:     } else m1 = 0;
553:     est = vals[0];
554:     if (est>est_old || it==2) est_j = ind[m1];
555:     if (it>=2 && est<=est_old) {
556:       est = est_old;
557:       break;
558:     }
559:     est_old = est;
560:     if (it>ITMAX) break;
561:     SWAP(S,S_old,aux);
562:     for (i=0;i<t*n;i++) {  /* S = sign(Y) */
563:       S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
564:     }
565:     for (j=0;j<m;j++) {  /* Z = (A^T)^m*S */
566:       PetscCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
567:       if (j<m-1) SWAP(S,Z,aux);
568:     }
569:     maxzval[0] = -1; maxzval[1] = -1;
570:     ind[0] = 0; ind[1] = 0;
571:     for (i=0;i<n;i++) {  /* zvals[i] = norm(Z(i,:),inf) */
572:       zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
573:       if (zvals[i]>maxzval[0]) {
574:         maxzval[0] = zvals[i];
575:         ind[0] = i;
576:       } else if (zvals[i]>maxzval[1]) {
577:         maxzval[1] = zvals[i];
578:         ind[1] = i;
579:       }
580:     }
581:     if (it>=2 && maxzval[0]==zvals[est_j]) break;
582:     for (i=0;i<t*n;i++) X[i] = 0.0;
583:     for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
584:   }
585:   *nrm = est;
586:   /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
587:   PetscLogFlops(4.0*it*m*t*n*n);
588:   return 0;
589: }

591: #define SMALLN 100

593: /*
594:    Estimate norm(A^m,1) (required workspace is 2*n*n)
595: */
596: PetscErrorCode SlepcNormAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
597: {
598:   PetscScalar    *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
599:   PetscReal      rwork[1],tmp;
600:   PetscBLASInt   i,j,one=1;
601:   PetscBool      isrealpos=PETSC_TRUE;

603:   if (n<SMALLN) {   /* compute matrix power explicitly */
604:     if (m==1) {
605:       *nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
606:       PetscLogFlops(1.0*n*n);
607:     } else {  /* m>=2 */
608:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
609:       for (j=0;j<m-2;j++) {
610:         PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
611:         SWAP(v,w,aux);
612:       }
613:       *nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
614:       PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
615:     }
616:   } else {
617:     for (i=0;i<n;i++)
618:       for (j=0;j<n;j++)
619: #if defined(PETSC_USE_COMPLEX)
620:         if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
621: #else
622:         if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
623: #endif
624:     if (isrealpos) {   /* for positive matrices only */
625:       for (i=0;i<n;i++) v[i] = 1.0;
626:       for (j=0;j<m;j++) {  /* w = A'*v */
627:         PetscCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
628:         SWAP(v,w,aux);
629:       }
630:       PetscLogFlops(2.0*n*n*m);
631:       *nrm = 0.0;
632:       for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > *nrm) *nrm = tmp;   /* norm(v,inf) */
633:     } else SlepcNormEst1(n,A,m,work,rand,nrm);
634:   }
635:   return 0;
636: }