Actual source code: fnexp.c

slepc-3.18.0 2022-10-01
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14: #include <slepc/private/fnimpl.h>
 15: #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 19:   *y = PetscExpScalar(x);
 20:   return 0;
 21: }

 23: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 24: {
 25:   *y = PetscExpScalar(x);
 26:   return 0;
 27: }

 29: #define MAX_PADE 6
 30: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 32: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 33: {
 34:   PetscBLASInt      n=0,ld,ld2,*ipiv,info,inc=1;
 35:   PetscInt          m,j,k,sexp;
 36:   PetscBool         odd;
 37:   const PetscInt    p=MAX_PADE;
 38:   PetscReal         c[MAX_PADE+1],s,*rwork;
 39:   PetscScalar       scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 40:   PetscScalar       *Ba,*As,*A2,*Q,*P,*W,*aux;
 41:   const PetscScalar *Aa;

 43:   MatDenseGetArrayRead(A,&Aa);
 44:   MatDenseGetArray(B,&Ba);
 45:   MatGetSize(A,&m,NULL);
 46:   PetscBLASIntCast(m,&n);
 47:   ld  = n;
 48:   ld2 = ld*ld;
 49:   P   = Ba;
 50:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 51:   PetscArraycpy(As,Aa,ld2);

 53:   /* Pade' coefficients */
 54:   c[0] = 1.0;
 55:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 57:   /* Scaling */
 58:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 59:   PetscLogFlops(1.0*n*n);
 60:   if (s>0.5) {
 61:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 62:     scale = PetscPowRealInt(2.0,-sexp);
 63:     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 64:     PetscLogFlops(1.0*n*n);
 65:   } else sexp = 0;

 67:   /* Horner evaluation */
 68:   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 69:   PetscLogFlops(2.0*n*n*n);
 70:   PetscArrayzero(Q,ld2);
 71:   PetscArrayzero(P,ld2);
 72:   for (j=0;j<n;j++) {
 73:     Q[j+j*ld] = c[p];
 74:     P[j+j*ld] = c[p-1];
 75:   }

 77:   odd = PETSC_TRUE;
 78:   for (k=p-1;k>0;k--) {
 79:     if (odd) {
 80:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 81:       SWAP(Q,W,aux);
 82:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 83:       odd = PETSC_FALSE;
 84:     } else {
 85:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 86:       SWAP(P,W,aux);
 87:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 88:       odd = PETSC_TRUE;
 89:     }
 90:     PetscLogFlops(2.0*n*n*n);
 91:   }
 92:   /*if (odd) {
 93:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
 94:     SWAP(Q,W,aux);
 95:     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
 96:     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
 97:     SlepcCheckLapackInfo("gesv",info);
 98:     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
 99:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
100:     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
101:   } else {*/
102:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
103:     SWAP(P,W,aux);
104:     PetscCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
105:     PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
106:     SlepcCheckLapackInfo("gesv",info);
107:     PetscCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
108:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
109:   /*}*/
110:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

112:   for (k=1;k<=sexp;k++) {
113:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
114:     PetscArraycpy(P,W,ld2);
115:   }
116:   if (P!=Ba) PetscArraycpy(Ba,P,ld2);
117:   PetscLogFlops(2.0*n*n*n*sexp);

119:   PetscFree6(Q,W,As,A2,rwork,ipiv);
120:   MatDenseRestoreArrayRead(A,&Aa);
121:   MatDenseRestoreArray(B,&Ba);
122:   return 0;
123: }

125: /*
126:  * Set scaling factor (s) and Pade degree (k,m)
127:  */
128: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
129: {
130:   if (nrm>1) {
131:     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
132:     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
133:     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
134:     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
135:     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
136:     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
137:     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
138:     else               {*s = 1; *k = 1; *m = *k+1;}
139:   } else { /* nrm<1 */
140:     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
141:     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
142:     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
143:     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
144:     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
145:     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
146:     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
147:     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
148:     else                {*s = 0; *k = 1; *m = 0;}
149:   }
150:   return 0;
151: }

153: #if defined(PETSC_HAVE_COMPLEX)
154: /*
155:  * Partial fraction form coefficients.
156:  * If query, the function returns the size necessary to store the coefficients.
157:  */
158: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
159: {
160:   PetscInt i;
161:   const PetscComplex /* m == k+1 */
162:     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
163:                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
164:                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
165:                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
166:                -2.733432894659307e+02                                },
167:     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
168:                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
169:                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
170:                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
171:                 6.286704751729261e+00                               },
172:     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
173:                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
174:                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
175:                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
176:     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
177:                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
178:                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
179:                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
180:     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
181:                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
182:                -1.829749817484586e+01                                },
183:     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
184:                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
185:                 3.637834252744491e+00                                },
186:     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
187:                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
188:     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
189:                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
190:   const PetscComplex /* m == k-1 */
191:     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
192:                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
193:                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
194:                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
195:     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
196:                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
197:                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
198:                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
199:     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
200:                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
201:                -1.734353918633177e+02                                },
202:     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
203:                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
204:                 5.648485971016893e+00                                },
205:     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
206:                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
207:     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
208:                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
209:   const PetscScalar /* m == k-1 */
210:     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
211:     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
212:     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
213:     m1remain2[2] = {-0.5,                   -3.5},
214:     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
215:     remain2[3] = {1.0/2.0, 1, 1};

217:   if (query) { /* query about buffer's size */
218:     if (m==k+1) {
219:       *remain = 0;
220:       *r = *q = k+1;
221:       return 0; /* quick return */
222:     }
223:     if (m==k-1) {
224:       *remain = 2;
225:       if (k==5) *r = *q = 4;
226:       else if (k==4) *r = *q = 3;
227:       else if (k==3) *r = *q = 2;
228:       else if (k==2) *r = *q = 1;
229:     }
230:     if (m==0) {
231:       *r = *q = 0;
232:       *remain = k+1;
233:     }
234:   } else {
235:     if (m==k+1) {
236:       if (k==4) {
237:         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
238:       } else if (k==3) {
239:         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
240:       } else if (k==2) {
241:         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
242:       } else if (k==1) {
243:         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
244:       }
245:       return 0; /* quick return */
246:     }
247:     if (m==k-1) {
248:       if (k==5) {
249:         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
250:         for (i=0;i<2;i++) remain[i] = m1remain5[i];
251:       } else if (k==4) {
252:         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
253:         for (i=0;i<2;i++) remain[i] = m1remain4[i];
254:       } else if (k==3) {
255:         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
256:       } else if (k==2) {
257:         r[0] = -13.5; q[0] = 3;
258:         for (i=0;i<2;i++) remain[i] = m1remain2[i];
259:       }
260:     }
261:     if (m==0) {
262:       r = q = NULL;
263:       if (k==3) {
264:         for (i=0;i<4;i++) remain[i] = remain3[i];
265:       } else if (k==2) {
266:         for (i=0;i<3;i++) remain[i] = remain2[i];
267:       }
268:     }
269:   }
270:   return 0;
271: }

273: /*
274:  * Product form coefficients.
275:  * If query, the function returns the size necessary to store the coefficients.
276:  */
277: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
278: {
279:   PetscInt i;
280:   const PetscComplex /* m == k+1 */
281:   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
282:              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
283:              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
284:              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
285:   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
286:               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
287:               6.286704751729261e+00                                ,
288:               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
289:               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
290:   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
291:              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
292:              -5.648485971016893e+00                                },
293:   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
294:               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
295:               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
296:               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
297:   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
298:              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
299:   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
300:               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
301:               3.637834252744491e+00                               },
302:   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
303:               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
304:   const PetscComplex /* m == k-1 */
305:   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
306:              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
307:              -6.286704751729261e+00                                ,
308:              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
309:              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
310:   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
311:               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
312:               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
313:               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
314:   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
315:              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
316:              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
317:              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
318:   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
319:               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
320:               5.648485971016893e+00                                },
321:   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
322:              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
323:              -3.637834252744491e+00                                },
324:   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
325:               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
326:   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
327:              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};

329:   if (query) {
330:     if (m == k+1) {
331:       *mult = 1;
332:       *p = k;
333:       *q = k+1;
334:       return 0;
335:     }
336:     if (m==k-1) {
337:       *mult = 1;
338:       *p = k;
339:       *q = k-1;
340:     }
341:   } else {
342:     if (m == k+1) {
343:       *mult = PetscPowInt(-1,m);
344:       *mult *= m;
345:       if (k==4) {
346:         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
347:         q[4] = p1q4[4];
348:       } else if (k==3) {
349:         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
350:         q[3] = p1q3[3];
351:       } else if (k==2) {
352:         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
353:         q[2] = p1q2[2];
354:       } else if (k==1) {
355:         p[0] = -3;
356:         for (i=0;i<2;i++) q[i] = p1q1[i];
357:       }
358:       return 0;
359:     }
360:     if (m==k-1) {
361:       *mult = PetscPowInt(-1,m);
362:       *mult /= k;
363:       if (k==5) {
364:         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
365:         p[4] = m1p5[4];
366:       } else if (k==4) {
367:         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
368:         p[3] = m1p4[3];
369:       } else if (k==3) {
370:         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
371:         p[2] = m1p3[2];
372:       } else if (k==2) {
373:         for (i=0;i<2;i++) p[i] = m1p2[i];
374:         q[0] = 3;
375:       }
376:     }
377:   }
378:   return 0;
379: }
380: #endif /* PETSC_HAVE_COMPLEX */

382: #if defined(PETSC_USE_COMPLEX)
383: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
384: {
385:   PetscInt i;

387:   *result=PETSC_TRUE;
388:   for (i=0;i<n&&*result;i++) {
389:     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
390:   }
391:   return 0;
392: }
393: #endif

395: /*
396:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
397:  * and Yuji Nakatsukasa
398:  *
399:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
400:  *     Approximation for the Matrix Exponential",
401:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
402:  *     https://doi.org/10.1137/15M1027553
403:  */
404: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
405: {
406: #if !defined(PETSC_HAVE_COMPLEX)
407:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This function requires C99 or C++ complex support");
408: #else
409:   PetscInt          i,j,n_,s,k,m,mod;
410:   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,info,*piv,minlen,lwork=0,one=1;
411:   PetscReal         nrm,shift=0.0;
412: #if defined(PETSC_USE_COMPLEX)
413:   PetscReal         *rwork=NULL;
414: #endif
415:   PetscComplex      *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
416:   PetscScalar       *Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
417:   const PetscScalar *Aa;
418:   PetscBool         isreal,flg;
419:   PetscBLASInt      query=-1;
420:   PetscScalar       work1,*work;

422:   MatGetSize(A,&n_,NULL);
423:   PetscBLASIntCast(n_,&n);
424:   MatDenseGetArrayRead(A,&Aa);
425:   MatDenseGetArray(B,&Ba);
426:   Ba2 = Ba;
427:   PetscBLASIntCast(n*n,&n2);

429:   PetscMalloc2(n2,&sMaux,n2,&Maux);
430:   Maux2 = Maux;
431:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
432:   if (!flg) {
433:     PetscMalloc2(n,&wr,n,&wi);
434:     PetscArraycpy(sMaux,Aa,n2);
435:     /* estimate rightmost eigenvalue and shift A with it */
436: #if !defined(PETSC_USE_COMPLEX)
437:     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
438:     SlepcCheckLapackInfo("geev",info);
439:     PetscBLASIntCast((PetscInt)work1,&lwork);
440:     PetscMalloc1(lwork,&work);
441:     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
442:     PetscFree(work);
443: #else
444:     PetscArraycpy(Maux,Aa,n2);
445:     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
446:     SlepcCheckLapackInfo("geev",info);
447:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
448:     PetscMalloc2(2*n,&rwork,lwork,&work);
449:     PetscCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
450:     PetscFree2(rwork,work);
451: #endif
452:     SlepcCheckLapackInfo("geev",info);
453:     PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

455:     shift = PetscRealPart(wr[0]);
456:     for (i=1;i<n;i++) {
457:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
458:     }
459:     PetscFree2(wr,wi);
460:   }
461:   /* shift so that largest real part is (about) 0 */
462:   PetscArraycpy(sMaux,Aa,n2);
463:   if (shift) {
464:     for (i=0;i<n;i++) sMaux[i+i*n] -= shift;
465:     PetscLogFlops(1.0*n);
466:   }
467: #if defined(PETSC_USE_COMPLEX)
468:   PetscArraycpy(Maux,Aa,n2);
469:   if (shift) {
470:     for (i=0;i<n;i++) Maux[i+i*n] -= shift;
471:     PetscLogFlops(1.0*n);
472:   }
473: #endif

475:   /* estimate norm(A) and select the scaling factor */
476:   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
477:   PetscLogFlops(1.0*n*n);
478:   sexpm_params(nrm,&s,&k,&m);
479:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
480:     if (shift) expshift = PetscExpReal(shift);
481:     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
482:     if (shift) {
483:       PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
484:       PetscLogFlops(1.0*(n+n2));
485:     } else PetscLogFlops(1.0*n);
486:     PetscArraycpy(Ba,sMaux,n2);
487:     PetscFree2(sMaux,Maux);
488:     MatDenseRestoreArrayRead(A,&Aa);
489:     MatDenseRestoreArray(B,&Ba);
490:     return 0; /* quick return */
491:   }

493:   PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
494:   expmA2 = expmA; RR2 = RR;
495:   /* scale matrix */
496: #if !defined(PETSC_USE_COMPLEX)
497:   for (i=0;i<n2;i++) {
498:     As[i] = sMaux[i];
499:   }
500: #else
501:   PetscArraycpy(As,sMaux,n2);
502: #endif
503:   scale = 1.0/PetscPowRealInt(2.0,s);
504:   PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
505:   SlepcLogFlopsComplex(1.0*n2);

507:   /* evaluate Pade approximant (partial fraction or product form) */
508:   if (fn->method==3 || !m) { /* partial fraction */
509:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
510:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
511:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
512:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
513:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
514:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

516:     PetscArrayzero(expmA,n2);
517: #if !defined(PETSC_USE_COMPLEX)
518:     isreal = PETSC_TRUE;
519: #else
520:     getisreal(n2,Maux,&isreal);
521: #endif
522:     if (isreal) {
523:       rsizediv2 = irsize/2;
524:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
525:         PetscArraycpy(Maux,As,n2);
526:         PetscArrayzero(RR,n2);
527:         for (j=0;j<n;j++) {
528:           Maux[j+j*n] -= p[2*i];
529:           RR[j+j*n] = r[2*i];
530:         }
531:         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
532:         SlepcCheckLapackInfo("gesv",info);
533:         for (j=0;j<n2;j++) {
534:           expmA[j] += RR[j] + PetscConj(RR[j]);
535:         }
536:         /* loop(n) + gesv + loop(n2) */
537:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
538:       }

540:       mod = ipsize % 2;
541:       if (mod) {
542:         PetscArraycpy(Maux,As,n2);
543:         PetscArrayzero(RR,n2);
544:         for (j=0;j<n;j++) {
545:           Maux[j+j*n] -= p[ipsize-1];
546:           RR[j+j*n] = r[irsize-1];
547:         }
548:         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
549:         SlepcCheckLapackInfo("gesv",info);
550:         for (j=0;j<n2;j++) {
551:           expmA[j] += RR[j];
552:         }
553:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
554:       }
555:     } else { /* complex */
556:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
557:         PetscArraycpy(Maux,As,n2);
558:         PetscArrayzero(RR,n2);
559:         for (j=0;j<n;j++) {
560:           Maux[j+j*n] -= p[i];
561:           RR[j+j*n] = r[i];
562:         }
563:         PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
564:         SlepcCheckLapackInfo("gesv",info);
565:         for (j=0;j<n2;j++) {
566:           expmA[j] += RR[j];
567:         }
568:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
569:       }
570:     }
571:     for (i=0;i<iremainsize;i++) {
572:       if (!i) {
573:         PetscArrayzero(RR,n2);
574:         for (j=0;j<n;j++) {
575:           RR[j+j*n] = remainterm[iremainsize-1];
576:         }
577:       } else {
578:         PetscArraycpy(RR,As,n2);
579:         for (j=1;j<i;j++) {
580:           PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
581:           SWAP(RR,Maux,aux);
582:           SlepcLogFlopsComplex(2.0*n*n*n);
583:         }
584:         PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
585:         SlepcLogFlopsComplex(1.0*n2);
586:       }
587:       for (j=0;j<n2;j++) {
588:         expmA[j] += RR[j];
589:       }
590:       SlepcLogFlopsComplex(1.0*n2);
591:     }
592:     PetscFree3(r,p,remainterm);
593:   } else { /* product form, default */
594:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
595:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
596:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
597:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
598:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

600:     PetscArrayzero(expmA,n2);
601:     for (i=0;i<n;i++) { /* initialize */
602:       expmA[i+i*n] = 1.0;
603:     }
604:     minlen = PetscMin(irsize,ipsize);
605:     for (i=0;i<minlen;i++) {
606:       PetscArraycpy(RR,As,n2);
607:       for (j=0;j<n;j++) {
608:         RR[j+j*n] -= rootp[i];
609:       }
610:       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
611:       SWAP(expmA,Maux,aux);
612:       PetscArraycpy(RR,As,n2);
613:       for (j=0;j<n;j++) {
614:         RR[j+j*n] -= rootq[i];
615:       }
616:       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
617:       SlepcCheckLapackInfo("gesv",info);
618:       /* loop(n) + gemm + loop(n) + gesv */
619:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
620:     }
621:     /* extra numerator */
622:     for (i=minlen;i<irsize;i++) {
623:       PetscArraycpy(RR,As,n2);
624:       for (j=0;j<n;j++) {
625:         RR[j+j*n] -= rootp[i];
626:       }
627:       PetscCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
628:       SWAP(expmA,Maux,aux);
629:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
630:     }
631:     /* extra denominator */
632:     for (i=minlen;i<ipsize;i++) {
633:       PetscArraycpy(RR,As,n2);
634:       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
635:       PetscCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
636:       SlepcCheckLapackInfo("gesv",info);
637:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
638:     }
639:     PetscCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
640:     SlepcLogFlopsComplex(1.0*n2);
641:     PetscFree2(rootp,rootq);
642:   }

644: #if !defined(PETSC_USE_COMPLEX)
645:   for (i=0;i<n2;i++) {
646:     Ba2[i] = PetscRealPartComplex(expmA[i]);
647:   }
648: #else
649:   PetscArraycpy(Ba2,expmA,n2);
650: #endif

652:   /* perform repeated squaring */
653:   for (i=0;i<s;i++) { /* final squaring */
654:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
655:     SWAP(Ba2,sMaux,saux);
656:     PetscLogFlops(2.0*n*n*n);
657:   }
658:   if (Ba2!=Ba) {
659:     PetscArraycpy(Ba,Ba2,n2);
660:     sMaux = Ba2;
661:   }
662:   if (shift) {
663:     expshift = PetscExpReal(shift);
664:     PetscCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
665:     PetscLogFlops(1.0*n2);
666:   }

668:   /* restore pointers */
669:   Maux = Maux2; expmA = expmA2; RR = RR2;
670:   PetscFree2(sMaux,Maux);
671:   PetscFree4(expmA,As,RR,piv);
672:   MatDenseRestoreArrayRead(A,&Aa);
673:   MatDenseRestoreArray(B,&Ba);
674:   return 0;
675: #endif
676: }

678: #define SMALLN 100

680: /*
681:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
682:  */
683: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
684: {
685:   PetscScalar    *Ascaled=work;
686:   PetscReal      nrm,alpha,beta,rwork[1];
687:   PetscInt       t;
688:   PetscBLASInt   i,j;

690:   beta = PetscPowReal(coeff,1.0/(2*m+1));
691:   for (i=0;i<n;i++)
692:     for (j=0;j<n;j++)
693:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
694:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
695:   PetscLogFlops(2.0*n*n);
696:   SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
697:   alpha /= nrm;
698:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
699:   return t;
700: }

702: /*
703:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
704:  */
705: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
706: {
707:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
708:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
709:   PetscBLASInt    n_=0,n2,one=1;
710:   PetscRandom     rand;
711:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
712:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
713:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
714:                                2.539398330063230e-001,    /* m = 5  */
715:                                9.504178996162932e-001,    /* m = 7  */
716:                                2.097847961257068e+000,    /* m = 9  */
717:                                5.371920351148152e+000 };  /* m = 13 */

719:   *s = 0;
720:   *m = 13;
721:   PetscBLASIntCast(n,&n_);
722:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
723:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
724:   if (d4==0.0) { /* safeguard for the case A = 0 */
725:     *m = 3;
726:     goto done;
727:   }
728:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
729:   PetscLogFlops(2.0*n*n);
730:   eta1 = PetscMax(d4,d6);
731:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
732:     *m = 3;
733:     goto done;
734:   }
735:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
736:     *m = 5;
737:     goto done;
738:   }
739:   if (n<SMALLN) {
740:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
741:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
742:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
743:   } else {
744:     SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
745:     d8 = PetscPowReal(d8,1.0/8.0);
746:   }
747:   eta3 = PetscMax(d6,d8);
748:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
749:     *m = 7;
750:     goto done;
751:   }
752:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
753:     *m = 9;
754:     goto done;
755:   }
756:   if (n<SMALLN) {
757:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
758:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
759:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
760:   } else {
761:     SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
762:     d10 = PetscPowReal(d10,1.0/10.0);
763:   }
764:   eta4 = PetscMax(d8,d10);
765:   eta5 = PetscMin(eta3,eta4);
766:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
767:   if (*s) {
768:     Ascaled = work+3*n*n;
769:     n2 = n_*n_;
770:     PetscCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
771:     sfactor = PetscPowRealInt(2.0,-(*s));
772:     PetscCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
773:     PetscLogFlops(1.0*n*n);
774:   } else Ascaled = A;
775:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
776: done:
777:   PetscRandomDestroy(&rand);
778:   return 0;
779: }

781: /*
782:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
783:  *
784:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
785:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
786:  */
787: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
788: {
789:   PetscBLASInt      n_=0,n2,*ipiv,info,one=1;
790:   PetscInt          n,m,j,s;
791:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
792:   PetscScalar       *Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
793:   const PetscScalar *Aa,*c;
794:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
795:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
796:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
797:   const PetscScalar c9[10]  = { 17643225600.0, 8821612800.0, 2075673600, 302702400, 30270240,
798:                                 2162160, 110880, 3960, 90, 1 };
799:   const PetscScalar c13[14] = { 64764752532480000.0, 32382376266240000.0, 7771770303897600.0,
800:                                 1187353796428800.0,  129060195264000.0,   10559470521600.0,
801:                                 670442572800.0,      33522128640.0,       1323241920.0,
802:                                 40840800,          960960,            16380,  182,  1 };

804:   MatDenseGetArrayRead(A,&Aa);
805:   MatDenseGetArray(B,&Ba);
806:   MatGetSize(A,&n,NULL);
807:   PetscBLASIntCast(n,&n_);
808:   n2 = n_*n_;
809:   PetscMalloc2(8*n*n,&work,n,&ipiv);

811:   /* Matrix powers */
812:   Apowers[0] = work;                  /* Apowers[0] = A   */
813:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
814:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
815:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
816:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

818:   PetscArraycpy(Apowers[0],Aa,n2);
819:   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
820:   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
821:   PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
822:   PetscLogFlops(6.0*n*n*n);

824:   /* Compute scaling parameter and order of Pade approximant */
825:   expm_params(n,Apowers,&s,&m,Apowers[4]);

827:   if (s) { /* rescale */
828:     for (j=0;j<4;j++) {
829:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
830:       PetscCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
831:     }
832:     PetscLogFlops(4.0*n*n);
833:   }

835:   /* Evaluate the Pade approximant */
836:   switch (m) {
837:     case 3:  c = c3;  break;
838:     case 5:  c = c5;  break;
839:     case 7:  c = c7;  break;
840:     case 9:  c = c9;  break;
841:     case 13: c = c13; break;
842:     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
843:   }
844:   P = Ba;
845:   Q = Apowers[4] + n*n;
846:   W = Q + n*n;
847:   switch (m) {
848:     case 3:
849:     case 5:
850:     case 7:
851:     case 9:
852:       if (m==9) PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
853:       PetscArrayzero(P,n2);
854:       PetscArrayzero(Q,n2);
855:       for (j=0;j<n;j++) {
856:         P[j+j*n] = c[1];
857:         Q[j+j*n] = c[0];
858:       }
859:       for (j=m;j>=3;j-=2) {
860:         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
861:         PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
862:         PetscLogFlops(4.0*n*n);
863:       }
864:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
865:       PetscLogFlops(2.0*n*n*n);
866:       SWAP(P,W,aux);
867:       break;
868:     case 13:
869:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
870:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
871:       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
872:       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
873:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
874:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
875:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
876:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
877:       PetscArrayzero(P,n2);
878:       for (j=0;j<n;j++) P[j+j*n] = c[1];
879:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
880:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
881:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
882:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
883:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
884:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
885:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
886:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
887:       PetscCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
888:       PetscCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
889:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
890:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
891:       PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
892:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
893:       PetscArrayzero(Q,n2);
894:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
895:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
896:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
897:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
898:       PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
899:       PetscLogFlops(7.0*n*n);
900:       break;
901:     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
902:   }
903:   PetscCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
904:   PetscCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
905:   SlepcCheckLapackInfo("gesv",info);
906:   PetscCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
907:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
908:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

910:   /* Squaring */
911:   for (j=1;j<=s;j++) {
912:     PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
913:     SWAP(P,W,aux);
914:   }
915:   if (P!=Ba) PetscArraycpy(Ba,P,n2);
916:   PetscLogFlops(2.0*n*n*n*s);

918:   PetscFree2(work,ipiv);
919:   MatDenseRestoreArrayRead(A,&Aa);
920:   MatDenseRestoreArray(B,&Ba);
921:   return 0;
922: }

924: #if defined(PETSC_HAVE_CUDA)
925: #include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
926: #include <slepccublas.h>

928: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDA(FN fn,Mat A,Mat B)
929: {
930:   PetscBLASInt      n=0,ld,ld2,*d_ipiv,*d_info,info,one=1;
931:   PetscInt          m,k,sexp;
932:   PetscBool         odd;
933:   const PetscInt    p=MAX_PADE;
934:   PetscReal         c[MAX_PADE+1],s;
935:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
936:   const PetscScalar *Aa;
937:   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux,**ppP,**d_ppP,**ppQ,**d_ppQ;
938:   cublasHandle_t    cublasv2handle;

940:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
941:   PetscCUBLASGetHandle(&cublasv2handle);
942:   MatGetSize(A,&m,NULL);
943:   PetscBLASIntCast(m,&n);
944:   ld  = n;
945:   ld2 = ld*ld;
946:   if (A==B) {
947:     cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);
948:     MatDenseCUDAGetArrayRead(A,&Aa);
949:     cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
950:     MatDenseCUDARestoreArrayRead(A,&Aa);
951:   } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As);
952:   MatDenseCUDAGetArrayWrite(B,&d_Ba);

954:   cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);
955:   cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);
956:   cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);
957:   cudaMalloc((void **)&d_ipiv,sizeof(PetscBLASInt)*ld);
958:   cudaMalloc((void **)&d_info,sizeof(PetscBLASInt));
959:   cudaMalloc((void **)&d_ppP,sizeof(PetscScalar*));
960:   cudaMalloc((void **)&d_ppQ,sizeof(PetscScalar*));

962:   PetscMalloc1(1,&ppP);
963:   PetscMalloc1(1,&ppQ);

965:   d_P = d_Ba;
966:   PetscLogGpuTimeBegin();

968:   /* Pade' coefficients */
969:   c[0] = 1.0;
970:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

972:   /* Scaling */
973:   cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);
974:   if (s>0.5) {
975:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
976:     scale = PetscPowRealInt(2.0,-sexp);
977:     cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
978:     PetscLogGpuFlops(1.0*n*n);
979:   } else sexp = 0;

981:   /* Horner evaluation */
982:   cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);
983:   PetscLogGpuFlops(2.0*n*n*n);
984:   cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2);
985:   cudaMemset(d_P,0,sizeof(PetscScalar)*ld2);
986:   set_diagonal(n,d_Q,ld,c[p]);
987:   set_diagonal(n,d_P,ld,c[p-1]);

989:   odd = PETSC_TRUE;
990:   for (k=p-1;k>0;k--) {
991:     if (odd) {
992:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);
993:       SWAP(d_Q,d_W,aux);
994:       shift_diagonal(n,d_Q,ld,c[k-1]);
995:       odd = PETSC_FALSE;
996:     } else {
997:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);
998:       SWAP(d_P,d_W,aux);
999:       shift_diagonal(n,d_P,ld,c[k-1]);
1000:       odd = PETSC_TRUE;
1001:     }
1002:     PetscLogGpuFlops(2.0*n*n*n);
1003:   }
1004:   if (odd) {
1005:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);
1006:     SWAP(d_Q,d_W,aux);
1007:     cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);

1009:     ppQ[0] = d_Q;
1010:     ppP[0] = d_P;
1011:     cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1012:     cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);

1014:     cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);
1015:     cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);
1018:     cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);
1021:     cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1022:     shift_diagonal(n,d_P,ld,sone);
1023:     cublasXscal(cublasv2handle,ld2,&smone,d_P,one);
1024:   } else {
1025:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);
1026:     SWAP(d_P,d_W,aux);
1027:     cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);

1029:     ppQ[0] = d_Q;
1030:     ppP[0] = d_P;
1031:     cudaMemcpy(d_ppQ,ppQ,sizeof(PetscScalar*),cudaMemcpyHostToDevice);
1032:     cudaMemcpy(d_ppP,ppP,sizeof(PetscScalar*),cudaMemcpyHostToDevice);

1034:     cublasXgetrfBatched(cublasv2handle,n,d_ppQ,ld,d_ipiv,d_info,one);
1035:     cudaMemcpy(&info,d_info,sizeof(PetscBLASInt),cudaMemcpyDeviceToHost);
1038:     cublasXgetrsBatched(cublasv2handle,CUBLAS_OP_N,n,n,(const PetscScalar **)d_ppQ,ld,d_ipiv,d_ppP,ld,&info,one);
1041:     cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1042:     shift_diagonal(n,d_P,ld,sone);
1043:   }
1044:   PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1046:   for (k=1;k<=sexp;k++) {
1047:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);
1048:     cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1049:   }
1050:   PetscLogGpuFlops(2.0*n*n*n*sexp);

1052:   PetscLogGpuTimeEnd();
1053:   cudaFree(d_Q);
1054:   cudaFree(d_W);
1055:   cudaFree(d_A2);
1056:   cudaFree(d_ipiv);
1057:   cudaFree(d_info);
1058:   cudaFree(d_ppP);
1059:   cudaFree(d_ppQ);

1061:   PetscFree(ppP);
1062:   PetscFree(ppQ);

1064:   if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1065:   if (A!=B) {
1066:     if (s>0.5) {  /* undo scaling */
1067:       scale = 1.0/scale;
1068:       cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1069:     }
1070:     MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As);
1071:   } else cudaFree(d_As);
1072:   MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1073:   return 0;
1074: }

1076: #if defined(PETSC_HAVE_MAGMA)
1077: #include <slepcmagma.h>

1079: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade_CUDAm(FN fn,Mat A,Mat B)
1080: {
1081:   PetscBLASInt      n=0,ld,ld2,*piv,one=1;
1082:   PetscInt          m,k,sexp;
1083:   PetscBool         odd;
1084:   const PetscInt    p=MAX_PADE;
1085:   PetscReal         c[MAX_PADE+1],s;
1086:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1087:   const PetscScalar *Aa;
1088:   PetscScalar       *d_Ba,*d_As,*d_A2,*d_Q,*d_P,*d_W,*aux;
1089:   cublasHandle_t    cublasv2handle;

1091:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1092:   PetscCUBLASGetHandle(&cublasv2handle);
1093:   SlepcMagmaInit();
1094:   MatGetSize(A,&m,NULL);
1095:   PetscBLASIntCast(m,&n);
1096:   ld  = n;
1097:   ld2 = ld*ld;
1098:   if (A==B) {
1099:     cudaMalloc((void **)&d_As,sizeof(PetscScalar)*m*m);
1100:     MatDenseCUDAGetArrayRead(A,&Aa);
1101:     cudaMemcpy(d_As,Aa,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1102:     MatDenseCUDARestoreArrayRead(A,&Aa);
1103:   } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_As);
1104:   MatDenseCUDAGetArrayWrite(B,&d_Ba);

1106:   cudaMalloc((void **)&d_Q,sizeof(PetscScalar)*m*m);
1107:   cudaMalloc((void **)&d_W,sizeof(PetscScalar)*m*m);
1108:   cudaMalloc((void **)&d_A2,sizeof(PetscScalar)*m*m);

1110:   PetscMalloc1(n,&piv);

1112:   d_P = d_Ba;
1113:   PetscLogGpuTimeBegin();

1115:   /* Pade' coefficients */
1116:   c[0] = 1.0;
1117:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

1119:   /* Scaling */
1120:   cublasXnrm2(cublasv2handle,ld2,d_As,one,&s);
1121:   PetscLogGpuFlops(1.0*n*n);

1123:   if (s>0.5) {
1124:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
1125:     scale = PetscPowRealInt(2.0,-sexp);
1126:     cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1127:     PetscLogGpuFlops(1.0*n*n);
1128:   } else sexp = 0;

1130:   /* Horner evaluation */
1131:   cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_As,ld,d_As,ld,&szero,d_A2,ld);
1132:   PetscLogGpuFlops(2.0*n*n*n);
1133:   cudaMemset(d_Q,0,sizeof(PetscScalar)*ld2);
1134:   cudaMemset(d_P,0,sizeof(PetscScalar)*ld2);
1135:   set_diagonal(n,d_Q,ld,c[p]);
1136:   set_diagonal(n,d_P,ld,c[p-1]);

1138:   odd = PETSC_TRUE;
1139:   for (k=p-1;k>0;k--) {
1140:     if (odd) {
1141:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_A2,ld,&szero,d_W,ld);
1142:       SWAP(d_Q,d_W,aux);
1143:       shift_diagonal(n,d_Q,ld,c[k-1]);
1144:       odd = PETSC_FALSE;
1145:     } else {
1146:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_A2,ld,&szero,d_W,ld);
1147:       SWAP(d_P,d_W,aux);
1148:       shift_diagonal(n,d_P,ld,c[k-1]);
1149:       odd = PETSC_TRUE;
1150:     }
1151:     PetscLogGpuFlops(2.0*n*n*n);
1152:   }
1153:   if (odd) {
1154:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Q,ld,d_As,ld,&szero,d_W,ld);
1155:     SWAP(d_Q,d_W,aux);
1156:     cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1157:     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1158:     cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1159:     shift_diagonal(n,d_P,ld,sone);
1160:     cublasXscal(cublasv2handle,ld2,&smone,d_P,one);
1161:   } else {
1162:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_As,ld,&szero,d_W,ld);
1163:     SWAP(d_P,d_W,aux);
1164:     cublasXaxpy(cublasv2handle,ld2,&smone,d_P,one,d_Q,one);
1165:     PetscCallMAGMA(magma_xgesv_gpu,n,n,d_Q,ld,piv,d_P,ld);
1166:     cublasXscal(cublasv2handle,ld2,&stwo,d_P,one);
1167:     shift_diagonal(n,d_P,ld,sone);
1168:   }
1169:   PetscLogGpuFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

1171:   for (k=1;k<=sexp;k++) {
1172:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_P,ld,d_P,ld,&szero,d_W,ld);
1173:     cudaMemcpy(d_P,d_W,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1174:   }
1175:   PetscLogGpuFlops(2.0*n*n*n*sexp);

1177:   PetscLogGpuTimeEnd();
1178:   cudaFree(d_Q);
1179:   cudaFree(d_W);
1180:   cudaFree(d_A2);
1181:   PetscFree(piv);

1183:   if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,sizeof(PetscScalar)*ld2,cudaMemcpyDeviceToDevice);
1184:   if (A!=B) {
1185:     if (s>0.5) {  /* undo scaling */
1186:       scale = 1.0/scale;
1187:       cublasXscal(cublasv2handle,ld2,&scale,d_As,one);
1188:     }
1189:     MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_As);
1190:   } else cudaFree(d_As);
1191:   MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1192:   return 0;
1193: }

1195: /*
1196:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
1197:  *
1198:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
1199:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1200:  */
1201: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham_CUDAm(FN fn,Mat A,Mat B)
1202: {
1203:   PetscBLASInt      n_=0,n2,*ipiv,one=1;
1204:   PetscInt          n,m,j,s;
1205:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1206:   PetscScalar       *d_Ba,*Apowers[5],*d_Apowers[5],*d_Q,*d_P,*d_W,*work,*d_work,*aux;
1207:   const PetscScalar *Aa,*c;
1208:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
1209:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
1210:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1211:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1212:     2162160, 110880, 3960, 90, 1 };
1213:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1214:     1187353796428800,  129060195264000,   10559470521600,
1215:     670442572800,      33522128640,       1323241920,
1216:     40840800,          960960,            16380,  182,  1 };
1217:   cublasHandle_t    cublasv2handle;

1219:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1220:   PetscCUBLASGetHandle(&cublasv2handle);
1221:   SlepcMagmaInit();
1222:   MatGetSize(A,&n,NULL);
1223:   PetscBLASIntCast(n,&n_);
1224:   n2 = n_*n_;
1225:   PetscMalloc2(8*n*n,&work,n,&ipiv);
1226:   /* Matrix powers */
1227:   Apowers[0] = work;                  /* Apowers[0] = A   */
1228:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
1229:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
1230:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
1231:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */
1232:   if (A==B) {
1233:     cudaMalloc((void**)&d_work,7*n*n*sizeof(PetscScalar));
1234:     d_Apowers[0] = d_work;              /* d_Apowers[0] = A   */
1235:     d_Apowers[1] = d_Apowers[0] + n*n;  /* d_Apowers[1] = A^2 */
1236:     MatDenseCUDAGetArrayRead(A,&Aa);
1237:     cudaMemcpy(d_Apowers[0],Aa,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1238:     MatDenseCUDARestoreArrayRead(A,&Aa);
1239:   } else {
1240:     cudaMalloc((void**)&d_work,6*n*n*sizeof(PetscScalar));
1241:     MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Apowers[0]);
1242:     d_Apowers[1] = d_work;              /* d_Apowers[1] = A^2 */
1243:   }
1244:   MatDenseCUDAGetArrayWrite(B,&d_Ba);
1245:   d_Apowers[2] = d_Apowers[1] + n*n;    /* d_Apowers[2] = A^4 */
1246:   d_Apowers[3] = d_Apowers[2] + n*n;    /* d_Apowers[3] = A^6 */
1247:   d_Apowers[4] = d_Apowers[3] + n*n;    /* d_Apowers[4] = A^8 */
1248:   d_Q = d_Apowers[4] + n*n;
1249:   d_W = d_Q + n*n;

1251:   PetscLogGpuTimeBegin();

1253:   cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_Apowers[0],n_,&szero,d_Apowers[1],n_);
1254:   cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[1],n_,&szero,d_Apowers[2],n_);
1255:   cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[2],n_,&szero,d_Apowers[3],n_);
1256:   PetscLogGpuFlops(6.0*n*n*n);

1258:   cudaMemcpy(Apowers[0],d_Apowers[0],n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1259:   cudaMemcpy(Apowers[1],d_Apowers[1],3*n2*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
1260:   PetscLogGpuToCpu(4*n2*sizeof(PetscScalar));
1261:   /* Compute scaling parameter and order of Pade approximant */
1262:   expm_params(n,Apowers,&s,&m,Apowers[4]);

1264:   if (s) { /* rescale */
1265:     for (j=0;j<4;j++) {
1266:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1267:       cublasXscal(cublasv2handle,n2,&scale,d_Apowers[j],one);
1268:     }
1269:     PetscLogGpuFlops(4.0*n*n);
1270:   }

1272:   /* Evaluate the Pade approximant */
1273:   switch (m) {
1274:     case 3:  c = c3;  break;
1275:     case 5:  c = c5;  break;
1276:     case 7:  c = c7;  break;
1277:     case 9:  c = c9;  break;
1278:     case 13: c = c13; break;
1279:     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1280:   }
1281:   d_P = d_Ba;
1282:   switch (m) {
1283:     case 3:
1284:     case 5:
1285:     case 7:
1286:     case 9:
1287:       if (m==9) cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[1],n_,d_Apowers[3],n_,&szero,d_Apowers[4],n_);
1288:       cudaMemset(d_P,0,sizeof(PetscScalar)*n2);
1289:       cudaMemset(d_Q,0,sizeof(PetscScalar)*n2);
1290:       set_diagonal(n,d_P,n,c[1]);
1291:       set_diagonal(n,d_Q,n,c[0]);
1292:       for (j=m;j>=3;j-=2) {
1293:         cublasXaxpy(cublasv2handle,n2,&c[j],d_Apowers[(j+1)/2-1],one,d_P,one);
1294:         cublasXaxpy(cublasv2handle,n2,&c[j-1],d_Apowers[(j+1)/2-1],one,d_Q,one);
1295:         PetscLogGpuFlops(4.0*n*n);
1296:       }
1297:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_P,n_,&szero,d_W,n_);
1298:       PetscLogGpuFlops(2.0*n*n*n);
1299:       SWAP(d_P,d_W,aux);
1300:       break;
1301:     case 13:
1302:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1303:           + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
1304:       cudaMemcpy(d_P,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1305:       cublasXscal(cublasv2handle,n2,&c[13],d_P,one);
1306:       cublasXaxpy(cublasv2handle,n2,&c[11],d_Apowers[2],one,d_P,one);
1307:       cublasXaxpy(cublasv2handle,n2,&c[9],d_Apowers[1],one,d_P,one);
1308:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_P,n_,&szero,d_W,n_);
1309:       PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);

1311:       cudaMemset(d_P,0,sizeof(PetscScalar)*n2);
1312:       set_diagonal(n,d_P,n,c[1]);
1313:       cublasXaxpy(cublasv2handle,n2,&c[7],d_Apowers[3],one,d_P,one);
1314:       cublasXaxpy(cublasv2handle,n2,&c[5],d_Apowers[2],one,d_P,one);
1315:       cublasXaxpy(cublasv2handle,n2,&c[3],d_Apowers[1],one,d_P,one);
1316:       cublasXaxpy(cublasv2handle,n2,&sone,d_P,one,d_W,one);
1317:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[0],n_,d_W,n_,&szero,d_P,n_);
1318:       PetscLogGpuFlops(7.0*n*n+2.0*n*n*n);
1319:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1320:           + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
1321:       cudaMemcpy(d_Q,d_Apowers[3],n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1322:       cublasXscal(cublasv2handle,n2,&c[12],d_Q,one);
1323:       cublasXaxpy(cublasv2handle,n2,&c[10],d_Apowers[2],one,d_Q,one);
1324:       cublasXaxpy(cublasv2handle,n2,&c[8],d_Apowers[1],one,d_Q,one);
1325:       cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_Apowers[3],n_,d_Q,n_,&szero,d_W,n_);
1326:       PetscLogGpuFlops(5.0*n*n+2.0*n*n*n);
1327:       cudaMemset(d_Q,0,sizeof(PetscScalar)*n2);
1328:       set_diagonal(n,d_Q,n,c[0]);
1329:       cublasXaxpy(cublasv2handle,n2,&c[6],d_Apowers[3],one,d_Q,one);
1330:       cublasXaxpy(cublasv2handle,n2,&c[4],d_Apowers[2],one,d_Q,one);
1331:       cublasXaxpy(cublasv2handle,n2,&c[2],d_Apowers[1],one,d_Q,one);
1332:       cublasXaxpy(cublasv2handle,n2,&sone,d_W,one,d_Q,one);
1333:       PetscLogGpuFlops(7.0*n*n);
1334:       break;
1335:     default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %" PetscInt_FMT,m);
1336:   }
1337:   cublasXaxpy(cublasv2handle,n2,&smone,d_P,one,d_Q,one);

1339:   PetscCallMAGMA(magma_xgesv_gpu,n_,n_,d_Q,n_,ipiv,d_P,n_);

1341:   cublasXscal(cublasv2handle,n2,&stwo,d_P,one);
1342:   shift_diagonal(n,d_P,n,sone);
1343:   PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n);

1345:   /* Squaring */
1346:   for (j=1;j<=s;j++) {
1347:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n_,n_,n_,&sone,d_P,n_,d_P,n_,&szero,d_W,n_);
1348:     SWAP(d_P,d_W,aux);
1349:   }
1350:   PetscLogGpuFlops(2.0*n*n*n*s);
1351:   PetscLogGpuTimeEnd();

1353:   PetscFree2(work,ipiv);
1354:   if (d_P!=d_Ba) cudaMemcpy(d_Ba,d_P,n2*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);
1355:   if (A!=B) {
1356:     if (s>0.5) {  /* undo scaling */
1357:       scale = 1.0/PetscPowRealInt(2.0,-s);
1358:       cublasXscal(cublasv2handle,n2,&scale,d_Apowers[0],one);
1359:     }
1360:     MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Apowers[0]);
1361:   }
1362:   MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1363:   cudaFree(d_work);
1364:   return 0;
1365: }

1367: /*
1368:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
1369:  * and Yuji Nakatsukasa
1370:  *
1371:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade'
1372:  *     Approximation for the Matrix Exponential",
1373:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
1374:  *     https://doi.org/10.1137/15M1027553
1375:  */
1376: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm(FN fn,Mat A,Mat B)
1377: {
1378:   PetscInt          i,j,n_,s,k,m,mod;
1379:   PetscBLASInt      n=0,n2=0,irsize=0,rsizediv2,ipsize=0,iremainsize=0,query=-1,*piv,minlen,lwork=0,one=1;
1380:   PetscReal         nrm,shift=0.0,rone=1.0,rzero=0.0;
1381: #if defined(PETSC_USE_COMPLEX)
1382:   PetscReal         *rwork=NULL;
1383: #endif
1384:   PetscComplex      *d_As,*d_RR,*d_RR2,*d_expmA,*d_expmA2,*d_Maux,*d_Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
1385:   PetscScalar       *d_Aa,*d_Ba,*d_Ba2,*Maux,*d_sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
1386:   const PetscScalar *Aa;
1387:   PetscBool         isreal,*d_isreal,flg;
1388:   cublasHandle_t    cublasv2handle;

1390:   PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
1391:   PetscCUBLASGetHandle(&cublasv2handle);
1392:   SlepcMagmaInit();
1393:   MatGetSize(A,&n_,NULL);
1394:   PetscBLASIntCast(n_,&n);
1395:   PetscBLASIntCast(n*n,&n2);

1397:   if (A==B) {
1398:     cudaMalloc((void **)&d_Aa,sizeof(PetscScalar)*n2);
1399:     MatDenseCUDAGetArrayRead(A,&Aa);
1400:     cudaMemcpy(d_Aa,Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1401:     MatDenseCUDARestoreArrayRead(A,&Aa);
1402:   } else MatDenseCUDAGetArrayRead(A,(const PetscScalar**)&d_Aa);
1403:   MatDenseCUDAGetArrayWrite(B,&d_Ba);
1404:   d_Ba2 = d_Ba;

1406:   cudaMalloc((void **)&d_isreal,sizeof(PetscBool));
1407:   cudaMalloc((void **)&d_sMaux,sizeof(PetscScalar)*n2);
1408:   cudaMalloc((void **)&d_Maux,sizeof(PetscComplex)*n2);

1410:   PetscLogGpuTimeBegin();
1411:   d_Maux2 = d_Maux;
1412:   PetscOptionsGetReal(NULL,NULL,"-fn_expm_estimated_eig",&shift,&flg);
1413:   if (!flg) {
1414:     PetscMalloc2(n,&wr,n,&wi);
1415:     /* estimate rightmost eigenvalue and shift A with it */
1416:     PetscMalloc1(n2,&Maux);
1417:     MatDenseGetArrayRead(A,&Aa);
1418:     PetscArraycpy(Maux,Aa,n2);
1419:     MatDenseRestoreArrayRead(A,&Aa);
1420: #if !defined(PETSC_USE_COMPLEX)
1421:     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,&work1,query);
1422:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1423:     PetscMalloc1(lwork,&work);
1424:     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,wi,NULL,n,NULL,n,work,lwork);
1425:     PetscFree(work);
1426: #else
1427:     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,&work1,query,rwork);
1428:     PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
1429:     PetscMalloc2(2*n,&rwork,lwork,&work);
1430:     PetscCallMAGMA(magma_xgeev,MagmaNoVec,MagmaNoVec,n,Maux,n,wr,NULL,n,NULL,n,work,lwork,rwork);
1431:     PetscFree2(rwork,work);
1432: #endif
1433:     PetscFree(Maux);
1434:     PetscLogGpuFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

1436:     shift = PetscRealPart(wr[0]);
1437:     for (i=1;i<n;i++) {
1438:       if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
1439:     }
1440:     PetscFree2(wr,wi);
1441:   }
1442:   /* shift so that largest real part is (about) 0 */
1443:   cudaMemcpy(d_sMaux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1444:   if (shift) {
1445:     shift_diagonal(n,d_sMaux,n,-shift);
1446:     PetscLogGpuFlops(1.0*n);
1447:   }
1448: #if defined(PETSC_USE_COMPLEX)
1449:   cudaMemcpy(d_Maux,d_Aa,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1450:   if (shift) {
1451:     shift_diagonal(n,d_Maux,n,-shift);
1452:     PetscLogGpuFlops(1.0*n);
1453:   }
1454: #endif
1455:   if (A!=B) MatDenseCUDARestoreArrayRead(A,(const PetscScalar**)&d_Aa);
1456:   else cudaFree(d_Aa);

1458:   /* estimate norm(A) and select the scaling factor */
1459:   cublasXnrm2(cublasv2handle,n2,d_sMaux,one,&nrm);
1460:   PetscLogGpuFlops(2.0*n*n);
1461:   sexpm_params(nrm,&s,&k,&m);
1462:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
1463:     if (shift) expshift = PetscExpReal(shift);
1464:     shift_Cdiagonal(n,d_Maux,n,rone,rzero);
1465:     if (shift) {
1466:       cublasXscal(cublasv2handle,n2,&expshift,d_sMaux,one);
1467:       PetscLogGpuFlops(1.0*(n+n2));
1468:     } else PetscLogGpuFlops(1.0*n);
1469:     cudaMemcpy(d_Ba,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1470:     cudaFree(d_isreal);
1471:     cudaFree(d_sMaux);
1472:     cudaFree(d_Maux);
1473:     MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1474:     return 0; /* quick return */
1475:   }

1477:   cudaMalloc((void **)&d_expmA,sizeof(PetscComplex)*n2);
1478:   cudaMalloc((void **)&d_As,sizeof(PetscComplex)*n2);
1479:   cudaMalloc((void **)&d_RR,sizeof(PetscComplex)*n2);
1480:   d_expmA2 = d_expmA; d_RR2 = d_RR;
1481:   PetscMalloc1(n,&piv);
1482:   /* scale matrix */
1483: #if !defined(PETSC_USE_COMPLEX)
1484:   copy_array2D_S2C(n,n,d_As,n,d_sMaux,n);
1485: #else
1486:   cudaMemcpy(d_As,d_sMaux,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1487: #endif
1488:   scale = 1.0/PetscPowRealInt(2.0,s);
1489:   cublasXCscal(cublasv2handle,n2,(const cuComplex *)&scale,(cuComplex *)d_As,one);
1490:   SlepcLogGpuFlopsComplex(1.0*n2);

1492:   /* evaluate Pade approximant (partial fraction or product form) */
1493:   if (fn->method==3 || !m) { /* partial fraction */
1494:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
1495:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1496:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1497:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
1498:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
1499:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

1501:     cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2);
1502: #if !defined(PETSC_USE_COMPLEX)
1503:     isreal = PETSC_TRUE;
1504: #else
1505:     getisreal_array2D(n,n,d_Maux,n,d_isreal);
1506:     cudaMemcpy(&isreal,d_isreal,sizeof(PetscBool),cudaMemcpyDeviceToHost);
1507: #endif
1508:     if (isreal) {
1509:       rsizediv2 = irsize/2;
1510:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
1511:         cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1512:         cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1513:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[2*i]),-PetscImaginaryPartComplex(p[2*i]));
1514:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[2*i]),PetscImaginaryPartComplex(r[2*i]));
1515:         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1516:         add_array2D_Conj(n,n,d_RR,n);
1517:         cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1518:         /* shift(n) + gesv + axpy(n2) */
1519:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
1520:       }

1522:       mod = ipsize % 2;
1523:       if (mod) {
1524:         cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1525:         cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1526:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[ipsize-1]),-PetscImaginaryPartComplex(p[ipsize-1]));
1527:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[irsize-1]),PetscImaginaryPartComplex(r[irsize-1]));
1528:         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1529:         cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1530:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1531:       }
1532:     } else { /* complex */
1533:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
1534:         cudaMemcpy(d_Maux,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1535:         cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1536:         shift_Cdiagonal(n,d_Maux,n,-PetscRealPartComplex(p[i]),-PetscImaginaryPartComplex(p[i]));
1537:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(r[i]),PetscImaginaryPartComplex(r[i]));
1538:         PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_Maux,n,piv,d_RR,n);
1539:         cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1540:         SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
1541:       }
1542:     }
1543:     for (i=0;i<iremainsize;i++) {
1544:       if (!i) {
1545:         cudaMemset(d_RR,0,sizeof(PetscComplex)*n2);
1546:         set_Cdiagonal(n,d_RR,n,PetscRealPartComplex(remainterm[iremainsize-1]),PetscImaginaryPartComplex(remainterm[iremainsize-1]));
1547:       } else {
1548:         cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1549:         for (j=1;j<i;j++) {
1550:           cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_RR,n,&czero,d_Maux,n);
1551:           SWAP(d_RR,d_Maux,aux);
1552:           SlepcLogGpuFlopsComplex(2.0*n*n*n);
1553:         }
1554:         cublasXCscal(cublasv2handle,n2,&remainterm[iremainsize-1-i],d_RR,one);
1555:         SlepcLogGpuFlopsComplex(1.0*n2);
1556:       }
1557:       cublasXCaxpy(cublasv2handle,n2,&cone,d_RR,one,d_expmA,one);
1558:       SlepcLogGpuFlopsComplex(1.0*n2);
1559:     }
1560:     PetscFree3(r,p,remainterm);
1561:   } else { /* product form, default */
1562:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
1563:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
1564:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
1565:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
1566:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

1568:     cudaMemset(d_expmA,0,sizeof(PetscComplex)*n2);
1569:     set_Cdiagonal(n,d_expmA,n,rone,rzero); /* initialize */
1570:     minlen = PetscMin(irsize,ipsize);
1571:     for (i=0;i<minlen;i++) {
1572:       cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1573:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1574:       cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);
1575:       SWAP(d_expmA,d_Maux,aux);
1576:       cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1577:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1578:       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1579:       /* shift(n) + gemm + shift(n) + gesv */
1580:       SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1581:     }
1582:     /* extra enumerator */
1583:     for (i=minlen;i<irsize;i++) {
1584:       cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1585:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootp[i]),-PetscImaginaryPartComplex(rootp[i]));
1586:       cublasXCgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&cone,d_RR,n,d_expmA,n,&czero,d_Maux,n);
1587:       SWAP(d_expmA,d_Maux,aux);
1588:       SlepcLogGpuFlopsComplex(1.0*n+2.0*n*n*n);
1589:     }
1590:     /* extra denominator */
1591:     for (i=minlen;i<ipsize;i++) {
1592:       cudaMemcpy(d_RR,d_As,sizeof(PetscComplex)*n2,cudaMemcpyDeviceToDevice);
1593:       shift_Cdiagonal(n,d_RR,n,-PetscRealPartComplex(rootq[i]),-PetscImaginaryPartComplex(rootq[i]));
1594:       PetscCallMAGMA(magma_Cgesv_gpu,n,n,d_RR,n,piv,d_expmA,n);
1595:       SlepcLogGpuFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
1596:     }
1597:     cublasXCscal(cublasv2handle,n2,&mult,d_expmA,one);
1598:     SlepcLogGpuFlopsComplex(1.0*n2);
1599:     PetscFree2(rootp,rootq);
1600:   }

1602: #if !defined(PETSC_USE_COMPLEX)
1603:   copy_array2D_C2S(n,n,d_Ba2,n,d_expmA,n);
1604: #else
1605:   cudaMemcpy(d_Ba2,d_expmA,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1606: #endif

1608:   /* perform repeated squaring */
1609:   for (i=0;i<s;i++) { /* final squaring */
1610:     cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_Ba2,n,d_Ba2,n,&szero,d_sMaux,n);
1611:     SWAP(d_Ba2,d_sMaux,saux);
1612:     PetscLogGpuFlops(2.0*n*n*n);
1613:   }
1614:   if (d_Ba2!=d_Ba) {
1615:     cudaMemcpy(d_Ba,d_Ba2,sizeof(PetscScalar)*n2,cudaMemcpyDeviceToDevice);
1616:     d_sMaux = d_Ba2;
1617:   }
1618:   if (shift) {
1619:     expshift = PetscExpReal(shift);
1620:     cublasXscal(cublasv2handle,n2,&expshift,d_Ba,one);
1621:     PetscLogGpuFlops(1.0*n2);
1622:   }

1624:   PetscLogGpuTimeEnd();

1626:   /* restore pointers */
1627:   d_Maux = d_Maux2; d_expmA = d_expmA2; d_RR = d_RR2;
1628:   MatDenseCUDARestoreArrayWrite(B,&d_Ba);
1629:   cudaFree(d_isreal);
1630:   cudaFree(d_sMaux);
1631:   cudaFree(d_Maux);
1632:   cudaFree(d_expmA);
1633:   cudaFree(d_As);
1634:   cudaFree(d_RR);
1635:   PetscFree(piv);
1636:   return 0;
1637: }
1638: #endif /* PETSC_HAVE_MAGMA */
1639: #endif /* PETSC_HAVE_CUDA */

1641: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1642: {
1643:   PetscBool      isascii;
1644:   char           str[50];
1645:   const char     *methodname[] = {
1646:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
1647:                   "scaling & squaring, [6/6] Pade approximant",
1648:                   "scaling & squaring, subdiagonal Pade approximant (product form)",
1649:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
1650:   };
1651:   const int      nmeth=PETSC_STATIC_ARRAY_LENGTH(methodname);

1653:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1654:   if (isascii) {
1655:     if (fn->beta==(PetscScalar)1.0) {
1656:       if (fn->alpha==(PetscScalar)1.0) PetscViewerASCIIPrintf(viewer,"  exponential: exp(x)\n");
1657:       else {
1658:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1659:         PetscViewerASCIIPrintf(viewer,"  exponential: exp(%s*x)\n",str);
1660:       }
1661:     } else {
1662:       SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
1663:       if (fn->alpha==(PetscScalar)1.0) PetscViewerASCIIPrintf(viewer,"  exponential: %s*exp(x)\n",str);
1664:       else {
1665:         PetscViewerASCIIPrintf(viewer,"  exponential: %s",str);
1666:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1667:         SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
1668:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1669:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1670:       }
1671:     }
1672:     if (fn->method<nmeth) PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
1673:   }
1674:   return 0;
1675: }

1677: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1678: {
1679:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
1680:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
1681:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1682:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1683:   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1684:   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1685: #if defined(PETSC_HAVE_CUDA)
1686:   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDA;
1687: #if defined(PETSC_HAVE_MAGMA)
1688:   fn->ops->evaluatefunctionmatcuda[0] = FNEvaluateFunctionMat_Exp_Higham_CUDAm;
1689:   fn->ops->evaluatefunctionmatcuda[1] = FNEvaluateFunctionMat_Exp_Pade_CUDAm;
1690:   fn->ops->evaluatefunctionmatcuda[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* product form */
1691:   fn->ops->evaluatefunctionmatcuda[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa_CUDAm; /* partial fraction */
1692: #endif
1693: #endif
1694:   fn->ops->view                   = FNView_Exp;
1695:   return 0;
1696: }