/* bvlapack.c — SLEPc 3.18.0 (2022-10-01) */
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV private kernels that use the LAPACK
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>

 17: /*
 18:     Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
 19: */
 20: SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
 21: {
 22:   PetscBLASInt i,n=*len;
 23:   PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

 25:   if (PetscUnlikely(*datatype!=MPIU_REAL)) {
 26:     (*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
 27:     MPI_Abort(PETSC_COMM_WORLD,1);
 28:   }
 29:   for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
 30:   return;
 31: }

/*
    Compute ||A|| for an m_ x n_ column-major matrix A.
    type selects Frobenius/2-norm, 1-norm, or infinity norm.
    If mpi is true, A is the local block of a distributed matrix and the result
    is reduced over the BV's communicator with the reduction appropriate for
    each norm type.
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt   m,n,i,j;
  PetscMPIInt    len;
  PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

  PetscFPTrapPush(PETSC_FP_TRAP_OFF);  /* LAPACK may raise harmless FP exceptions */
  PetscBLASIntCast(m_,&m);
  PetscBLASIntCast(n_,&n);
  if (type==NORM_FROBENIUS || type==NORM_2) {
    /* local Frobenius norm; global value combined as sqrt(sum of squares) via MPIU_LAPY2 */
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
    else *nrm = lnrm;
    PetscLogFlops(2.0*m*n);
  } else if (type==NORM_1) {
    if (mpi) {
      /* 1-norm = max column sum of |a_ij|; the column sums must be summed across
         processes BEFORE taking the max, so they are computed by hand here
         instead of calling LAPACKlange */
      BVAllocateWork_Private(bv,2*n_);
      rwork = (PetscReal*)bv->work;    /* local column sums */
      rwork2 = rwork+n_;               /* globally reduced column sums */
      PetscArrayzero(rwork,n_);
      PetscArrayzero(rwork2,n_);
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*m_]);
        }
      }
      PetscMPIIntCast(n_,&len);
      MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      /* "O" (one-norm) needs no workspace, so rwork may stay NULL */
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
    }
    PetscLogFlops(1.0*m*n);
  } else if (type==NORM_INFINITY) {
    /* inf-norm = max row sum; local maxima combine with a plain MPI max */
    BVAllocateWork_Private(bv,m_);
    rwork = (PetscReal*)bv->work;   /* lange workspace of size m for "I" */
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
    else *nrm = lnrm;
    PetscLogFlops(1.0*m*n);
  }
  PetscFPTrapPop();
  return 0;
}

/*
    Normalize the columns of an mxn matrix A (m_ local rows, n_ columns).
    In real arithmetic, eigi (if non-NULL) marks complex conjugate eigenvector
    pairs: eigi[j]!=0 means columns j and j+1 hold the real and imaginary parts
    of one complex vector and are normalized jointly by their combined norm.
    If mpi is true, local column norms are reduced over the BV's communicator.
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt   m,j,k,info,zero=0;
  PetscMPIInt    len;
  PetscReal      *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscBLASIntCast(m_,&m);
  BVAllocateWork_Private(bv,2*n_);
  rwork = (PetscReal*)bv->work;   /* local column norms */
  rwork2 = rwork+n_;              /* lange scratch, later the reduced norms */
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;   /* conjugate pair: norm over columns j and j+1 */
#endif
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }   /* both columns of a pair share the norm */
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscMPIIntCast(n_,&len);
    PetscArrayzero(rwork2,n_);
    /* MPIU_LAPY2 combines per-process Frobenius norms as sqrt(x^2+y^2) */
    MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    /* lascl with cfrom=norms[j], cto=1: multiplies the m x k block by 1/norms[j] */
    PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;
  }
  PetscLogFlops(3.0*m*n_);
  PetscFPTrapPop();
  return 0;
}

/*
   Compute the upper Cholesky factor in R and its inverse in S.
   If S == R then the inverse overwrites the Cholesky factor.
   Only the active block (rows/columns bv->l .. bv->k-1) is processed.
 */
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_ = 0,m_ = 0,ld_,lds_;

  l = bv->l;              /* first active column */
  k = bv->k;              /* one past the last active column */
  MatGetSize(R,&m,NULL);
  n = k-l;                /* dimension of the active block */
  PetscBLASIntCast(m,&m_);
  PetscBLASIntCast(n,&n_);
  ld  = m;                /* leading dimension of R (column-major dense) */
  ld_ = m_;
  MatDenseGetArray(R,&pR);

  if (S==R) {
    /* in-place: use bv->work as scratch holding a backup copy of the block */
    BVAllocateWork_Private(bv,m*k);
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    MatDenseGetArray(S,&pS);
    MatGetSize(S,&lds,NULL);
    PetscBLASIntCast(lds,&lds_);
  }

  /* save a copy of matrix in S (needed to retry if the factorization fails) */
  for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);

  /* compute upper Cholesky factor in R */
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscLogFlops((1.0*n*n*n)/3.0);

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n);
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;   /* tiny shift to push eigenvalues positive */
    }
    PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscLogFlops((1.0*n*n*n)/3.0);
  }

  /* compute S = inv(R) */
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    /* NOTE(review): zeroes (k-l)*k contiguous entries from column l; this matches
       full columns l..k-1 exactly when lds==k — confirm intent for lds>k */
    PetscArrayzero(pS+l*lds,(k-l)*k);
    for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscFPTrapPop();
  PetscLogFlops(0.33*n*n*n);

  /* Zero out entries below the diagonal (potrf/trtri leave them untouched) */
  for (i=l;i<k-1;i++) {
    PetscArrayzero(pR+i*ld+i+1,(k-i-1));
    if (S!=R) PetscArrayzero(pS+i*lds+i+1,(k-i-1));
  }
  MatDenseRestoreArray(R,&pR);
  if (S!=R) MatDenseRestoreArray(S,&pS);
  return 0;
}

/*
   Compute the inverse of an upper triangular matrix R, store it in S.
   If S == R then the inverse overwrites R.
   Only the active block (rows/columns bv->l .. bv->k-1) is processed.
 */
PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_,m_ = 0,ld_,lds_;

  l = bv->l;              /* first active column */
  k = bv->k;              /* one past the last active column */
  MatGetSize(R,&m,NULL);
  n = k-l;                /* dimension of the active block */
  PetscBLASIntCast(m,&m_);
  PetscBLASIntCast(n,&n_);
  ld  = m;                /* leading dimension of R (column-major dense) */
  ld_ = m_;
  MatDenseGetArray(R,&pR);

  if (S==R) {
    /* in-place: bv->work is allocated but trtri below operates directly on pR */
    BVAllocateWork_Private(bv,m*k);
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    MatDenseGetArray(S,&pS);
    MatGetSize(S,&lds,NULL);
    PetscBLASIntCast(lds,&lds_);
  }

  /* compute S = inv(R) */
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    /* NOTE(review): zeroes (k-l)*k contiguous entries from column l; exact
       column coverage assumes lds==k — confirm for lds>k */
    PetscArrayzero(pS+l*lds,(k-l)*k);
    for (i=l;i<k;i++) PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscFPTrapPop();
  PetscLogFlops(0.33*n*n*n);

  MatDenseRestoreArray(R,&pR);
  if (S!=R) MatDenseRestoreArray(S,&pS);
  return 0;
}

/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.
   Only the active block (rows/columns bv->l .. bv->k-1) is processed.
 */
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,j,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS,*D,*work,a;
  PetscReal      *eig,dummy;
  PetscBLASInt   info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal      *rwork,rdummy;   /* heev-style real workspace for complex scalars */
#endif

  l = bv->l;              /* first active column */
  k = bv->k;              /* one past the last active column */
  MatGetSize(R,&m,NULL);
  MatDenseGetLDA(R,&ld);
  n = k-l;                /* dimension of the active block */
  PetscBLASIntCast(m,&m_);
  PetscBLASIntCast(n,&n_);
  ld_ = m_;
  MatDenseGetArray(R,&pR);

  if (S==R) {
    pS = pR;              /* operate in place */
    lds = ld;
    lds_ = ld_;
  } else {
    MatDenseGetArray(S,&pS);
    MatDenseGetLDA(S,&lds);
    PetscBLASIntCast(lds,&lds_);
  }
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);

  /* workspace query (lwork=-1) and memory allocation */
  lwork = -1;
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);   /* optimal lwork returned in a */
  PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork);
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscBLASIntCast((PetscInt)a,&lwork);
  PetscMalloc3(n,&eig,n,&D,lwork,&work);
#endif

  /* copy and scale matrix: S = D*R*D with D = diag(R)^{-1/2} */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];   /* scale rows */
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];             /* scale columns */

  /* compute eigendecomposition D*R*D = U*Lambda*U' (U overwrites the block of S) */
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {   /* R = U' (saved before S is overwritten, used to build inv(S) below) */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscFree4(eig,D,work,rwork);
#else
  PetscFree3(eig,D,work);
#endif
  PetscLogFlops(9.0*n*n*n);
  PetscFPTrapPop();

  MatDenseRestoreArray(R,&pR);
  if (S!=R) MatDenseRestoreArray(S,&pS);
  return 0;
}

/*
    QR factorization of an mxn matrix via parallel TSQR (tall-skinny QR).
    Each process factors its local block, then triangular factors are combined
    pairwise up a binary tree of ceil(log2(size)) levels; finally the local Q
    is updated by accumulating the tree's orthogonal factors back down.
    On exit Q holds the (distributed) orthogonal factor; if R is non-NULL it
    receives the nxn triangular factor (leading dimension ldr).
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
  PetscBLASInt   m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
  PetscMPIInt    rank,size,count,stride;
  MPI_Datatype   tmat;

  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscBLASIntCast(m_,&m);
  PetscBLASIntCast(n_,&n);
  k  = PetscMin(m,n);
  nb = 16;                         /* block size used to size the LAPACK workspace */
  lda = 2*n;                       /* stacked pair of nxn triangles: 2n rows */
  MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
  MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
  nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));    /* tree depth */
  powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));  /* largest power of 2 <= size */
  worklen = n+n*nb;                                 /* tau + geqrf/orgqr workspace */
  if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda; /* A + per-level QQ + gemm buffer C */
  BVAllocateWork_Private(bv,worklen);
  tau  = bv->work;
  work = bv->work+n;
  PetscBLASIntCast(n*nb,&lwork);
  if (nlevels) {
    A  = bv->work+n+n*nb;                 /* stacked triangular factors (lda x n) */
    QQ = bv->work+n+n*nb+n*lda;           /* orthogonal factor of each tree level */
    C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;  /* scratch for gemm products */
  }

  /* Compute QR of the local block (factored form: Householder vectors + tau) */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  /* Extract R (into A when the tree reduction still has to combine it) */
  if (R || nlevels) {
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) {
        if (nlevels) A[i+j*lda] = Q[i+j*m];
        else R[i+j*ldr] = Q[i+j*m];
      }
      for (i=PetscMin(j,m-1)+1;i<n;i++) {   /* zero below the diagonal */
        if (nlevels) A[i+j*lda] = 0.0;
        else R[i+j*ldr] = 0.0;
      }
    }
  }

  /* Compute orthogonal matrix in Q (expand the Householder factorization) */
  PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("orgqr",info);

  if (nlevels) {

    /* datatype describing one nxn triangular block stored with stride lda */
    PetscMPIIntCast(n,&count);
    PetscMPIIntCast(lda,&stride);
    PetscBLASIntCast(lda,&l);
    MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat);
    MPI_Type_commit(&tmat);

    for (level=nlevels;level>=1;level--) {

      plevel = PetscPowInt(2,level);
      /* s = tree partner of this rank at the current level */
      PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);

      /* Stack triangular matrices: both partners end up with [top;bottom] in A */
      if (rank<s && s<size) {  /* send top part, receive bottom part */
        MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
      } else if (s<size) {  /* copy top to bottom, receive top part */
        MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
        MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
      }
      if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
        if (rank<size-powtwo) {  /* send bottom part */
          MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv));
        } else if (rank>=powtwo) {  /* receive bottom part */
          MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);
        }
      }
      /* Compute QR of the stacked pair and save this level's orthogonal matrix */
      if (level<nlevels || (level==nlevels && s<size)) {
        PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("geqrf",info);
        PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda);
        PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("orgqr",info);
        for (j=0;j<n;j++) {
          for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;   /* keep only the new R in A */
        }
      } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
        PetscArrayzero(QQ+(level-1)*n*lda,n*lda);
        for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
      }
    }

    /* Extract R (now the global triangular factor, replicated in A) */
    if (R) {
      for (j=0;j<n;j++) {
        for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
        for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
      }
    }

    /* Accumulate orthogonal matrices: walk the tree bottom-up multiplying each
       level's Q into the next; the last level multiplies into the local Q */
    for (level=1;level<=nlevels;level++) {
      plevel = PetscPowInt(2,level);
      PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
      /* pick top or bottom half of the stacked Q, depending on this rank's tree role */
      Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
      if (level<nlevels) {
        PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
        PetscArraycpy(QQ+level*n*lda,C,n*lda);
      } else {
        /* apply to the local mxn Q in row panels of height l (=lda) */
        for (i=0;i<m/l;i++) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscArraycpy(Q+i*l+j*m,C+j*l,l);
        }
        sz = m%l;     /* remaining rows that do not fill a full panel */
        if (sz) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz);
        }
      }
    }

    MPI_Type_free(&tmat);
  }

  PetscLogFlops(3.0*m*n*n);
  PetscFPTrapPop();
  return 0;
}

471: /*
472:     Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
473:     all matrices are upper triangular stored in packed format
474: */
475: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
476: {
477:   PetscBLASInt   n,i,j,k,one=1;
478:   PetscMPIInt    tsize;
479:   PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
480:   PetscReal      c;

482:   PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize);  /* we assume len=1 */
483:   tsize /= sizeof(PetscScalar);
484:   n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
485:   for (j=0;j<n;j++) {
486:     for (i=0;i<=j;i++) {
487:       LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
488:       R1[(2*n-j-1)*j/2+j] = v;
489:       k = n-j-1;
490:       if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
491:     }
492:   }
493:   return;
494: }

/*
    Computes only the R factor of the QR factorization of an mxn matrix via
    parallel TSQR. Q is NOT modified (the factorization is done on a copy);
    R (leading dimension ldr) receives the nxn global triangular factor,
    replicated on every process.
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt       worklen;
  PetscBLASInt   m,n,i,j,s,nb,lwork,info;
  PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
  PetscMPIInt    size,count;
  MPI_Datatype   tmat;

  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscBLASIntCast(m_,&m);
  PetscBLASIntCast(n_,&n);
  nb = 16;           /* block size used to size the LAPACK workspace */
  s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
  MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
  worklen = n+n*nb+2*s+m*n;   /* tau + geqrf work + R1 + R2 + copy of Q */
  BVAllocateWork_Private(bv,worklen);
  tau  = bv->work;
  work = bv->work+n;
  R1   = bv->work+n+n*nb;       /* local R, packed row-wise */
  R2   = bv->work+n+n*nb+s;     /* globally reduced R, packed */
  A    = bv->work+n+n*nb+2*s;   /* scratch copy so Q is left untouched */
  PetscBLASIntCast(n*nb,&lwork);
  PetscArraycpy(A,Q,m*n);

  /* Compute QR of the local copy */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  if (size==1) {
    /* Extract R directly; no reduction needed in the sequential case */
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
      for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
    }
  } else {
    /* Use MPI reduction operation (MPIU_TSQR -> SlepcGivensPacked) to obtain global R */
    PetscMPIIntCast(s,&count);
    MPI_Type_contiguous(count,MPIU_SCALAR,&tmat);
    MPI_Type_commit(&tmat);
    /* pack the local upper triangle row-wise into R1 (rows beyond m are zero) */
    for (i=0;i<n;i++) {
      for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
    }
    MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv));
    /* unpack the reduced triangle into the dense output R */
    for (i=0;i<n;i++) {
      for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
      for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
    }
    MPI_Type_free(&tmat);
  }

  PetscLogFlops(3.0*m*n*n);
  PetscFPTrapPop();
  return 0;
}