Add new files for Householder reconstruction functions from 3.9.1

4 years ago · 4c1d47098b
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -135,14 +135,14 @@ SLASRC_O = \
   slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
   slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
   slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
   slarf.o  slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
   slarf.o  slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
   slarrv.o slartv.o \
   slarz.o  slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
   slasyf_rk.o \
   slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \
   slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
   sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
   sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \
   sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \
   sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
   sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
   spbstf.o spbsv.o  spbsvx.o \
@@ -181,7 +181,7 @@ SLASRC_O = \
   sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \
   stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \
   sgelqt.o sgelqt3.o sgemlqt.o \
   sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
   sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
   sgelq.o slaswlq.o slamswlq.o sgemlq.o \
   stplqt.o stplqt2.o stpmlqt.o \
   sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \
@@ -250,7 +250,7 @@ CLASRC_O = \
   claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
   claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
   claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
   clarf.o  clarfb.o clarfg.o clarft.o clarfgp.o \
   clarf.o  clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
   clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
   clarz.o  clarzb.o clarzt.o clascl.o claset.o clasr.o  classq.o \
   claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -278,7 +278,7 @@ CLASRC_O = \
   ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
   ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
   cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
   cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
   cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
   cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \
   cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \
   chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \
@@ -342,14 +342,14 @@ DLASRC_O = \
   dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
   dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
   dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
   dlarf.o  dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
   dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
   dlargv.o dlarrv.o dlartv.o \
   dlarz.o  dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
   dlasyf.o dlasyf_rook.o dlasyf_rk.o \
   dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \
   dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \
   dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \
   dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \
   dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \
   dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \
   dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \
   dpbstf.o dpbsv.o  dpbsvx.o \
@@ -389,7 +389,7 @@ DLASRC_O = \
   dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \
   dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \
   dgelqt.o dgelqt3.o dgemlqt.o \
   dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
   dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
   dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \
   dtplqt.o dtplqt2.o dtpmlqt.o \
   dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \
@@ -455,7 +455,7 @@ ZLASRC_O = \
   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \
   zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
   zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
   zlarcm.o zlarf.o  zlarfb.o \
   zlarcm.o zlarf.o  zlarfb.o zlarfb_gett.o \
   zlarfg.o zlarft.o zlarfgp.o \
   zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
   zlarz.o  zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
@@ -484,7 +484,7 @@ ZLASRC_O = \
   ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \
   ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \
   zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \
   zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
   zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
   zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \
   zunmtr.o zupgtr.o \
   zupmtr.o izmax1.o dzsum1.o zstemr.o \
@@ -498,7 +498,7 @@ ZLASRC_O = \
   ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \
   ztplqt.o ztplqt2.o ztpmlqt.o \
   zgelqt.o zgelqt3.o zgemlqt.o \
   zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
   zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
   zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \
   zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \
   zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
--- a/lapack-netlib/SRC/cgetsqrhrt.f
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@@ -0,0 +1,349 @@
 *> \brief \b CGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in CGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of CGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is COMPLEX array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup comlpexOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, REAL, CMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for CLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for CUNGTSQR_ROW;
 *        d) Diagonal D for CUNHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL

            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of CUNGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *

      CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
            DO J = I, N
               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = CMPLX( LWORKOPT )
      RETURN
 *
 *     End of CGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/clarfb_gett.f
+++ b/lapack-netlib/SRC/clarfb_gett.f
@@ -0,0 +1,597 @@
 *> \brief \b CLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/clarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *>
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CLARFB_GETT applies a complex Householder block reflector H from the
 *> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is COMPLEX array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**H)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**H)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*A1_in
 *>
 *>       B1_out: = - V2*T*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**H) * W2
 *>                      = (unit_lower_tr_of_(A1)**H) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**H) * W1
 *>                    = (unit_lower_tr_of_(A1)**H) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CCOPY, CGEMM, CTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO

         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
     $                   WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = CZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of CLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/cungtsqr_row.f
+++ b/lapack-netlib/SRC/cungtsqr_row.f
@@ -0,0 +1,380 @@
 *> \brief \b CUNGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download CUNGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cunrgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *>
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
 *> orthonormal columns from the output of CLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by CLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of CLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine CLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which CLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by CLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by CLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by CLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See CLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See CLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complexOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX            CONE, CZERO
      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      COMPLEX            DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           CLARFB_GETT, CLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CUNGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = CMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL CLASET('U', M, N, CZERO, CONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )

         END IF
 *
      END DO
 *
      WORK( 1 ) = CMPLX( LWORKOPT )
      RETURN
 *
 *     End of CUNGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/dgetsqrhrt.f
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@@ -0,0 +1,349 @@
 *> \brief \b DGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a real M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in DGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of DGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE
      PARAMETER          ( ONE = 1.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for DLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for DORGTSQR_ROW;
 *        d) Diagonal D for DORHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL

            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of DORGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *

      CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN
            DO J = I, N
               A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = DBLE( LWORKOPT )
      RETURN
 *
 *     End of DGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/dlarfb_gett.f
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@@ -0,0 +1,596 @@
 *> \brief \b DLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DLARFB_GETT applies a real Householder block reflector H from the
 *> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is DOUBLE PRECISION array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**T)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**T)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T**A1_in
 *>
 *>       B1_out: = - V2*T**A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**T) * W2
 *>                      = (unit_lower_tr_of_(A1)**T) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**T) * W1
 *>                    = (unit_lower_tr_of_(A1)**T) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DCOPY, DGEMM, DTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO

         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
     $                   WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = ZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of DLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/dorgtsqr_row.f
+++ b/lapack-netlib/SRC/dorgtsqr_row.f
@@ -0,0 +1,379 @@
 *> \brief \b DORGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download DORGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> DORGTSQR_ROW generates an M-by-N real matrix Q_out with
 *> orthonormal columns from the output of DLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by DLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of DLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine DLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which DLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by DLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by DLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by DLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See DLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is DOUBLE PRECISION array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See DLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup doubleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      DOUBLE PRECISION   ONE, ZERO
      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      DOUBLE PRECISION   DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           DLARFB_GETT, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          DBLE, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DORGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DBLE( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL DLASET('U', M, N, ZERO, ONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )

         END IF
 *
      END DO
 *
      WORK( 1 ) = DBLE( LWORKOPT )
      RETURN
 *
 *     End of DORGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/sgetsqrhrt.f
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -0,0 +1,349 @@
 *> \brief \b SGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in SGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of SGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is REAL array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup singleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE
      PARAMETER          ( ONE = 1.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for SLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for SORGTSQR_ROW;
 *        d) Diagonal D for SORHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL

            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of SORGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *

      CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN
            DO J = I, N
               A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = REAL( LWORKOPT )
      RETURN
 *
 *     End of SGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/slarfb_gett.f
+++ b/lapack-netlib/SRC/slarfb_gett.f
@@ -0,0 +1,596 @@
 *> \brief \b SLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       REAL               A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SLARFB_GETT applies a real Householder block reflector H from the
 *> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is REAL array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is REAL array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup singleOTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**T)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**T)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T**A1_in
 *>
 *>       B1_out: = - V2*T**A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**T) * W2
 *>                      = (unit_lower_tr_of_(A1)**T) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**T) * W1
 *>                    = (unit_lower_tr_of_(A1)**T) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      REAL               A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE, ZERO
      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SCOPY, SGEMM, STRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO

         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
     $                   WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = ZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of SLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/sorgtsqr_row.f
+++ b/lapack-netlib/SRC/sorgtsqr_row.f
@@ -0,0 +1,379 @@
 *> \brief \b SORGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download SORGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sorgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> SORGTSQR_ROW generates an M-by-N real matrix Q_out with
 *> orthonormal columns from the output of SLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by SLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of SLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine SLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which SLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by SLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by SLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by SLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See SLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is REAL array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See SLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup sigleOTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      REAL               ONE, ZERO
      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      REAL               DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           SLARFB_GETT, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          REAL, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SORGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = REAL( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL SLASET('U', M, N, ZERO, ONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )

         END IF
 *
      END DO
 *
      WORK( 1 ) = REAL( LWORKOPT )
      RETURN
 *
 *     End of SORGTSQR_ROW
 *
      END
--- a/lapack-netlib/SRC/zgetsqrhrt.f
+++ b/lapack-netlib/SRC/zgetsqrhrt.f
@@ -0,0 +1,349 @@
 *> \brief \b ZGETSQRHRT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZGETSQRHRT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgetsqrhrt.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
 *      $                       LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization
 *> of a complex M-by-N matrix A with M >= N,
 *>
 *>    A = Q * R.
 *>
 *> The routine uses internally a NB1-sized column blocked and MB1-sized
 *> row blocked TSQR-factorization and perfors the reconstruction
 *> of the Householder vectors from the TSQR output. The routine also
 *> converts the R_tsqr factor from the TSQR-factorization output into
 *> the R factor that corresponds to the Householder QR-factorization,
 *>
 *>    A = Q_tsqr * R_tsqr = Q * R.
 *>
 *> The output Q and R factors are stored in the same format as in ZGEQRT
 *> (Q is in blocked compact WY-representation). See the documentation
 *> of ZGEQRT for more details on the format.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB1
 *> \verbatim
 *>          MB1 is INTEGER
 *>          The row block size to be used in the blocked TSQR.
 *>          MB1 > N.
 *> \endverbatim
 *>
 *> \param[in] NB1
 *> \verbatim
 *>          NB1 is INTEGER
 *>          The column block size to be used in the blocked TSQR.
 *>          N >= NB1 >= 1.
 *> \endverbatim
 *>
 *> \param[in] NB2
 *> \verbatim
 *>          NB2 is INTEGER
 *>          The block size to be used in the blocked QR that is
 *>          output. NB2 >= 1.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry: an M-by-N matrix A.
 *>
 *>          On exit:
 *>           a) the elements on and above the diagonal
 *>              of the array contain the N-by-N upper-triangular
 *>              matrix R corresponding to the Householder QR;
 *>           b) the elements below the diagonal represent Q by
 *>              the columns of blocked V (compact WY-representation).
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] T
 *> \verbatim
 *>          T is COMPLEX*16 array, dimension (LDT,N))
 *>          The upper triangular block reflectors stored in compact form
 *>          as a sequence of upper triangular blocks.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.  LDT >= NB2.
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
 *>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup comlpex16OTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
     $                       LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          CEILING, DBLE, DCMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB1.LE.N ) THEN
         INFO = -3
      ELSE IF( NB1.LT.1 ) THEN
         INFO = -4
      ELSE IF( NB2.LT.1 ) THEN
         INFO = -5
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -7
      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
         INFO = -9
      ELSE
 *
 *        Test the input LWORK for the dimension of the array WORK.
 *        This workspace is used to store array:
 *        a) Matrix T and WORK for ZLATSQR;
 *        b) N-by-N upper-triangular factor R_tsqr;
 *        c) Matrix T and array WORK for ZUNGTSQR_ROW;
 *        d) Diagonal D for ZUNHR_COL.
 *
         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
            INFO = -11
         ELSE
 *
 *           Set block size for column blocks
 *
            NB1LOCAL = MIN( NB1, N )
 *
            NUM_ALL_ROW_BLOCKS = MAX( 1,
     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
 *
 *           Length and leading dimension of WORK array to place
 *           T array in TSQR.
 *
            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL

            LDWT = NB1LOCAL
 *
 *           Length of TSQR work array
 *
            LW1 = NB1LOCAL * N
 *
 *           Length of ZUNGTSQR_ROW work array.
 *
            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
 *
            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
               INFO = -11
            END IF
 *
         END IF
      END IF
 *
 *     Handle error in the input parameters and return workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZGETSQRHRT', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
      NB2LOCAL = MIN( NB2, N )
 *
 *
 *     (1) Perform TSQR-factorization of the M-by-N matrix A.
 *
      CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $              WORK(LWT+1), LW1, IINFO )
 *
 *     (2) Copy the factor R_tsqr stored in the upper-triangular part
 *         of A into the square matrix in the work array
 *         WORK(LWT+1:LWT+N*N) column-by-column.
 *
      DO J = 1, N
         CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
      END DO
 *
 *     (3) Generate a M-by-N matrix Q with orthonormal columns from
 *     the result stored below the diagonal in the array A in place.
 *

      CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
     $                   WORK( LWT+N*N+1 ), LW2, IINFO )
 *
 *     (4) Perform the reconstruction of Householder vectors from
 *     the matrix Q (stored in A) in place.
 *
      CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT,
     $                WORK( LWT+N*N+1 ), IINFO )
 *
 *     (5) Copy the factor R_tsqr stored in the square matrix in the
 *     work array WORK(LWT+1:LWT+N*N) into the upper-triangular
 *     part of A.
 *
 *     (6) Compute from R_tsqr the factor R_hr corresponding to
 *     the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr.
 *     This multiplication by the sign matrix S on the left means
 *     changing the sign of I-th row of the matrix R_tsqr according
 *     to sign of the I-th diagonal element DIAG(I) of the matrix S.
 *     DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output.
 *
 *     (5) and (6) can be combined in a single loop, so the rows in A
 *     are accessed only once.
 *
      DO I = 1, N
         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
            DO J = I, N
               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
            END DO
         ELSE
            CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
         END IF
      END DO
 *
      WORK( 1 ) = DCMPLX( LWORKOPT )
      RETURN
 *
 *     End of ZGETSQRHRT
 *
      END
--- a/lapack-netlib/SRC/zlarfb_gett.f
+++ b/lapack-netlib/SRC/zlarfb_gett.f
@@ -0,0 +1,597 @@
 *> \brief \b ZLARFB_GETT
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZLARFB_GETT + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlarfb_gett.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
 *      $                        WORK, LDWORK )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          IDENT
 *       INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16         A( LDA, * ), B( LDB, * ), T( LDT, * ),
 *      $                   WORK( LDWORK, * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZLARFB_GETT applies a complex Householder block reflector H from the
 *> left to a complex (K+M)-by-N  "triangular-pentagonal" matrix
 *> composed of two block matrices: an upper trapezoidal K-by-N matrix A
 *> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
 *> in the array B. The block reflector H is stored in a compact
 *> WY-representation, where the elementary reflectors are in the
 *> arrays A, B and T. See Further Details section.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] IDENT
 *> \verbatim
 *>          IDENT is CHARACTER*1
 *>          If IDENT = not 'I', or not 'i', then V1 is unit
 *>             lower-triangular and stored in the left K-by-K block of
 *>             the input matrix A,
 *>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
 *>             not stored.
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix B.
 *>          M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrices A and B.
 *>          N >= 0.
 *> \endverbatim
 *>
 *> \param[in] K
 *> \verbatim
 *>          K is INTEGER
 *>          The number or rows of the matrix A.
 *>          K is also order of the matrix T, i.e. the number of
 *>          elementary reflectors whose product defines the block
 *>          reflector. 0 <= K <= N.
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX*16 array, dimension (LDT,K)
 *>          The upper-triangular K-by-K matrix T in the representation
 *>          of the block reflector.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T. LDT >= K.
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry:
 *>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
 *>           b) In the columns below the diagonal: columns of V1
 *>              (ones are not stored on the diagonal).
 *>
 *>          On exit:
 *>            A is overwritten by rectangular K-by-N product H*A.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array A. LDA >= max(1,K).
 *> \endverbatim
 *>
 *> \param[in,out] B
 *> \verbatim
 *>          B is COMPLEX*16 array, dimension (LDB,N)
 *>
 *>          On entry:
 *>            a) In the M-by-(N-K) right block: input matrix B.
 *>            b) In the M-by-N left block: columns of V2.
 *>
 *>          On exit:
 *>            B is overwritten by rectangular M-by-N product H*B.
 *>
 *>          See Further Details section.
 *> \endverbatim
 *>
 *> \param[in] LDB
 *> \verbatim
 *>          LDB is INTEGER
 *>          The leading dimension of the array B. LDB >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array,
 *>          dimension (LDWORK,max(K,N-K))
 *> \endverbatim
 *>
 *> \param[in] LDWORK
 *> \verbatim
 *>          LDWORK is INTEGER
 *>          The leading dimension of the array WORK. LDWORK>=max(1,K).
 *>
 *> \endverbatim
 *
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16OTHERauxiliary
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *
 *> \par Further Details:
 *  =====================
 *>
 *> \verbatim
 *>
 *>    (1) Description of the Algebraic Operation.
 *>
 *>    The matrix A is a K-by-N matrix composed of two column block
 *>    matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
 *>    A = ( A1, A2 ).
 *>    The matrix B is an M-by-N matrix composed of two column block
 *>    matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
 *>    B = ( B1, B2 ).
 *>
 *>    Perform the operation:
 *>
 *>       ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) =
 *>       ( B_out )        ( B_in )                          ( B_in )
 *>                  = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in )
 *>                          ( V2 )                            ( B_in )
 *>     On input:
 *>
 *>    a) ( A_in )  consists of two block columns:
 *>       ( B_in )
 *>
 *>       ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
 *>       ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
 *>
 *>       where the column blocks are:
 *>
 *>       (  A1_in )  is a K-by-K upper-triangular matrix stored in the
 *>                   upper triangular part of the array A(1:K,1:K).
 *>       (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
 *>
 *>       ( A2_in )  is a K-by-(N-K) rectangular matrix stored
 *>                  in the array A(1:K,K+1:N).
 *>       ( B2_in )  is an M-by-(N-K) rectangular matrix stored
 *>                  in the array B(1:M,K+1:N).
 *>
 *>    b) V = ( V1 )
 *>           ( V2 )
 *>
 *>       where:
 *>       1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored;
 *>       2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix,
 *>          stored in the lower-triangular part of the array
 *>          A(1:K,1:K) (ones are not stored),
 *>       and V2 is an M-by-K rectangular stored the array B(1:M,1:K),
 *>                 (because on input B1_in is a rectangular zero
 *>                  matrix that is not stored and the space is
 *>                  used to store V2).
 *>
 *>    c) T is a K-by-K upper-triangular matrix stored
 *>       in the array T(1:K,1:K).
 *>
 *>    On output:
 *>
 *>    a) ( A_out ) consists of two  block columns:
 *>       ( B_out )
 *>
 *>       ( A_out ) = (( A1_out ) ( A2_out ))
 *>       ( B_out )   (( B1_out ) ( B2_out )),
 *>
 *>       where the column blocks are:
 *>
 *>       ( A1_out )  is a K-by-K square matrix, or a K-by-K
 *>                   upper-triangular matrix, if V1 is an
 *>                   identity matrix. AiOut is stored in
 *>                   the array A(1:K,1:K).
 *>       ( B1_out )  is an M-by-K rectangular matrix stored
 *>                   in the array B(1:M,K:N).
 *>
 *>       ( A2_out )  is a K-by-(N-K) rectangular matrix stored
 *>                   in the array A(1:K,K+1:N).
 *>       ( B2_out )  is an M-by-(N-K) rectangular matrix stored
 *>                   in the array B(1:M,K+1:N).
 *>
 *>
 *>    The operation above can be represented as the same operation
 *>    on each block column:
 *>
 *>       ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in )
 *>       ( B1_out )        (     0 )                          (     0 )
 *>
 *>       ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in )
 *>       ( B2_out )        ( B2_in )                          ( B2_in )
 *>
 *>    If IDENT != 'I':
 *>
 *>       The computation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*(V1**H)*A1_in
 *>
 *>       B1_out: = - V2*T*(V1**H)*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in )
 *>
 *>    If IDENT == 'I':
 *>
 *>       The operation for column block 1:
 *>
 *>       A1_out: = A1_in - V1*T*A1_in
 *>
 *>       B1_out: = - V2*T*A1_in
 *>
 *>       The computation for column block 2, which exists if N > K:
 *>
 *>       A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in )
 *>
 *>       B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in )
 *>
 *>    (2) Description of the Algorithmic Computation.
 *>
 *>    In the first step, we compute column block 2, i.e. A2 and B2.
 *>    Here, we need to use the K-by-(N-K) rectangular workspace
 *>    matrix W2 that is of the same size as the matrix A2.
 *>    W2 is stored in the array WORK(1:K,1:(N-K)).
 *>
 *>    In the second step, we compute column block 1, i.e. A1 and B1.
 *>    Here, we need to use the K-by-K square workspace matrix W1
 *>    that is of the same size as the as the matrix A1.
 *>    W1 is stored in the array WORK(1:K,1:K).
 *>
 *>    NOTE: Hence, in this routine, we need the workspace array WORK
 *>    only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
 *>    the first step and W1 from the second step.
 *>
 *>    Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
 *>    more computations than in the Case (B).
 *>
 *>    if( IDENT != 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>       col1_(6) square A1: = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
 *>    less computations than in the Case (A)
 *>
 *>    if( IDENT == 'I' ) then
 *>     if ( N > K ) then
 *>       (First Step - column block 2)
 *>       col2_(1) W2: = A2
 *>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *>       col2_(4) W2: = T * W2
 *>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>       col2_(7) A2: = A2 - W2
 *>     else
 *>       (Second Step - column block 1)
 *>       col1_(1) W1: = A1
 *>       col1_(3) W1: = T * W1
 *>       col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>       col1_(6) upper-triangular_of_(A1): = A1 - W1
 *>     end if
 *>    end if
 *>
 *>    Combine these cases (A) and (B) together, this is the resulting
 *>    algorithm:
 *>
 *>    if ( N > K ) then
 *>
 *>      (First Step - column block 2)
 *>
 *>      col2_(1)  W2: = A2
 *>      if( IDENT != 'I' ) then
 *>        col2_(2)  W2: = (V1**H) * W2
 *>                      = (unit_lower_tr_of_(A1)**H) * W2
 *>      end if
 *>      col2_(3)  W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2]
 *>      col2_(4)  W2: = T * W2
 *>      col2_(5)  B2: = B2 - V2 * W2 = B2 - B1 * W2
 *>      if( IDENT != 'I' ) then
 *>        col2_(6)    W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
 *>      end if
 *>      col2_(7) A2: = A2 - W2
 *>
 *>    else
 *>
 *>    (Second Step - column block 1)
 *>
 *>      col1_(1) W1: = A1
 *>      if( IDENT != 'I' ) then
 *>        col1_(2) W1: = (V1**H) * W1
 *>                    = (unit_lower_tr_of_(A1)**H) * W1
 *>      end if
 *>      col1_(3) W1: = T * W1
 *>      col1_(4) B1: = - V2 * W1 = - B1 * W1
 *>      if( IDENT != 'I' ) then
 *>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
 *>        col1_(6_a) below_diag_of_(A1): =  - below_diag_of_(W1)
 *>      end if
 *>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
 *>
 *>    end if
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
     $                        WORK, LDWORK )
      IMPLICIT NONE
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      CHARACTER          IDENT
      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16         A( LDA, * ), B( LDB, * ), T( LDT, * ),
     $                   WORK( LDWORK, * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE, CZERO
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LNOTIDENT
      INTEGER            I, J
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
      LOGICAL            LSAME
      EXTERNAL           LSAME
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZCOPY, ZGEMM, ZTRMM
 *     ..
 *     .. Executable Statements ..
 *
 *     Quick return if possible
 *
      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
     $   RETURN
 *
      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
 *
 *     ------------------------------------------------------------------
 *
 *     First Step. Computation of the Column Block 2:
 *
 *        ( A2 ) := H * ( A2 )
 *        ( B2 )        ( B2 )
 *
 *     ------------------------------------------------------------------
 *
      IF( N.GT.K ) THEN
 *
 *        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
 *        into W2=WORK(1:K, 1:N-K) column-by-column.
 *
         DO J = 1, N-K
            CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
         END DO

         IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
 *           V1 is not an identy matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
            CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
         END IF
 *
 *        col2_(4) Compute W2: = T * W2,
 *        T is upper-triangular.
 *
         CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
     $               WORK, LDWORK )
 *
 *        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
 *        V2 stored in B1.
 *
         IF( M.GT.0 ) THEN
            CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
     $                   WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
         END IF
 *
         IF( LNOTIDENT ) THEN
 *
 *           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
 *           V1 is not an identity matrix, but unit lower-triangular,
 *           V1 stored in A1 (diagonal ones are not stored).
 *
            CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
     $                  WORK, LDWORK )
         END IF
 *
 *        col2_(7) Compute A2: = A2 - W2 =
 *                             = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K),
 *        column-by-column.
 *
         DO J = 1, N-K
            DO I = 1, K
               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     ------------------------------------------------------------------
 *
 *     Second Step. Computation of the Column Block 1:
 *
 *        ( A1 ) := H * ( A1 )
 *        ( B1 )        (  0 )
 *
 *     ------------------------------------------------------------------
 *
 *     col1_(1) Compute W1: = A1. Copy the upper-triangular
 *     A1 = A(1:K, 1:K) into the upper-triangular
 *     W1 = WORK(1:K, 1:K) column-by-column.
 *
      DO J = 1, K
         CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
      END DO
 *
 *     Set the subdiagonal elements of W1 to zero column-by-column.
 *
      DO J = 1, K - 1
         DO I = J + 1, K
            WORK( I, J ) = CZERO
         END DO
      END DO
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular with zeroes below the diagonal.
 *
         CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
      END IF
 *
 *     col1_(3) Compute W1: = T * W1,
 *     T is upper-triangular,
 *     W1 is upper-triangular with zeroes below the diagonal.
 *
      CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
     $            WORK, LDWORK )
 *
 *     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
 *     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
 *
      IF( M.GT.0 ) THEN
         CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
     $               B, LDB )
      END IF
 *
      IF( LNOTIDENT ) THEN
 *
 *        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
 *        V1 is not an identity matrix, but unit lower-triangular
 *        V1 stored in A1 (diagonal ones are not stored),
 *        W1 is upper-triangular on input with zeroes below the diagonal,
 *        and square on output.
 *
         CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
     $               WORK, LDWORK )
 *
 *        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
 *        column-by-column. A1 is upper-triangular on input.
 *        If IDENT, A1 is square on output, and W1 is square,
 *        if NOT IDENT, A1 is upper-triangular on output,
 *        W1 is upper-triangular.
 *
 *        col1_(6)_a Compute elements of A1 below the diagonal.
 *
         DO J = 1, K - 1
            DO I = J + 1, K
               A( I, J ) = - WORK( I, J )
            END DO
         END DO
 *
      END IF
 *
 *     col1_(6)_b Compute elements of A1 on and above the diagonal.
 *
      DO J = 1, K
         DO I = 1, J
            A( I, J ) = A( I, J ) - WORK( I, J )
         END DO
      END DO
 *
      RETURN
 *
 *     End of ZLARFB_GETT
 *
      END
--- a/lapack-netlib/SRC/zungtsqr_row.f
+++ b/lapack-netlib/SRC/zungtsqr_row.f
@@ -0,0 +1,380 @@
 *> \brief \b ZUNGTSQR_ROW
 *
 *  =========== DOCUMENTATION ===========
 *
 * Online html documentation available at
 *            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
 *> Download ZUNGTSQR_ROW + dependencies
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [TGZ]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [ZIP]</a>
 *> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zunrgtsqr_row.f">
 *> [TXT]</a>
 *> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
 *       SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
 *      $                         LWORK, INFO )
 *       IMPLICIT NONE
 *
 *       .. Scalar Arguments ..
 *       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *       ..
 *       .. Array Arguments ..
 *       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *       ..
 *
 *> \par Purpose:
 *  =============
 *>
 *> \verbatim
 *>
 *> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
 *> orthonormal columns from the output of ZLATSQR. These N orthonormal
 *> columns are the first N columns of a product of complex unitary
 *> matrices Q(k)_in of order M, which are returned by ZLATSQR in
 *> a special format.
 *>
 *>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
 *>
 *> The input matrices Q(k)_in are stored in row and column blocks in A.
 *> See the documentation of ZLATSQR for more details on the format of
 *> Q(k)_in, where each Q(k)_in is represented by block Householder
 *> transformations. This routine calls an auxiliary routine ZLARFB_GETT,
 *> where the computation is performed on each individual block. The
 *> algorithm first sweeps NB-sized column blocks from the right to left
 *> starting in the bottom row block and continues to the top row block
 *> (hence _ROW in the routine name). This sweep is in reverse order of
 *> the order in which ZLATSQR generates the output blocks.
 *> \endverbatim
 *
 *  Arguments:
 *  ==========
 *
 *> \param[in] M
 *> \verbatim
 *>          M is INTEGER
 *>          The number of rows of the matrix A.  M >= 0.
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
 *>          The number of columns of the matrix A. M >= N >= 0.
 *> \endverbatim
 *>
 *> \param[in] MB
 *> \verbatim
 *>          MB is INTEGER
 *>          The row block size used by ZLATSQR to return
 *>          arrays A and T. MB > N.
 *>          (Note that if MB > M, then M is used instead of MB
 *>          as the row block size).
 *> \endverbatim
 *>
 *> \param[in] NB
 *> \verbatim
 *>          NB is INTEGER
 *>          The column block size used by ZLATSQR to return
 *>          arrays A and T. NB >= 1.
 *>          (Note that if NB > N, then N is used instead of NB
 *>          as the column block size).
 *> \endverbatim
 *>
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>
 *>          On entry:
 *>
 *>             The elements on and above the diagonal are not used as
 *>             input. The elements below the diagonal represent the unit
 *>             lower-trapezoidal blocked matrix V computed by ZLATSQR
 *>             that defines the input matrices Q_in(k) (ones on the
 *>             diagonal are not stored). See ZLATSQR for more details.
 *>
 *>          On exit:
 *>
 *>             The array A contains an M-by-N orthonormal matrix Q_out,
 *>             i.e the columns of A are orthogonal unit vectors.
 *> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.  LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in] T
 *> \verbatim
 *>          T is COMPLEX*16 array,
 *>          dimension (LDT, N * NIRB)
 *>          where NIRB = Number_of_input_row_blocks
 *>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
 *>          Let NICB = Number_of_input_col_blocks
 *>                   = CEIL(N/NB)
 *>
 *>          The upper-triangular block reflectors used to define the
 *>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
 *>          reflectors are stored in compact form in NIRB block
 *>          reflector sequences. Each of the NIRB block reflector
 *>          sequences is stored in a larger NB-by-N column block of T
 *>          and consists of NICB smaller NB-by-NB upper-triangular
 *>          column blocks. See ZLATSQR for more details on the format
 *>          of T.
 *> \endverbatim
 *>
 *> \param[in] LDT
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of the array T.
 *>          LDT >= max(1,min(NB,N)).
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
 *>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
 *>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
 *>          where NBLOCAL=MIN(NB,N).
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
 *>          array, and no error message related to LWORK is issued
 *>          by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *> \endverbatim
 *>
 *  Authors:
 *  ========
 *
 *> \author Univ. of Tennessee
 *> \author Univ. of California Berkeley
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
 *> \ingroup complex16OTHERcomputational
 *
 *> \par Contributors:
 *  ==================
 *>
 *> \verbatim
 *>
 *> November 2020, Igor Kozachenko,
 *>                Computer Science Division,
 *>                University of California, Berkeley
 *>
 *> \endverbatim
 *>
 *  =====================================================================
      SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
     $                         LWORK, INFO )
      IMPLICIT NONE
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
 *     ..
 *     .. Array Arguments ..
      COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *
 *     .. Parameters ..
      COMPLEX*16         CONE, CZERO
      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
 *     ..
 *     .. Local Scalars ..
      LOGICAL            LQUERY
      INTEGER            NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM,
     $                   LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB,
     $                   KB, KB_LAST, KNB, MB1
 *     ..
 *     .. Local Arrays ..
      COMPLEX*16         DUMMY( 1, 1 )
 *     ..
 *     .. External Subroutines ..
      EXTERNAL           ZLARFB_GETT, ZLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
      INTRINSIC          DCMPLX, MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
      INFO = 0
      LQUERY  = LWORK.EQ.-1
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
         INFO = -2
      ELSE IF( MB.LE.N ) THEN
         INFO = -3
      ELSE IF( NB.LT.1 ) THEN
         INFO = -4
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
         INFO = -8
      ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN
         INFO = -10
      END IF
 *
      NBLOCAL = MIN( NB, N )
 *
 *     Determine the workspace size.
 *
      IF( INFO.EQ.0 ) THEN
         LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) )
      END IF
 *
 *     Handle error in the input parameters and handle the workspace query.
 *
      IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZUNGTSQR_ROW', -INFO )
         RETURN
      ELSE IF ( LQUERY ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     Quick return if possible
 *
      IF( MIN( M, N ).EQ.0 ) THEN
         WORK( 1 ) = DCMPLX( LWORKOPT )
         RETURN
      END IF
 *
 *     (0) Set the upper-triangular part of the matrix A to zero and
 *     its diagonal elements to one.
 *
      CALL ZLASET('U', M, N, CZERO, CONE, A, LDA )
 *
 *     KB_LAST is the column index of the last column block reflector
 *     in the matrices T and V.
 *
      KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1
 *
 *
 *     (1) Bottom-up loop over row blocks of A, except the top row block.
 *     NOTE: If MB>=M, then the loop is never executed.
 *
      IF ( MB.LT.M ) THEN
 *
 *        MB2 is the row blocking size for the row blocks before the
 *        first top row block in the matrix A. IB is the row index for
 *        the row blocks in the matrix A before the first top row block.
 *        IB_BOTTOM is the row index for the last bottom row block
 *        in the matrix A. JB_T is the column index of the corresponding
 *        column block in the matrix T.
 *
 *        Initialize variables.
 *
 *        NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A
 *        including the first row block.
 *
         MB2 = MB - N
         M_PLUS_ONE = M + 1
         ITMP = ( M - MB - 1 ) / MB2
         IB_BOTTOM = ITMP * MB2 + MB + 1
         NUM_ALL_ROW_BLOCKS = ITMP + 2
         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
 *
         DO IB = IB_BOTTOM, MB+1, -MB2
 *
 *           Determine the block size IMB for the current row block
 *           in the matrix A.
 *
            IMB = MIN( M_PLUS_ONE - IB, MB2 )
 *
 *           Determine the column index JB_T for the current column block
 *           in the matrix T.
 *
            JB_T = JB_T - N
 *
 *           Apply column blocks of H in the row block from right to left.
 *
 *           KB is the column index of the current column block reflector
 *           in the matrices T and V.
 *
            DO KB = KB_LAST, 1, -NBLOCAL
 *
 *              Determine the size of the current column block KNB in
 *              the matrices T and V.
 *
               KNB = MIN( NBLOCAL, N - KB + 1 )
 *
               CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB,
     $                     T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
     $                     A( IB, KB ), LDA, WORK, KNB )
 *
            END DO
 *
         END DO
 *
      END IF
 *
 *     (2) Top row block of A.
 *     NOTE: If MB>=M, then we have only one row block of A of size M
 *     and we work on the entire matrix A.
 *
      MB1 = MIN( MB, M )
 *
 *     Apply column blocks of H in the top row block from right to left.
 *
 *     KB is the column index of the current block reflector in
 *     the matrices T and V.
 *
      DO KB = KB_LAST, 1, -NBLOCAL
 *
 *        Determine the size of the current column block KNB in
 *        the matrices T and V.
 *
         KNB = MIN( NBLOCAL, N - KB + 1 )
 *
         IF( MB1-KB-KNB+1.EQ.0 ) THEN
 *
 *           In SLARFB_GETT parameters, when M=0, then the matrix B
 *           does not exist, hence we need to pass a dummy array
 *           reference DUMMY(1,1) to B with LDDUMMY=1.
 *
            CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
         ELSE
            CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
     $                        A( KB+KNB, KB), LDA, WORK, KNB )

         END IF
 *
      END DO
 *
      WORK( 1 ) = DCMPLX( LWORKOPT )
      RETURN
 *
 *     End of ZUNGTSQR_ROW
 *
      END