subroutine nittfq (n, xcur, fcur, fcnrm, step, eta, f, jacv, rpar, 
     &     ipar, ijacv, irpre, iksmax, ifdord, nfe, njve, nrpre, nli, r,
     &     rcgs, rtil, d, p, q, u, v, y, rwork1, rwork2, rsnrm, dinpr, 
     &                                                   dnorm, itrmks )

      implicit none

      integer ifdord, ijacv, iksmax, irpre, itrmks, n, nfe, njve,
     &        nrpre, nli

      integer ipar(*)

      double precision eta, fcnrm, rsnrm

      double precision d(n), fcur(n), p(n), q(n), rcgs(n), r(n),
     &                 rpar(*), rtil(n), rwork1(n), rwork2(n), step(n),
     &                 u(n), v(n), xcur(n), y(n)

      double precision dinpr, dnorm

      external f, jacv, dinpr, dnorm

c ------------------------------------------------------------------------
c
c This is nittfq v0.1, the TFQMR routine for determining (trial) inexact 
c Newton steps. The original reference is R. W. Freund, "A Transpose-Free
c Quasi-Minimal Residual Algorithm for Non-Hermitian Linear Systems", 
c SIAM J. Sci. Comput., 14 (1993), pp. 470-482.  The implementation here
c is based on the right preconditioned algorithm that appears in 
c J. N. Shadid and R. S. Tuminaro, "A Comparison of Preconditioned
c Nonsymmetric Krylov Methods on a Large-Scale MIMD Machine", SIAM J.
c Sci. Comput., 15 (1994), pp. 440-459.
c
c ------------------------------------------------------------------------
c
c Explanation:
c
c  n       = dimension of the problem.
c
c  xcur    = vector of length n, current approximate solution. 
c
c  fcur    = vector of length n, value of f at xcur.
c
c  fcnrm   = norm of fcur. 
c
c  step    = vector of length n, (trial) step. 
c
c  eta     = relative residual reduction factor. 
c
c  f      = name of user-supplied subroutine for evaluating the function 
c           the zero of which is sought; this routine has the form
c
c                 subroutine f(n, xcur, fcur, rpar, ipar, itrmf)
c
c           where xcur is the array containing the current x value, fcur 
c           is f(xcur) on output, rpar and ipar are, respectively, real 
c           and integer parameter/work arrays for use by the subroutine,
c           and itrmf is an integer termination flag.  The meaning of
c           itrmf is as follows:
c             0 => normal termination; desired function value calculated.
c             1 => failure to produce f(xcur).
c 
c  jacv   = name of user-supplied subroutine for evaluating J*v or 
c           P(inverse)*v, where J is the Jacobian of f and P is a 
c           right preconditioning operator. If neither analytic J*v 
c           evaluations nor right preconditioning is used, this can 
c           be a dummy subroutine; if right preconditioning is used but 
c           not analytic J*v evaluations, this need only evaluate 
c           P(inverse)*v. The form is 
c
c           subroutine jacv(n, xcur, fcur, ijob, v, z, rpar, ipar, itrmjv)
c
c           where xcur and fcur are vectors of length n containing the 
c           current x and f values, ijob is an integer flag indicating 
c           which product is desired, v is a vector of length n to be 
c           multiplied, z is a vector of length n containing the desired 
c           product on output, rpar and ipar are, respectively, real 
c           and integer parameter/work arrays for use by the subroutine, 
c           and itrmjv is an integer termination 
c           flag. The meaning of ijob is as follows: 
c             0 => z = J*v
c             1 => z = P(inverse)*v 
c           The meaning of itrmjv is as follows:
c             0 => normal termination; desired product evaluated. 
c             1 => failure to produce J*v.
c             2 => failure to produce P(inverse)*v. 
c           This subroutine is called only from nitjv, and is always 
c           called with v .ne. z. 
c
c  rpar    = real parameter/work array passed to the f and jacv routines. 
c
c  ipar    = integer parameter/work array passed to the f and jacv routines. 
c
c  ijacv   = flag for determining method of J*v evaluation.
c              0 => finite-difference evaluation (default). 
c              1 => analytic evaluation. 
c
c  irpre   = flag for right preconditioning. 
c              0 => no right preconditioning
c              1 => right preconditioning
c
c  iksmax  = maximum allowable number of TFQMR iterations. 
c
c  ifdord  = order of the finite-difference formula used in BiCGSTAB 
c            when J*v products are evaluated using finite-differences. 
c            When ijacv = 0 on input to nitsol, ifdord is set to 1, 2, or 
c            4 in nitsol; otherwise, it is irrelevant. When ijacv = 0 on 
c            input to this subroutine, ifdord determines the order of the 
c            finite-difference formula used at each BiCGSTAB iteration 
c            (default 1). In this case, ijacv is set to -1 below to 
c            signal to nitjv that the order of the finite-difference 
c            formula is to be determined by ifdord. The original value 
c            ijacv = 0 is restored on return. 
c            
c  nfe     = number of function evaluations.
c
c  njve    = number of J*v evaluations. 
c
c  nrpre   = number of P(inverse)*v evaluations.
c
c  nli     = number of linear iterations.
c
c  r       = residual vector (for the QMR process)
c
c  rcgs    = residual vector (of the underlying CGS process)
c
c  rtil    = 'shadow' residual vector used in bi-orthogonalization
c
c  d       = vector used in TFQMR
c
c  p       = vector used in TFQMR
c
c  q       = vector used in TFQMR
c
c  u       = vector used in TFQMR
c
c  v       = vector used in TFQMR
c
c  y       = vector used in TFQMR
c
c  rwork1  = work vector, passed on to nitjv
c
c  rwork2  = work vector, passed on to nitjv
c
c  rsnrm   = TFQMR residual norm on return. 
c
c  dinpr   = inner-product routine, either user-supplied or blas ddot. 
c
c  dnorm   = norm routine, either user-supplied or blas dnrm2. 
c
c  itrmks  = termination flag; values have the following meanings: 
c              0 => normal termination: acceptable step found. 
c              1 => J*v failure in nitjv. 
c              2 => P(inverse)*v failure in nitjv. 
c              3 => acceptable step not found in iksmax TFQMR iterations. 
c              4 => TFQMR breakdown.
c              5 => floating point error (the underlying CGS iteration
c                   has probably blown up)
c
c             Note: On return, nitsol terminates if itrmks is 1 or 2. 
c             If itrmks is 3 or 4, nitsol may terminate or continue. 
c             In this event, the step returned is a meaningful inexact 
c             Newton step only if the residual norm has been reduced. 
c             A decision on termination/continuation is made in nitdrv 
c             according to whether there is sufficient residual norm 
c             reduction, even though the desired inexact Newton condition 
c             may not hold.  
c
c -------------------------------------------------------------------------

c  Subroutines required by this and all called routines:

c    user supplied:  nitjv

c    nitsol routines: none

c    BLAS routines- dcopy, daxpy, dscal, dswap

c    LAPACK routines - dlamch

c    user supplied or BLAS:  dinpr, dnorm

c    explanation: In nitsol, dinpr and dnorm are set to either the BLAS 
c    ddot and dnrm2 routines or the user-supplied routines. 

c This subroutine called by: nitdrv

c Subroutines called by this subroutine: daxpy, dcopy, dscal, dswap, dinpr,
c    dlamch, dnorm, nitjv, dlamch

c Common block: 

      include 'nitprint.h'

c If diagnostic information is desired, include this common block in the 
c main program and set iplvl and ipunit according to the following: 
c
c     iplvl = 0 => no printout
c           = 1 => iteration numbers and F-norms
c           = 2 => ... + some stats, step norms, and linear model norms
c           = 3 => ... + some Krylov solver and backtrack information
c           = 4 => ... + more Krylov solver and backtrack information
c
c     ipunit = printout unit number.

c  Parameters-

      double precision zero,       one
      parameter      ( zero=0.0d0, one=1.0d0 )

c  Local variables-

      integer i
      integer itask
      integer itrmjv
      integer itfq
      integer k

      double precision alpha
      double precision abstol
      double precision beta
      double precision c
      double precision cgsnorm
      double precision qmreta
      double precision omega
      double precision rho
      double precision rho_old
      double precision sigma
      double precision t
      double precision tau
      double precision theta

      character*2 ab(0:1)
      data ab /'.a','.b'/

      double precision sfmin
      data sfmin /zero/

c  External subroutines-

      double precision dlamch

      external daxpy
      external dcopy
      external dlamch
      external dscal
      external dswap
      external nitjv

c  Intrinsics-

      intrinsic abs
      intrinsic dble
      intrinsic sqrt

c  Start of executable code-

c  Initialize sfmin only on first entry.

      if ( sfmin .eq. zero ) sfmin = dlamch( 's' )

c If finite-differences are used to evaluate J*v products (ijacv= 0), then 
c ijacv is set to -1 within this subroutine to signal to nitjv that the 
c order of the finite-difference formula is to be determined by ifdord. 
c The original value ijacv= 0 is restored on return. 

      if (ijacv .eq. 0) ijacv = -1 

c Set the stopping tolerance, initialize the step, etc. 

      rsnrm = fcnrm
      abstol = eta*rsnrm
      do 10 i = 1, n
         step(i) = zero
 10   continue
      itfq = 0

c For printing:

      if ( iplvl .ge. 3 ) then 
         write(ipunit,*) 
         write(ipunit,800) eta 
      endif
      if ( iplvl .ge. 4 ) then 
         write(ipunit,810) 
         write(ipunit,*) 
         write(ipunit,820) itfq, rsnrm 
      endif

c  Initialize residual and work vectors.

      call dcopy( n, fcur, 1, rcgs, 1 )
      call dscal( n, -one, rcgs, 1 )

c  Choice here is rtil = r.

      call dcopy( n, rcgs, 1, rtil, 1 )

      if ( irpre .eq. 0 ) then
         call dcopy( n, rcgs, 1, p, 1 )
      else
         itask = 2
         call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &               itask, nfe, njve, nrpre, rcgs, p, rwork1, rwork2,
     &                                                   dnorm, itrmjv )    
         if ( itrmjv .ne. 0 ) then
            itrmks = 2
            goto 900
         endif
      endif
      call dcopy( n, p, 1, u, 1 )
      rho = dinpr( n, rcgs, 1, rtil, 1 )
      do 20 i = 1, n
         d(i) = zero
 20   continue

      itask = 0
      call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &            itask, nfe, njve, nrpre, p, v, rwork1, rwork2,
     &                                                   dnorm, itrmjv )
      if ( itrmjv .ne. 0 ) then
         itrmks = 1
         goto 900
      end if

      alpha = zero
      omega = fcnrm
      tau = omega
      theta = zero
      qmreta = zero

c  Start iterations.

 100  continue
      itfq = itfq + 1
      nli = nli + 1
      sigma = dinpr( n, rtil, 1, v, 1 )

c  If sigma = 0 we have a serious breakdown.  We check this condition
c  by trying to detect whether division by sigma causes an overflow.

      if ( abs(sigma) .lt. sfmin*abs(rho) ) then
         itrmks = 4
         goto 900
      else
         alpha = rho/sigma
      endif

c  Need Pv for calculation of q.  First store result in q, then
c  swap some vectors to cast calculation of q as a SAXPY.

      if ( irpre .eq. 0 ) then
         call dcopy( n, v, 1, q, 1 )
      else
         itask = 2
         call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &               itask, nfe, njve, nrpre, v, q, rwork1, rwork2,
     &                                                   dnorm, itrmjv )    
         if ( itrmjv .ne. 0 ) then
            itrmks = 2
            goto 900
         endif
      endif
      call dcopy( n, u, 1, y, 1 )
      call dswap( n, q, 1, u, 1 )
      call daxpy( n, -alpha, u, 1, q, 1 )
      call dcopy( n, y, 1, u, 1 )

c  Update residual.

      do 30 i = 1, n
         y(i) = u(i) + q(i)
 30   continue
      itask = 0
      call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &            itask, nfe, njve, nrpre, y, v, rwork1, rwork2,
     &                                                   dnorm, itrmjv )
      if ( itrmjv .ne. 0 ) then
         itrmks = 1
         goto 900
      end if
      call daxpy( n, -alpha, v, 1, rcgs, 1 )
      cgsnorm = dnorm( n, rcgs, 1 )

c  Check for cgsnorm = NaN.

      if ( cgsnorm .ne. cgsnorm ) then
         itrmks = 5
         goto 900
      endif

c  QMR section.

      do 60 k = 0, 1

c  Use weighting strategy from (5.11) of Freund reference.

         t = qmreta*theta**2
         t = t/alpha

         if ( k .eq. 0 ) then
            do 40 i = 1, n
               d(i) = u(i) + t*d(i)
 40         continue
            omega = sqrt(omega*cgsnorm)
         else if ( k .eq. 1 ) then
            do 50 i = 1, n
               d(i) = q(i) + t*d(i)
 50         continue
            omega = cgsnorm
         endif

         theta = omega/tau
         c = one/sqrt(one + theta**2)
         tau = tau*theta*c
         qmreta = alpha*c**2

c For printing:

         if ( iplvl .ge. 4 .and. tau .gt. abstol ) then 
            write(ipunit,830) itfq, ab(k), tau, '     (estimated)'
         endif

         call daxpy( n, qmreta, d, 1, step, 1 )

c  Convergence check.  Do a cheap test to save on Jacobi-vector products.
c  In case residual history is requested by iplvl, we must calculate 
c  the QMR residual from scratch.  Note termination is always determined
c  by the smoothed residual, calculated from scratch.

         if ( tau .le. abstol ) then

            itask = 0
            call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv,
     &            ifdord, itask, nfe, njve, nrpre, step, r, rwork1,
     &                                           rwork2, dnorm, itrmjv )

c  This calculation of the QMR residual is off by a factor
c  of -1, but we don't care until we return from this routine.

            call daxpy( n, one, fcur, 1, r, 1 )
            rsnrm = dnorm( n, r, 1 )

c For printing:

            if ( iplvl .ge. 4 ) then 
               write(ipunit,830) itfq, ab(k), rsnrm, '  (from scratch)'
            endif

c  Check for rsnrm = NaN.

            if ( rsnrm .ne. rsnrm ) then
               itrmks = 5
               goto 900
            endif

c  If rsnrm is small enough, exit.

            if ( rsnrm .lt. abstol ) then
               itrmks = 0
               goto 900
            endif
         endif

 60   continue

      rho_old = rho
      rho = dinpr( n, rtil, 1, rcgs, 1 )

c  If rho_old = 0 we have a serious breakdown.  We check this condition
c  by trying to detect whether division by rho_old causes an overflow.

      if ( abs(rho_old) .lt. sfmin*abs(rho) ) then
         itrmks = 4
         goto 900
      else
         beta = rho/rho_old
      endif

      if ( irpre .eq. 0 ) then
         call dcopy( n, rcgs, 1, v, 1 )
      else
         itask = 2
         call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &               itask, nfe, njve, nrpre, rcgs, v, rwork1, rwork2,
     &                                                   dnorm, itrmjv )    
         if ( itrmjv .ne. 0 ) then
            itrmks = 2
            goto 900
         endif
      endif
      call daxpy( n, beta, q, 1, v, 1 )
      call dcopy( n, v, 1, u, 1 )
      call daxpy( n, beta, p, 1, q, 1 )
      call daxpy( n, beta, q, 1, v, 1 )
      call dcopy( n, v, 1, p, 1 )

      itask = 0
      call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv, ifdord,
     &            itask, nfe, njve, nrpre, p, v, rwork1, rwork2,
     &                                                   dnorm, itrmjv )
      if ( itrmjv .ne. 0 ) then
         itrmks = 1
         goto 900
      end if
      if ( itfq .ge. iksmax ) then
         itrmks = 3
         goto 900
      end if

c  Do again

      goto 100

c  All returns made here.

  900 continue

c  If residual hasn't been updated, force
c  computation of residual from scratch.

      if ( rsnrm .eq. fcnrm ) then

         itask = 0
         call nitjv( n, xcur, fcur, f, jacv, rpar, ipar, ijacv,
     &         ifdord, itask, nfe, njve, nrpre, step, r, rwork1,
     &                                        rwork2, dnorm, itrmjv )

         call daxpy( n, one, fcur, 1, r, 1 )
         rsnrm = dnorm( n, r, 1 )
         if ( rsnrm .le. abstol ) itrmks = 0

      end if

c  Correct residual before returning.

      call dscal( n, -one, r, 1 )

c If ijacv = -1, then restore it to the original value ijacv = 0. 

      if (ijacv .eq. -1) ijacv = 0 

c For printing:

      if ( iplvl .ge. 3 ) then 
         write(ipunit,*) 
         if (itrmks .ne. 1 .and. itrmks .ne. 2) then 
            write(ipunit,840) itrmks, rsnrm 
         else
            write(ipunit,850) itrmks
         endif
      endif

      return

 800  format('nittfq:  eta =', 1pd10.3)
 810  format('nittfq:  TFQMR iteration no. (parts a and b),',
     &                                        ' linear residual norm')
 820  format(5x,i4,5x,1pd10.3)
 830  format(5x,i4,a2,3x,1pd10.3,a16)
 840  format('nittfq:  itrmks =', i2, '   final lin. res. norm =', 
     &                                                          1pd10.3)
 850  format('nittfq: itrmks:', i4) 

c  End of nittfq.

      end