!|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

 module POP_SolversMod

!BOP
! !MODULE: POP_SolversMod
!
! !DESCRIPTION:
!  This module contains routines and operators for solving the elliptic
!  system for surface pressure in the barotropic mode.
!
! !REVISION HISTORY:
!  SVN:$Id: $

! !USES:

   use POP_KindsMod
   use blocks, only: block, get_block
   use POP_ErrorMod
   use POP_CommMod
   use POP_ConfigMod
   use POP_IOUnitsMod
   use POP_BlocksMod
   use POP_DistributionMod
   use POP_GridHorzMod
   use POP_ReductionsMod
   use POP_BroadcastMod
   use POP_RedistributeMod
   use POP_HaloMod
   use POP_GridHorzMod
   use POP_FieldMod
   use POP_DomainSizeMod
   use time_management,only:matsuno_ts
   use domain
   use grid

   implicit none
   private
   save

! !PUBLIC MEMBER FUNCTIONS:

   public :: POP_SolversInit,     &
             POP_SolversPrep,     &
             POP_SolversRun,      &
             POP_SolversDiagonal, &
             POP_SolversGetDiagnostics

! !PUBLIC DATA MEMBERS:


!-----------------------------------------------------------------------
!
!  other operator and preconditioning weights for barotropic operator
!
!-----------------------------------------------------------------------

   real (POP_r8), dimension (:,:,:), allocatable, public :: & 
      mMaskTropic,     &! land mask in barotropic distribution 
      btropWgtCenter,  &! barotropic operater center coefficient
      btropWgtNorth,   &! barotropic operater north  coefficient
      btropWgtEast,    &! barotropic operater east   coefficient
      btropWgtNE        ! barotropic operater northeast coefficient

   real (POP_r8), dimension (:,:,:), allocatable, public :: & 
      centerWgtClinicIndep, &! time indep  center wgt on clinic distrb
      centerWgtClinic        ! time depend center wgt on clinic distrb  

!-----------------------------------------------------------------------
!
!  EVP preconditioner influence matrix and block coefficient matrix,
!  Saved in EVP sub-block format. Dimension switched:
!  (X,Y,block_ID) ==> (subblock_X, subblock_Y, subblock_ID, block_ID) 
!
!-----------------------------------------------------------------------

   real (POP_r8), dimension (:,:,:,:), allocatable, public :: & 
      EvpRinv               ! EVP influence matrix

   real (POP_r8), dimension (:,:,:,:), allocatable, public :: & 
      EvpCenterWgt,        &! reshape coefficients into blocks 
      InvEvpCenterWgt,     &! save inverse for computational efficiency
      EvpNeWgt,            &! 
      InvEvpNeWgt          

   integer (POP_i4),dimension(:,:,:),allocatable :: &
      landIndx              ! index of land EVP blocks 

!EOP
!BOC
   !*** preconditioning operator coefficients (ninept operator)

   real (POP_r8), dimension (:,:,:), allocatable :: & 
      precondCenter,              &
      precondNorth, precondSouth, &
      precondEast,  precondWest,  &
      precondNE,    precondSE,    &
      precondNW,    precondSW

!-----------------------------------------------------------------------
!
!  supported solvers and preconditioners
!
!-----------------------------------------------------------------------

   character (POP_charLength) :: &
      solverChoice

   character (3), parameter :: &
      solverChoicePCG = 'pcg'
   character (4), parameter :: & 
      solverChoicePCSI = 'PCSI'
   character (9), parameter :: &
      solverChoiceChronGear = 'ChronGear'

   character (POP_charLength) :: &
      preconditionerChoice

   character (8), parameter :: &
      precondChoiceDiag = 'diagonal'
   character (4), parameter :: &
      precondChoiceFile = 'file'
   character (3), parameter :: &
      precondChoiceEvp = 'evp' 

   logical (POP_logical) :: &
      usePreconditioner

!-----------------------------------------------------------------------
!
!  scalar convergence-related variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) ::      &
      maxIterations,        &! max number of solver iterations
      convergenceCheckStart,&! start checking from given steps
      convergenceCheckFreq   ! check convergence every freq steps

   real (POP_r8) ::         &
      convergenceCriterion, &! convergence error criterion
      LanczosconvergenceCriterion, &! Lanczos convergence criterion
      residualNorm           ! residual normalization

   !*** convergence diagnostics

   integer (POP_i4), public ::      &
      numIterations          ! accumulated no of iterations (diagnostic)

   real (POP_r8) ::         &
      rmsResidual            ! residual (also a diagnostic)

   real (POP_r8) ::        & 
      PcsiMaxEigs,         &! Largest eigenvalues for PCSI method
      PcsiMinEigs           ! smallest eigenvalues for PCSI method

   integer(POP_i4), parameter  :: &
      EvpXbs  = 8,        &! EVP block size along longitude
      EvpYbs  = 8          ! EVP block size along latitude

   integer (POP_i4) :: &
      MaxLanczosStep        ! Max lanczos steps to get eigenvalues 

   integer (POP_i4) :: &
      EvpXnb,          &    ! sub block number along longitude
      EvpYnb                ! sub block number along latitude

   integer (POP_i4),dimension(:),allocatable :: &
      EvpXbidx,          &  ! index of sub blocks start point 
      EvpYbidx              ! along longtitude and latitude

!EOC
!***********************************************************************

 contains
!***********************************************************************
!BOP
! !SUBROUTINE: POP_SolversPrep
! !INTERFACE:

 subroutine POP_SolversPrep(errorCode)

! !DESCRIPTION:
! solver preprocessing : 
! 1. compute eigenvalues if PCSI is used 
! 2. Preprocessing EVP influence matrix if EVP preconditioning is used 
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu et al., Tsinghua University
!BOP
! !INPUT PARAMETERS:


! !INPUT/OUTPUT PARAMETERS:


! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode         ! returned error code
   integer (POP_i4) :: &
      numProcs           ! number of processors in barotropic distrib

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      numBlocks,          &! number of blocks in barotropic distrib
      bid,                &! local counters
      nx1,ny1             ! POP_nxBlock -1


   errorCode = POP_Success
   if (POP_myTask == POP_masterTask) then
      write(POP_stdout,POP_blankFormat)
      write(POP_stdout,POP_delimFormat)
      write(POP_stdout,'(a42)') ' Solver Preprocessing (barotropic solver)'
      write(POP_stdout,POP_delimFormat)
      write(POP_stdout,POP_blankFormat)
   endif

   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numProcs = numProcs)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error getting num procs')
      return
   endif

   call POP_RedistributeBlocks(btropWgtCenter,  POP_distrbTropic, &
                               centerWgtClinic, POP_distrbClinic, errorCode)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error redistributing center operator weight')
      return
   endif

   if (POP_myTask < numProcs) then
      call POP_DistributionGet(POP_distrbTropic, errorCode, &
                               numLocalBlocks = numBlocks)

!-----------------------------------------------------------------------
!
!   EVP preprocessing
!
!-----------------------------------------------------------------------
      if (trim(preconditionerChoice) == precondChoiceEvp) then

          if (POP_myTask == POP_masterTask) then
            write(POP_stdout,'(a50)')  & 
              'Solver EVP preconditioning START '
          endif

          EvpCenterWgt(:,:,:,:) = 0.0
          InvEvpCenterWgt(:,:,:,:) = 0.0
          EvpNeWgt(:,:,:,:) = 0.0
          InvEvpNeWgt(:,:,:,:) = 0.0

          do bid=1,numBlocks
            nx1 = POP_nxBlock-1
            ny1 = POP_nyBlock-1
            call EvpPre(btropWgtCenter(2:nx1,2:ny1,bid),btropWgtNE(2:nx1,2:ny1,bid),&
                        EvpCenterWgt(:,:,:,bid),EvpNeWgt(:,:,:,bid),EvpRinv(:,:,:,bid),&
                        landIndx(:,:,bid), nx1-1,ny1-1,EvpXbs,EvpYbs,&
                        EvpXbidx,EvpYbidx,EvpXnb,EvpYnb,bid)
            

            ! Save inverse of coefficents for efficiency
            where(EvpCenterWgt(:,:,:,bid) .ne. 0.0) 
              InvEvpCenterWgt(:,:,:,bid) = &
                          1.0_POP_r8/EvpCenterWgt(:,:,:,bid)
            end where
            where(EvpNeWgt(:,:,:,bid) .ne. 0.0)
              InvEvpNeWgt(:,:,:,bid) = 1.0_POP_r8/EvpNeWgt(:,:,:,bid)
            end where

          end do 

          if (POP_myTask == POP_masterTask) then
            write(POP_stdout,'(a35)') ' Solver EVP preconditioning END'
          endif

      endif

   
!-----------------------------------------------------------------------
!
! call lanczos function to compute eigenvalues for PCSI
!
!-----------------------------------------------------------------------

      if(trim(solverChoice) .eq. solverChoicePCSI) then
        call PcsiLanczos(POP_nxBlock, POP_nyBlock, numBlocks, &
                    MaxLanczosStep, PcsiMaxEigs,PcsiMinEigs, errorCode)

        if (POP_myTask == POP_masterTask) then 
            write(POP_stdout,*) "LANCZOS EIGS: ",PcsiMinEigs, PcsiMaxEigs
        endif 
      endif 

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversRun: error computing eigenvalues in Lanczos')
         return
      endif

   endif

!-----------------------------------------------------------------------
!EOC

 end subroutine POP_SolversPrep

!***********************************************************************
!BOP
! !IROUTINE: POP_SolversRun
! !INTERFACE:

 subroutine POP_SolversRun(sfcPressure, rhsClinic, errorCode)

! !DESCRIPTION:
!  Solves the elliptic equation for surface pressure by calling
!  the requested solver routine.  Also redistributes necessary
!  array to the barotropic distribution of blocks for better performance
!  of the solver.
!  The elliptic equation is
!  \begin{equation}
!     AF = B
!  \end{equation}
!  where $F$ is a field (eg surface pressure), $B$ is the right hand side
!  and $A$ is the operator defined as
!  \begin{equation}
!     AF = a \nabla\cdot(H \nabla F)
!  \end{equation}
!  where $a$ is the cell area.
!
! !REVISION HISTORY:
!  same as module

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: &
      rhsClinic         ! right-hand-side of linear system
                        !  for blocks in baroclinic distribution

! !INPUT/OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(inout) :: &
     sfcPressure              ! on input,  initial guess in baroclinic distrb
                        ! on output, final solution for sfc pressure

! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode         ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      numProcs           ! number of processors in barotropic distrib

   real (POP_r8), dimension(size(sfcPressure,dim=1), &
                            size(sfcPressure,dim=2), &
                            size(sfcPressure,dim=3)) :: &
      pressTropic,     &! surface pressure in barotropic distribution
      rhsTropic         ! right hand side  in barotropic distribution

!-----------------------------------------------------------------------
!
!  switch to the barotropic distribution for iterative solvers
!
!-----------------------------------------------------------------------

   errorCode = POP_Success

   call POP_RedistributeBlocks(btropWgtCenter,  POP_distrbTropic, &
                               centerWgtClinic, POP_distrbClinic, errorCode)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error redistributing center operator weight')
      return
   endif

   call POP_RedistributeBlocks(pressTropic, POP_distrbTropic, &
                               sfcPressure, POP_distrbClinic, errorCode)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error redistributing pressure')
      return
   endif

   call POP_RedistributeBlocks(rhsTropic, POP_distrbTropic, &
                               rhsClinic, POP_distrbClinic, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error redistributing right hand side')
      return
   endif

!-----------------------------------------------------------------------
!
!  call proper routine based on user choice of solver
!
!-----------------------------------------------------------------------

   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numProcs = numProcs)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error getting num procs')
      return
   endif

   if (POP_myTask < numProcs) then

!-----------------------------------------------------------------------
!
!  solver preprocessing if PCSI or EVP preconditioner is used 
!
!-----------------------------------------------------------------------

      select case(trim(solverChoice))
      case (solverChoicePCG)

         call pcg(pressTropic, rhsTropic, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolverRun: error in PCG')
            return
         endif

      case (solverChoicePCSI)  

         call PCSI(pressTropic, rhsTropic, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolverRun: error in PCSI')
            return
         endif
      case (solverChoiceChronGear)

         call ChronGear(pressTropic, rhsTropic, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolverRun: error in ChronGear')
            return
         endif

      end select
   endif

!-----------------------------------------------------------------------
!
!  switch solution back to the baroclinic distribution
!
!-----------------------------------------------------------------------

   call POP_RedistributeBlocks(sfcPressure, POP_distrbClinic, &
                               pressTropic, POP_distrbTropic, errorCode)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversRun: error redistributing pressure back')
      return
   endif

!-----------------------------------------------------------------------
!EOC

 end subroutine POP_SolversRun

!***********************************************************************
!BOP
! !IROUTINE: POP_SolversInit
! !INTERFACE:

 subroutine POP_SolversInit(errorCode)

! !DESCRIPTION:
!  This routine initializes choice of solver, calculates the 
!  coefficients of the 9-point stencils for the barotropic operator and
!  reads in a preconditioning matrix if requested.
!
! !REVISION HISTORY:
!  same as module
!
! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode       ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables:
!
!       {X,Y}{NE,SE,NW,SW} = contribution to {ne,se,nw,sw} coefficients 
!         from {x,y} components of divergence
!       HU = depth at U points
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      i,j,lEvp,nbEvp,  &! dummy loop counters
      configUnit,      &! unit for configuration file
      numBlocksClinic, &! num local blocks in baroclinic distribution
      numBlocksTropic, &! num local blocks in barotropic distribution
      iblock,          &! block counter
      istat             ! status flag for allocates

   character (POP_charLength) :: &
      preconditionerFile  ! file containing preconditioner

   real (POP_r8) ::    &
      xne,xse,xnw,xsw, &! contribution to coefficients from x,y
      yne,yse,ynw,ysw, &!   components of divergence
      ase,anw,asw

   real (POP_r8), dimension(:,:,:), allocatable :: &
      work0,           &! temp space for computing barotropic
      workNorth,       &!
      workEast,        &!
      workNE,          &!
      mMaskTmp          ! land mask in barotropic distribution


!-----------------------------------------------------------------------
!
!  read solver choice and solver constants from configuration file
!
!-----------------------------------------------------------------------

   errorCode = POP_Success

   if (POP_myTask == POP_masterTask) then
      write(POP_stdout,POP_blankFormat)
      write(POP_stdout,POP_delimFormat)
      write(POP_stdout,'(a35)') ' Solver options (barotropic solver)'
      write(POP_stdout,POP_delimFormat)
      write(POP_stdout,POP_blankFormat)
   endif

   call POP_ConfigOpen(configUnit, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error opening config file')
      return
   endif

!  call POP_ConfigRead(configUnit, 'solvers', 'solverChoice',    &
   call POP_ConfigRead(configUnit, 'solvers', 'solverchoice',    &
                       solverChoice, solverChoicePCG, errorCode, &
                       outStringBefore = 'Solver choice: ')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading solver choice')
      return
   endif

!  call POP_ConfigRead(configUnit, 'solvers', 'convergenceCriterion', &
   call POP_ConfigRead(configUnit, 'solvers', 'convergencecriterion', &
                   convergenceCriterion, 1.e-12_POP_r8, errorCode,    &
                   outStringBefore = 'Solver converged for err < ')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading solver convergence criterion')
      return
   endif

!  call POP_ConfigRead(configUnit, 'solvers', 'maxIterations',    &
   call POP_ConfigRead(configUnit, 'solvers', 'maxiterations',    &
                       maxIterations, 1000, errorCode, &
                       outStringBefore = 'Solver maximum iterations: ')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading solver max iterations')
      return
   endif
!-----------------------------------------------------------------------
!
! Read in Lanczos parameters for PCSI
!
!-----------------------------------------------------------------------

!  call POP_ConfigRead(configUnit, 'solvers', 'LanczosconvergenceCriterion', &
   call POP_ConfigRead(configUnit, 'solvers', 'lanczosconvergencecriterion', &
                   LanczosconvergenceCriterion, 0.1_POP_r8, errorCode,    &
                   outStringBefore = 'Lanczos converged for err < ')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading Lanczos convergence criterion')
      return
   endif

   call POP_ConfigRead(configUnit, 'solvers', 'maxlanczosstep', &
                       MaxLanczosStep, 20, errorCode,           &
                       outStringBefore = 'Maximum Lanczos step ',  &
                       outStringAfter  = ' iterations')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading lanczos step')
      return
   endif
!  call POP_ConfigRead(configUnit, 'solvers', 'convergenceCheckstart', &
   call POP_ConfigRead(configUnit, 'solvers', 'convergencecheckstart', &
                       convergenceCheckStart, 60, errorCode,           &
                       outStringBefore = 'Check convergence start from ',  &
                       outStringAfter  = ' iterations')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading convergence check start steps')
      return
   endif

!  call POP_ConfigRead(configUnit, 'solvers', 'convergenceCheckFreq', &
   call POP_ConfigRead(configUnit, 'solvers', 'convergencecheckfreq', &
                       convergenceCheckFreq, 10, errorCode,           &
                       outStringBefore = 'Check convergence every ',  &
                       outStringAfter  = ' iterations')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading convergence check frequency')
      return
   endif

!  call POP_ConfigRead(configUnit, 'solvers', 'preconditionerChoice', &
   call POP_ConfigRead(configUnit, 'solvers', 'preconditionerchoice', &
                       preconditionerChoice, precondChoiceDiag,       &
                       errorCode,                                     &
                       outStringBefore = 'Preconditioner choice: ')

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error reading preconditioner choice')
      return
   endif

   if (trim(preconditionerChoice) == precondChoiceFile) then
!     call POP_ConfigRead(configUnit, 'solvers', 'preconditionerFile', &
      call POP_ConfigRead(configUnit, 'solvers', 'preconditionerfile', &
                 preconditionerFile, 'UnknownPrecondFile', errorCode,  &
                 outStringBefore = 'Reading preconditioner from file: ')

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversInit: error reading preconditioner file name')
         return
      endif
   endif

   call POP_ConfigClose(configUnit, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error closing config file')
      return
   endif

!-----------------------------------------------------------------------
!
!  check inputs
!
!-----------------------------------------------------------------------

   select case(trim(solverChoice))
   case(solverChoicePCG)
   case(solverChoicePCSI) 
   case(solverChoiceChronGear)
   case default
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: unknown solver - must be pcg, ChronGear, PCSI')
      return
   end select


   select case (trim(preconditionerChoice))
   case(precondChoiceDiag)
      usePreconditioner = .false.   ! default is diagonal
   case(precondChoiceEvp)
      usePreconditioner = .true.
   case(precondChoiceFile)
      usePreconditioner = .true.
   case default
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: unknown preconditioner choice')
      return
   end select

!-----------------------------------------------------------------------
!
!   Matsnuo timestep is not supported in PCSI or EVP. Because it  uses 
!   a half timestep, which introduces a different coefficient matrix A. 
!
!-----------------------------------------------------------------------
   if (matsuno_ts .and. (trim(preconditionerChoice) == precondChoiceEvp &
                   .or.  trim(solverChoice) == solverChoicePCSI)) then
      call POP_ErrorSet(errorCode, &
      'POP_SolversInit: Matsnuo timestep is not supported in &
       &PCSI solver or EVP preconditioning')
      return
   endif

!-----------------------------------------------------------------------
!
!  compute nine point operator coefficients: compute on baroclinic
!  decomposition first where grid info defined and redistribute
!  to barotropic distribution
!  leave center coefficients in baroclinic distribution to facilitate 
!  easy time-dependent changes in barotropic routine
!
!-----------------------------------------------------------------------

   call POP_DistributionGet(POP_distrbClinic, errorCode, &
                            numLocalBlocks = numBlocksClinic)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error retrieving clinic local block count')
      return
   endif

   allocate(work0     (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
            workNorth (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
            workEast  (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
            workNE    (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
            mMaskTmp  (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
       centerWgtClinicIndep(POP_nxBlock,POP_nyBlock,numBlocksClinic), &
       centerWgtClinic     (POP_nxBlock,POP_nyBlock,numBlocksClinic), &
            stat=istat)

   if (istat > 0) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error allocating temporary arrays')
      return
   endif

   !$OMP PARALLEL DO PRIVATE(iblock,i,j,xne,xse,xnw,xsw,yne,yse,ynw,ysw, &
   !$OMP                     ase,anw,asw)

   do iblock = 1,numBlocksClinic

      work0                (:,:,iblock) = 0.0_POP_r8
      workNorth            (:,:,iblock) = 0.0_POP_r8
      workEast             (:,:,iblock) = 0.0_POP_r8
      workNE               (:,:,iblock) = 0.0_POP_r8
      mMaskTmp             (:,:,iblock) = 0.0_POP_r8
      centerWgtClinicIndep (:,:,iblock) = 0.0_POP_r8

      do j=2,POP_nyBlock
      do i=2,POP_nxBlock

         xne = 0.25_POP_r8*HU(i  ,j  ,iblock)*DXUR(i  ,j  ,iblock)* &
                                              DYU (i  ,j  ,iblock)
         xse = 0.25_POP_r8*HU(i  ,j-1,iblock)*DXUR(i  ,j-1,iblock)* &
                                              DYU (i  ,j-1,iblock)
         xnw = 0.25_POP_r8*HU(i-1,j  ,iblock)*DXUR(i-1,j  ,iblock)* &
                                              DYU (i-1,j  ,iblock)
         xsw = 0.25_POP_r8*HU(i-1,j-1,iblock)*DXUR(i-1,j-1,iblock)* &
                                              DYU (i-1,j-1,iblock)

         yne = 0.25_POP_r8*HU(i  ,j  ,iblock)*DYUR(i  ,j  ,iblock)* &
                                              DXU (i  ,j  ,iblock)
         yse = 0.25_POP_r8*HU(i  ,j-1,iblock)*DYUR(i  ,j-1,iblock)* &
                                              DXU (i  ,j-1,iblock)
         ynw = 0.25_POP_r8*HU(i-1,j  ,iblock)*DYUR(i-1,j  ,iblock)* &
                                              DXU (i-1,j  ,iblock)
         ysw = 0.25_POP_r8*HU(i-1,j-1,iblock)*DYUR(i-1,j-1,iblock)* &
                                              DXU (i-1,j-1,iblock)

         workNE(i,j,iblock) = xne + yne
         ase                = xse + yse
         anw                = xnw + ynw
         asw                = xsw + ysw
 
         workEast (i,j,iblock)  = xne + xse - yne - yse
         workNorth(i,j,iblock)  = yne + ynw - xne - xnw

         centerWgtClinicIndep(i,j,iblock) = &
                        -(workNE(i,j,iblock) + ase + anw + asw)

         work0   (i,j,iblock) = TAREA (i,j,iblock)**2
         mMaskTmp(i,j,iblock) = RCALCT(i,j,iblock)

      end do
      end do
   end do

   !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!  redistribute operator weights and mask to barotropic distribution
!
!-----------------------------------------------------------------------

   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numLocalBlocks = numBlocksTropic)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error retrieving tropic local block count')
      return
   endif

   allocate(btropWgtCenter (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
            btropWgtNorth  (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
            btropWgtEast   (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
            btropWgtNE     (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
            mMaskTropic    (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
            stat=istat)

   if (istat > 0) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error allocating operator weights')
      return
   endif

   call POP_RedistributeBlocks(btropWgtNorth, POP_distrbTropic, &
                               workNorth,     POP_distrbClinic, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error redistributing north operator weight')
      return
   endif

   call POP_RedistributeBlocks(btropWgtEast, POP_distrbTropic, &
                               workEast,     POP_distrbClinic, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error redistributing east operator weight')
      return
   endif

   call POP_RedistributeBlocks(btropWgtNE, POP_distrbTropic, &
                               workNE,     POP_distrbClinic, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error redistributing NE operator weight')
      return
   endif

   call POP_RedistributeBlocks(mMaskTropic, POP_distrbTropic, &
                               mMaskTmp,    POP_distrbClinic, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error redistributing mask')
      return
   endif

!-----------------------------------------------------------------------
!
!  calculate normalization constant (darea,darea) for rmsResidual
!  in cgr routine.
!
!-----------------------------------------------------------------------

   residualNorm = 1.0_POP_r8/POP_GlobalSum(work0, POP_distrbClinic, &
                                           POP_gridHorzLocCenter,   &
                                           errorCode,               &
                                           mMask = mMaskTmp)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error computing normalization factor')
      return
   endif

   convergenceCriterion = convergenceCriterion**2/residualNorm

   deallocate(mMaskTmp, stat=istat)

   if (istat > 0) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error deallocating temp mask')
      return
   endif

!-----------------------------------------------------------------------
!
!  setup preconditioner if required
!
!-----------------------------------------------------------------------

   if (usePreconditioner) then

     if (trim(preconditionerChoice) == precondChoiceFile) then
       call POP_ErrorSet(errorCode, &
          'POP_SolversInit: preconditioner not supported')
       return

       allocate(                                                   &
          precondCenter (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondNorth  (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondSouth  (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondEast   (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondWest   (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondNE     (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondSE     (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondNW     (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          precondSW     (POP_nxBlock,POP_nyBlock,numBlocksTropic), &
          stat = istat)

       if (istat > 0) then
          call POP_ErrorSet(errorCode, &
             'POP_SolversInit: error allocating preconditioner')
          return
       endif

!****
!**** OLD CODE FOR READING PRECONDITIONER - NEEDS TO BE REVISED
!****
!     if (POP_myTask == POP_masterTask) then
!       write(stdout,*) ' Preconditioner read from file: ', &
!                         trim(precond_file)
!     endif
!
!     allocate(WORKS (POP_nxBlock,POP_nyBlock,nblocks_clinic), &
!              WORKW (POP_nxBlock,POP_nyBlock,nblocks_clinic), &
!              WORKSE(POP_nxBlock,POP_nyBlock,nblocks_clinic), &
!              WORKSW(POP_nxBlock,POP_nyBlock,nblocks_clinic))
!
!     allocate(icheck(nblocks_clinic))
!
!-----------------------------------------------------------------------
!
!    read preconditioner and check that it is consistent with
!    KMU field
!
!-----------------------------------------------------------------------
!
!     call open_parallel_file(nu,precond_file,recl_dbl)
!     call read_array(nu,workC)
!     call read_array(nu,workN)
!     call read_array(nu,WORKS)
!     call read_array(nu,workE)
!     call read_array(nu,WORKW)
!     call read_array(nu,workNE)
!     call read_array(nu,workNW)
!     call read_array(nu,WORKSE)
!     call read_array(nu,WORKSW)
!     call close_parallel_file(nu)
!
!     if (POP_myTask == POP_masterTask) then
!       write(stdout,blank_fmt)
!       write(stdout,*) ' file read: ', trim(precond_file)
!     endif
!
!-----------------------------------------------------------------------
!
!    check that PC is consistent with KMU field
!
!-----------------------------------------------------------------------
!
!     do iblock = 1,nblocks_clinic
!
!       icheck(iblock) = 0
!
!       do j=1,POP_nyBlock
!       do i=1,POP_nxBlock
!
!         mlandne = .false.
!         mlandnw = .false.
!         mlandse = .false.
!         mlandsw = .false.
!         if (KMU(i  ,j  ,iblock) == 0) mlandne = .true.
!         if (KMU(i-1,j  ,iblock) == 0) mlandnw = .true.
!         if (KMU(i  ,j-1,iblock) == 0) mlandse = .true.
!         if (KMU(i-1,j-1,iblock) == 0) mlandsw = .true.
!
!         if (mlandne .and. workNE(i,j,iblock) /= 0.0_POP_r8)  &
!                           icheck(iblock) = icheck(iblock) + 1
!
!         if (mlandnw .and. workNW(i,j,iblock) /= 0.0_POP_r8)  &
!                           icheck(iblock) = icheck(iblock) + 1
!
!         if (mlandse .and. WORKSE(i,j,iblock) /= 0.0_POP_r8)  &
!                           icheck(iblock) = icheck(iblock) + 1
!
!         if (mlandsw .and. WORKSW(i,j,iblock) /= 0.0_POP_r8)  &
!                           icheck(iblock) = icheck(iblock) + 1
!      
!         if (mlandne .and. mlandnw .and. (workN(i,j,iblock) /= 0.0_POP_r8)) &
!                           icheck(iblock) = icheck(iblock) + 1
!         if (mlandne .and. mlandse .and. (workE(i,j,iblock) /= 0.0_POP_r8)) &
!                           icheck(iblock) = icheck(iblock) + 1
!         if (mlandnw .and. mlandsw .and. (WORKW(i,j,iblock) /= 0.0_POP_r8)) &
!                           icheck(iblock) = icheck(iblock) + 1
!         if (mlandse .and. mlandsw .and. (WORKS(i,j,iblock) /= 0.0_POP_r8)) &
!                           icheck(iblock) = icheck(iblock) + 1
!         if (mlandne .and. mlandse .and.                            &
!             mlandnw .and. mlandsw .and. (workC(i,j,iblock) /= 0.0_POP_r8)) &
!                           icheck(iblock) = icheck(iblock) + 1
!       end do
!       end do
!     end do
!
!     ncheck = sum(icheck)
!     if (POP_GlobalSum(ncheck, POP_distrbClinic, errorCode) /= 0) then
!        call POP_ErrorSet(errorCode, &
!           'POP_SolversInit: PC and KMU are incompatible')
!        return
!     endif
!
!     deallocate(icheck)
!
!     call redistribute_blocks(precondCenter ,distrb_tropic,workC ,distrb_clinic)
!     call redistribute_blocks(precondNorth ,distrb_tropic,workN ,distrb_clinic)
!     call redistribute_blocks(precondEast ,distrb_tropic,workE ,distrb_clinic)
!     call redistribute_blocks(precondSouth ,distrb_tropic,WORKS ,distrb_clinic)
!     call redistribute_blocks(precondWest ,distrb_tropic,WORKW ,distrb_clinic)
!     call redistribute_blocks(precondNE,distrb_tropic,workNE,distrb_clinic)
!     call redistribute_blocks(precondNW,distrb_tropic,workNW,distrb_clinic)
!     call redistribute_blocks(precondSE,distrb_tropic,WORKSE,distrb_clinic)
!     call redistribute_blocks(precondSW,distrb_tropic,WORKSW,distrb_clinic)
!
!     deallocate(WORKS, WORKW, workNW, WORKSE, WORKSW)
!
     else if (trim(preconditionerChoice) == precondChoiceEvp) then


         call EvpBlockPartition(POP_nxBlock-2,EvpXbs,EvpXnb,EvpXbidx,errorCode)
         call EvpBlockPartition(POP_nyBlock-2,EvpYbs,EvpYnb,EvpYbidx,errorCode)
         
         if (POP_myTask == POP_masterTask) then
           write(POP_stdout,'(a35)') 'EVP block distribution'
           write(POP_stdout,*) EvpXbidx(:)
           write(POP_stdout,*) EvpYbidx(:)
         endif

         lEvp  = EvpXbs + EvpYbs -1
         nbEvp = EvpXnb*EvpYnb
         allocate(landIndx   (EvpXnb,EvpYnb,numBlocksTropic)          ,&         
                  EvpRinv    (levp,levp,EvpXnb*EvpYnb,numBlocksTropic),&
                  EvpNeWgt   (EvpXbs+2,EvpYbs+2,nbEvp,numBlocksTropic),&
                  InvEvpNeWgt(EvpXbs+2,EvpYbs+2,nbEvp,numBlocksTropic),&
                 EvpCenterWgt(EvpXbs+2,EvpYbs+2,nbEvp,numBlocksTropic),&
              InvEvpCenterWgt(EvpXbs+2,EvpYbs+2,nbEvp,numBlocksTropic),&
           stat = istat )

         if (istat > 0) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversInit: error allocating EVP arrays')
            return
         endif
     endif 
   endif

!-----------------------------------------------------------------------
!
!  clean up temporary arrays
!
!-----------------------------------------------------------------------

   deallocate(work0, workNorth, workEast, workNE, stat=istat)

   if (istat > 0) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversInit: error deallocating temp mask')
      return
   endif

!-----------------------------------------------------------------------
!EOC

 end subroutine POP_SolversInit

!***********************************************************************
!BOP
! !IROUTINE: POP_SolversDiagonal
! !INTERFACE:

 subroutine POP_SolversDiagonal(diagonalCorrection, blockIndx, errorCode)

! !DESCRIPTION:
!  This routine corrects the center barotropic operator by 
!  subtracting off the time dependent diagnonal term computed in
!  the barotropic driver.
!
! !REVISION HISTORY:
!  same as module

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:), intent(in) :: &
      diagonalCorrection   ! time dependent diagonal term

   integer (POP_i4), intent(in) :: &
      blockIndx            ! local block index

! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode         ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  subtract the time dependent diagonal term from the center operator
!  coefficient
!
!-----------------------------------------------------------------------

   errorCode = POP_Success

   centerWgtClinic(:,:,blockIndx) =                               &
                            centerWgtClinicIndep(:,:,blockIndx) - &
                              diagonalCorrection(:,:)
                     
!-----------------------------------------------------------------------
!EOC

 end subroutine POP_SolversDiagonal

!***********************************************************************
!BOP
! !IROUTINE: POP_SolversGetDiagnostics
! !INTERFACE:

 subroutine POP_SolversGetDiagnostics(iterationCount, residual, &
                                      errorCode)

! !DESCRIPTION:
!  This routine returns the latest iteration count and residual
!  for diagnostic purposes.
!
! !REVISION HISTORY:
!  same as module

! !OUTPUT PARAMETERS:

   real (POP_r8), intent(out) :: &
      residual          ! latest residual from solver

   integer (POP_i4), intent(out) :: &
      iterationCount,  &! latest iteration count from solver
      errorCode         ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  return the diagnostic quantities
!
!-----------------------------------------------------------------------

   errorCode = POP_Success

   residual = rmsResidual
   iterationCount = numIterations

!-----------------------------------------------------------------------
!EOC

 end subroutine POP_SolversGetDiagnostics

!***********************************************************************
!BOP
! !IROUTINE: pcg
! !INTERFACE:

 subroutine pcg(X,B,errorCode)

! !DESCRIPTION:
!  This routine uses a preconditioned conjugate-gradient solver to
!  solve the equation $Ax=b$.  Both the operator $A$ and preconditioner
!  are nine-point stencils. If no preconditioner has been supplied,
!  a diagonal preconditioner is applied.  Convergence is checked
!  every {\em ncheck} steps.
!
! !REVISION HISTORY:
!  same as module

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: &
      B                         ! right hand side of linear system

! !INPUT/OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(inout) :: &
      X                  ! on input,  an initial guess for the solution
                         ! on output, solution of the linear system

! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode          ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      i,j,m,           &! local iteration counter
      nx, ny,          &! horizontal extents
      numBlocks,       &! number of local blocks
      iblock            ! local block     counter

   real (POP_r8) ::          &
      eta0,eta1,rr        ! scalar inner product results

   real (POP_r8), dimension(size(X,dim=1),size(X,dim=2), &
                                          size(X,dim=3)) :: &
      R,                 &! residual (b-Ax)
      S,                 &! conjugate direction vector
      Q,work0,work1       ! various cg intermediate results

!-----------------------------------------------------------------------
!
!  compute initial residual and initialize S
!
!-----------------------------------------------------------------------

   errorCode = POP_Success

   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numLocalBlocks = numBlocks)

   nx = size(X,dim=1)
   ny = size(X,dim=2)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversPCG: error retrieving local block count')
      return
   endif

   !$OMP PARALLEL DO PRIVATE(iblock,i,j)

   do iblock=1,numBlocks
      call btropOperator(S,X,iblock)
      do j=1,ny
      do i=1,nx
         R(i,j,iblock) = B(i,j,iblock) - S(i,j,iblock)
         S(i,j,iblock) = 0.0_POP_r8
      end do
      end do
   end do ! block loop

   !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!  initialize fields and scalars
!
!-----------------------------------------------------------------------

   call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                       POP_fieldKindScalar, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversPCG: error updating initial residual halo')
      return
   endif

   eta0 =1.0_POP_r8 
   numIterations = maxIterations
 
!-----------------------------------------------------------------------
!
!  iterate
!
!-----------------------------------------------------------------------

   iterationLoop: do m = 1, maxIterations

!-----------------------------------------------------------------------
!
!     calculate (PC)r 
!     diagonal preconditioner if preconditioner not specified
!
!-----------------------------------------------------------------------

      !$OMP PARALLEL DO PRIVATE(iblock,i,j)

      do iblock=1,numBlocks

         if (usePreconditioner) then
            call preconditioner(work1,R,iblock)
         else
            do j=1,ny
            do i=1,nx
               if (btropWgtCenter(i,j,iblock) /= 0.0_POP_r8) then
                  work1(i,j,iblock) = R(i,j,iblock)/ &
                                      btropWgtCenter(i,j,iblock)
               else
                  work1(i,j,iblock) = 0.0_POP_r8
               endif
            end do
            end do
         endif

         do j=1,ny
         do i=1,nx
            work0(i,j,iblock) = R(i,j,iblock)*work1(i,j,iblock)
         end do
         end do
      end do ! block loop

      !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!     update conjugate direction vector s
!
!-----------------------------------------------------------------------

      if (usePreconditioner) then
         call POP_HaloUpdate(work1, POP_haloTropic, &
                             POP_gridHorzLocCenter, &
                             POP_fieldKindScalar, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversPCG: error updating halo for preconditioner')
            return
         endif
      endif

      !*** (r,(PC)r)
      eta1 = POP_GlobalSum(work0, POP_distrbTropic, &
                           POP_gridHorzLocCenter,   &
                           errorCode, mMask = mMaskTropic)

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversPCG: error in initial dot product')
         return
      endif

      !$OMP PARALLEL DO PRIVATE(iblock,i,j)

      do iblock=1,numBlocks

         do j=1,ny
         do i=1,nx
            S(i,j,iblock) = work1(i,j,iblock) + &
                                S(i,j,iblock)*(eta1/eta0) 
         end do
         end do

!-----------------------------------------------------------------------
!
!        compute As
!
!-----------------------------------------------------------------------

         call btropOperator(Q,S,iblock)
         do j=1,ny
         do i=1,nx
            work0(i,j,iblock) = Q(i,j,iblock)*S(i,j,iblock)
         end do
         end do

      end do ! block loop

      !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!     compute next solution and residual
!
!-----------------------------------------------------------------------

      call POP_HaloUpdate(Q, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversPCG: error updating Q halo')
         return
      endif

      eta0 = eta1
      eta1 = eta0/POP_GlobalSum(work0, POP_distrbTropic,          &
                                POP_gridHorzLocCenter, errorCode, &
                                mMask = mMaskTropic)

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversPCG: error in dot product')
         return
      endif

      !$OMP PARALLEL DO PRIVATE(iblock,i,j)

      do iblock=1,numBlocks

         do j=1,ny
         do i=1,nx
            X(i,j,iblock) = X(i,j,iblock) + eta1*S(i,j,iblock)
            R(i,j,iblock) = R(i,j,iblock) - eta1*Q(i,j,iblock)
         end do
         end do

         if (mod(m,convergenceCheckFreq) == 0) then

            call btropOperator(R,X,iblock)
            do j=1,ny
            do i=1,nx
               R(i,j,iblock) = B(i,j,iblock) - R(i,j,iblock)
               work0(i,j,iblock) = R(i,j,iblock)*R(i,j,iblock)
            end do
            end do
         endif
      end do ! block loop

      !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!     test for convergence
!
!-----------------------------------------------------------------------

      if (mod(m,convergenceCheckFreq) == 0) then

         call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                             POP_fieldKindScalar, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversPCG: error updating residual halo in convrg')
            return
         endif

         rr = POP_GlobalSum(work0, POP_distrbTropic, &
                            POP_gridHorzLocCenter,   &
                            errorCode, mMask = mMaskTropic)   ! (r,r)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversPCG: error computing convergence dot prod')
            return
         endif

         if (rr < convergenceCriterion) then
            numIterations = m
            exit iterationLoop
         endif

      endif

   enddo iterationLoop

   rmsResidual = sqrt(rr*residualNorm)

   if (numIterations == maxIterations) then
      if (convergenceCriterion /= 0.0_POP_r8) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversPCG: solver not converged')
         return
      endif
   endif

!-----------------------------------------------------------------------
!EOC

 end subroutine pcg

!***********************************************************************
!BOP
! !IROUTINE: PCSI
! !INTERFACE:

 subroutine PCSI(X,B,errorCode)  

! !DESCRIPTION:
!  This routine implements the Preconditioned Classical Stiefel Iteration 
!  (PCSI)  solver for solving the linear system $Ax=b$.
!  It uses the two extreme eigenvalues of $A$ instead of the norm of
!  residual. Thus, PCSI eliminates global reductions in each iteration.
!  The eigenvalues are estimated by routine PcsiLanczos.
!  PCSI supports all kinds of preconditioners supported by PCG/ChronGear, 
!  including diagonal and EVP preconditioning.
!
!  References:
!     Stiefel, E. L. (1958). Kernel polynomial in linear algebra and their
!        numerical applications, in: Further contributions to the 
!        determination of eigenvalues. NBS Applied Math. Ser., 49, 1-22.
!     Hu, Y., Huang, X., Wang, X., Fu, H., Xu, S., Ruan, H., Xue, W. and Yang, G. (2013). 
!        A scalable barotropic mode solver for the parallel ocean program. 
!        In Euro-Par 2013  Parallel Processing (pp. 739-750) Springer Berlin Heidelberg.
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu et al., Tsinghua University

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: &
      B                         ! right hand side of linear system

! !INPUT/OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(inout) :: &
      X                  ! on input,  an initial guess for the solution
                         ! on output, solution of the linear system

! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode          ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      i,j,m,           &! local iteration counter
      nx, ny,          &! horizontal dimensions
      numBlocks,       &! number of local blocks
      iblock            ! local block counter


   real (POP_r8) ::          &
      csalpha,csbeta,csy,csomga,rr ! scalar inner product results
   
   real (POP_r8), dimension(size(X,dim=1),size(X,dim=2), &
                                          size(X,dim=3)) :: &
      R,                 &! residual (b-Ax)
      S,                 &! conjugate direction vector
      Q,work0,work1,     &! various cg intermediate results
      A0R


   errorCode = POP_Success

   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numLocalBlocks = numBlocks)

   nx = size(X,dim=1)
   ny = size(X,dim=2)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversPCSI: error retrieving local block count')
      return
   endif

   if (.not. usePreconditioner) then
   
      !--- diagonal preconditioner if preconditioner not specified
      !$OMP PARALLEL DO PRIVATE(iblock)
      do iblock=1,numBlocks
         do j=1,ny
         do i=1,nx
            if (btropWgtCenter(i,j,iblock) /= 0.0_POP_r8) then
               A0R(i,j,iblock) = 1.0_POP_r8 /btropWgtCenter(i,j,iblock)
            else
               A0R(i,j,iblock) = 0.0_POP_r8
            endif
         end do
         end do
      end do 
      !$OMP END PARALLEL DO
   endif


!-----------------------------------------------------------------------
!
!  step 1 : compute iteration parameters by eigenvalues
!  $\alpha =\frac{2}{\mu -\nu}$, $ \beta = \frac{\mu +\nu}{\mu -\nu}$,
!  $\gamma = \frac{\beta}{\alpha}$, $\omega_0 =\frac{ 2}{\gamma}$
!
!-----------------------------------------------------------------------
    
    csalpha = 2.0_POP_r8/(PcsiMaxEigs-PcsiMinEigs)
    csbeta = (PcsiMaxEigs+PcsiMinEigs)/(PcsiMaxEigs-PcsiMinEigs)
    csy = csbeta/csalpha
    csomga = 2.0_POP_r8/csy
    
!-----------------------------------------------------------------------
!
! step 2 : compute initial residual and initialize X
! $\textbf{r}_0 = \textbf{b}-\textbf{B}\textbf{x}_0$; 
! $\textbf{x}_1 =\textbf{x}_0 -\gamma^{-1}\textbf{M}^{-1}\textbf{r}_0$; 
! $\textbf{r}_1 =\textbf{b} -\textbf{B}\textbf{x}_1$; 
!
!-----------------------------------------------------------------------
    
    !$OMP PARALLEL DO PRIVATE(iblock,i,j)
    do iblock=1,numBlocks

        call btropOperator(S,X,iblock)
        do j=1,ny
        do i=1,nx
            R(i,j,iblock) = B(i,j,iblock) - S(i,j,iblock)
        end do 
        end do 
    end do ! block loop
   !$OMP END PARALLEL DO


    !$OMP PARALLEL DO PRIVATE(iblock,i,j)
    do iblock=1,numBlocks
        if (usePreconditioner) then
            call preconditioner(work1,R,iblock)
            do j=1,ny
            do i=1,nx
                R(i,j,iblock) = work1(i,j,iblock)
            end do 
            end do 
        else
            do j=1,ny
            do i=1,nx
                R(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
            end do 
            end do 
        endif

        do j=1,ny
        do i=1,nx
            Q(i,j,iblock) = (1.0_POP_r8/csy)*R(i,j,iblock)
        end do
        end do
    
    end do ! block loop
   !$OMP END PARALLEL DO


    call POP_HaloUpdate(Q, POP_haloTropic, POP_gridHorzLocCenter, &
      POP_fieldKindScalar, errorCode)

    if (errorCode /= POP_Success) then
       call POP_ErrorSet(errorCode, &
          'POP_SolversPCSI: error updating Q halo')
       return
    endif

    !$OMP PARALLEL DO PRIVATE(iblock,i,j)
    do iblock=1,numBlocks
      do j=1,ny
      do i=1,nx
        X(i,j,iblock) = X(i,j,iblock) + Q(i,j,iblock)
      end do
      end do
    
      call btropOperator(S,X,iblock)
    
      do j=1,ny
      do i=1,nx
          R(i,j,iblock) = B(i,j,iblock) - S(i,j,iblock)
      end do
      end do
    
    end do ! block loop
    !$OMP END PARALLEL DO
    
    
    call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
         POP_fieldKindScalar, errorCode)
    
    if (errorCode /= POP_Success) then
        call POP_ErrorSet(errorCode, &
        'POP_SolversPCSI: error updating initial residual halo')
        return
    endif
    
    numIterations = maxIterations
    
    !-----------------------------------------------------------------------
    !
    !  iterate
    !
    !-----------------------------------------------------------------------
    
    iterationLoop: do m = 1, maxIterations


!-----------------------------------------------------------------------
!
! step 3 : update iteration parameter 
! $\omega_k = 1/(\gamma - \frac{1}{4\alpha^2}\omega_{k-1})$
!
!-----------------------------------------------------------------------
    csomga = 1.0_POP_r8/(csy-csomga/(4.0_POP_r8*csalpha*csalpha))

!-----------------------------------------------------------------------
!
! step 4 : preconditioning 
! $\textbf{r}'_{k-1} =\textbf{M}^{-1}\textbf{r}_{k-1}$
!
!-----------------------------------------------------------------------

    !$OMP PARALLEL DO PRIVATE(iblock)
    do iblock=1,numBlocks
      if (usePreconditioner) then
          call preconditioner(work1,R,iblock)
          do j=1,ny
          do i=1,nx
              R(i,j,iblock) = work1(i,j,iblock)
          end do 
          end do 
      else
          do j=1,ny
          do i=1,nx
                R(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
          end do 
          end do
      endif 
    end do ! block loop
    !$OMP END PARALLEL DO
    

    !!! assume that halosize = 2
    call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                           POP_fieldKindScalar, errorCode)

    if (errorCode /= POP_Success) then
        call POP_ErrorSet(errorCode, &
        'POP_SolversPCSI: error updating R halo')
        return
    endif

!-----------------------------------------------------------------------
!
! step 5 : compute X increment and update X and R
! $\Delta \textbf{x}_{k} =\omega_k\textbf{r}'_{k-1}+(\gamma \omega_k-1)\Delta \textbf{x}_{k-1}$; 
! $\textbf{x}_{k} =\textbf{x}_{k-1}+\Delta \textbf{x}_{k-1}$;  
! $\textbf{r}_{k} =b- \textbf{B}\textbf{x}_{k}$;
!
!-----------------------------------------------------------------------

    !$OMP PARALLEL DO PRIVATE(iblock)
    do iblock=1,numBlocks
    
      do j=1,ny
      do i=1,nx
        Q(i,j,iblock) = csomga*R(i,j,iblock)+ (csy*csomga-1.0_POP_r8)*Q(i,j,iblock)
        X(i,j,iblock) = X(i,j,iblock) + Q(i,j,iblock)
      end do
      end do
    
      call btropOperator(S,X,iblock)
    
      do j=1,ny
      do i=1,nx
        R(i,j,iblock) = B(i,j,iblock) - S(i,j,iblock)
      end do 
      end do
    
      if ((mod(m,convergenceCheckFreq) == 0) .and. (m.ge.convergenceCheckStart)) then
          do j=1,ny
          do i=1,nx
          WORK0(i,j,iblock) = R(i,j,iblock)*R(i,j,iblock)
          end do 
          end do
      endif
    end do ! block loop
    !$OMP END PARALLEL DO
    
    !-----------------------------------------------------------------------
    !
    !     test for convergence
    !
    !-----------------------------------------------------------------------
    
   if ((mod(m,convergenceCheckFreq) == 0) .and. (m.ge.convergenceCheckStart)) then
    
        rr = POP_GlobalSum(work0, POP_distrbTropic, &
                           POP_gridHorzLocCenter,   &
                           errorCode, mMask = mMaskTropic)   ! (r,r)
    
        if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
            'POP_SolversPCSI: error computing convergence dot prod')
            return
        endif
    
        if (rr < convergenceCriterion) then
            numIterations = m
            exit iterationLoop
        endif
    
    endif
    
    enddo iterationLoop
    
    rmsResidual = sqrt(rr*residualNorm)

!-----------------------------------------------------------------------
!EOC

 end subroutine PCSI
!***********************************************************************
!BOP
! !IROUTINE: ChronGear
! !INTERFACE:

 subroutine ChronGear(X,B,errorCode)

! !DESCRIPTION:
!  This routine implements the Chronopoulos-Gear conjugate-gradient 
!  solver with preconditioner for solving the linear system $Ax=b$.
!  It is a rearranged conjugate gradient solver that reduces the 
!  number of inner products per iteration from two to one (not 
!  counting convergence check). Both the operator $A$ and 
!  preconditioner are nine-point stencils. If no preconditioner has 
!  been supplied, a diagonal preconditioner is applied.  Convergence 
!  is checked every {\em ncheck} steps.
!
!
!  References:
!     Dongarra, J. and V. Eijkhout. LAPACK Working Note 159.
!        Finite-choice algorithm optimization in conjugate gradients.
!        Tech. Rep. ut-cs-03-502. Computer Science Department.
!        University of Tennessee, Knoxville. 2003.
!
!     D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working
!        Note 56. Conjugate gradient algorithms with reduced
!        synchronization overhead on distributed memory multiprocessors.
!        Tech. Rep. CS-93-185.  Computer Science Department.
!        University of Tennessee, Knoxville. 1993.
!
! !REVISION HISTORY:
!  this routine implemented by Frank Bryan et al., NCAR

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: &
      B                         ! right hand side of linear system

! !INPUT/OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(inout) :: &
      X                  ! on input,  an initial guess for the solution
                         ! on output, solution of the linear system

! !OUTPUT PARAMETERS:

   integer (POP_i4), intent(out) :: &
      errorCode          ! returned error code

!EOP
!BOC
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      i,j,m,           &! local iteration counter
      nx, ny,          &! horizontal extents
      numBlocks,       &! number of local blocks
      iblock            ! local block counter

   real (POP_r8) :: & ! scalar results
      cgAlpha, cgBeta, cgSigma, cgDelta, cgRhoOld, cgRho, rr

   real (POP_r8), dimension(size(X,dim=1),size(X,dim=2),    &
                                          size(X,dim=3)) :: &
      R,                  &! residual (b-Ax)
      S,                  &! conjugate direction vector
      Q,Z,AZ,WORK0,       &! various cg intermediate results
      A0R                  ! diagonal preconditioner

   real (POP_r8), dimension(size(X,dim=1),size(X,dim=2),    &
                                        2,size(X,dim=3)) :: & 
      WORKN              ! WORK array 

   real (POP_r8), dimension(2) :: &
      sumN               ! global sum results for multiple arrays

!-----------------------------------------------------------------------
!
!  initialize some scalars
!
!-----------------------------------------------------------------------

   errorCode = POP_Success


   call POP_DistributionGet(POP_distrbTropic, errorCode, &
                            numLocalBlocks = numBlocks)


   nx = size(X,dim=1)
   ny = size(X,dim=2)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversChronGear: error retrieving local block count')
      return
   endif

   cgRho = 1.0_POP_r8
   numIterations = maxIterations


!-----------------------------------------------------------------------
!
!  compute initial residual and initialize other arrays
!
!-----------------------------------------------------------------------

   !$OMP PARALLEL DO PRIVATE(iblock,i,j)

   do iblock=1,numBlocks

      do j=1,ny
      do i=1,nx
         R    (i,j,iblock)   = 0.0_POP_r8
         S    (i,j,iblock)   = 0.0_POP_r8
         Z    (i,j,iblock)   = 0.0_POP_r8
         Q    (i,j,iblock)   = 0.0_POP_r8
         AZ   (i,j,iblock)   = 0.0_POP_r8
         WORK0(i,j,iblock)   = 0.0_POP_r8
         WORKN(i,j,1,iblock) = 0.0_POP_r8
         WORKN(i,j,2,iblock) = 0.0_POP_r8
      end do
      end do

      if (.not. usePreconditioner) then
     
         !--- diagonal preconditioner if preconditioner not specified
         do j=1,ny
         do i=1,nx
            if (btropWgtCenter(i,j,iblock) /= 0.0_POP_r8) then
               A0R(i,j,iblock) = 1.0_POP_r8/btropWgtCenter(i,j,iblock)
            else
               A0R(i,j,iblock) = 0.0_POP_r8
            endif
         end do
         end do
      endif

     
      ! use S as a temp here for Ax

      call btropOperator(S,X,iblock)
      do j=1,ny
      do i=1,nx
         R(i,j,iblock) = B(i,j,iblock) - S(i,j,iblock) ! b-Ax
      end do
      end do
   end do ! block loop

   !$OMP END PARALLEL DO

   call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)

   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversChronGear: error updating initial residual halo')
      return
   endif

!-----------------------------------------------------------------------
!
!    take one pass of standard algorithm
!
!-----------------------------------------------------------------------

   !$OMP PARALLEL DO PRIVATE(iblock,i,j)

   do iblock=1,numBlocks

      !---- calculate (PC)r store in Z
      if (usePreconditioner) then
         call preconditioner(Z,R,iblock)
      else   ! use diagonal preconditioner
         do j=1,ny
         do i=1,nx
            Z(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
         end do
         end do
      endif

   end do
   !$OMP END PARALLEL DO


   if (usePreconditioner) then
     call POP_HaloUpdate(Z, POP_haloTropic, POP_gridHorzLocCenter, &
                            POP_fieldKindScalar, errorCode)

     if (errorCode /= POP_Success) then
        call POP_ErrorSet(errorCode, &
           'POP_SolversChronGear: error updating Z halo')
        return
     endif
   endif


   !$OMP PARALLEL DO PRIVATE(iblock,i,j)
   do iblock=1,numBlocks
      !---- Compute intermediate result for dot product
      !---- update conjugate direction vector S
      do j=1,ny
      do i=1,nx
         WORKN(i,j,1,iblock) = R(i,j,iblock)*Z(i,j,iblock)
         S(i,j,iblock) =  Z(i,j,iblock)
      end do
      end do

      !---- compute Q = A * S
      call btropOperator(Q,S,iblock)

      !---- compute intermediate result for dot product
      do j=1,ny
      do i=1,nx
         WORKN(i,j,2,iblock) = S(i,j,iblock)*Q(i,j,iblock)
      end do
      end do

   end do
   !$OMP END PARALLEL DO

   call POP_HaloUpdate(Q, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversChronGear: error updating Q halo')
      return
   endif


   !---- Form dot products
   sumN = POP_GlobalSum(WORKN, POP_distrbTropic,        &
                               POP_gridHorzLocCenter,   &
                               errorCode, mMask = mMaskTropic)


   if (errorCode /= POP_Success) then
      call POP_ErrorSet(errorCode, &
         'POP_SolversChronGear: error in initial dot products')
      return
   endif

   cgRhoOld = sumN(1) !(r,PCr)
   cgSigma  = sumN(2) !(s,As)
   cgAlpha  = cgRhoOld/cgSigma

   !---- compute first solution and residual
   !$OMP PARALLEL DO PRIVATE(iblock,i,j)
   do iblock=1,numBlocks

      do j=1,ny
      do i=1,nx
         X(i,j,iblock) = X(i,j,iblock) + cgAlpha*S(i,j,iblock)
         R(i,j,iblock) = R(i,j,iblock) - cgAlpha*Q(i,j,iblock)
      end do
      end do

   end do
   !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!     iterate
!
!-----------------------------------------------------------------------

   iterationLoop: do m = 1, maxIterations

!-----------------------------------------------------------------------
!
!     calculate (PC)r and A*(Pc)r
!
!-----------------------------------------------------------------------

      !$OMP PARALLEL DO PRIVATE(iblock,i,j)
      do iblock=1,numBlocks

         if (usePreconditioner) then
            call preconditioner(Z,R,iblock)
         else
            do j=1,ny
            do i=1,nx
               Z(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
            end do
            end do
         endif
      end do
      !$OMP END PARALLEL DO


      ! Here we move the haloupdating forward in order to use EVP preconditioning
      ! Based on the assumption that halo_size = 2. Otherwise, its not correct
      call POP_HaloUpdate(Z, POP_haloTropic, POP_gridHorzLocCenter, &
                             POP_fieldKindScalar, errorCode)

      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversChronGear: error updating Z halo')
         return
      endif

      !$OMP PARALLEL DO PRIVATE(iblock,i,j)
      do iblock=1,numBlocks

        call btropOperator(AZ,Z,iblock)

         !--- intermediate results for inner products

         do j=1,ny
         do i=1,nx
            WORKN(i,j,1,iblock) =  R(i,j,iblock)*Z(i,j,iblock)
            WORKN(i,j,2,iblock) = AZ(i,j,iblock)*Z(i,j,iblock)
         end do
         end do

      end do
      !$OMP END PARALLEL DO


      sumN = POP_GlobalSum(WORKN, POP_distrbTropic,        &
                                  POP_gridHorzLocCenter,   &
                                  errorCode, mMask = mMaskTropic)


      if (errorCode /= POP_Success) then
         call POP_ErrorSet(errorCode, &
            'POP_SolversChronGear: error in dot products')
         return
      endif

      cgRho    = sumN(1)     ! (r,(PC)r)
      cgDelta  = sumN(2)     ! (A (PC)r,(PC)r)
      cgBeta   = cgRho/cgRhoOld
      cgSigma  = cgDelta - (cgBeta**2)*cgSigma
      cgAlpha  = cgRho/cgSigma
      cgRhoOld = cgRho

!-----------------------------------------------------------------------
!
!     compute S and Q
!     compute next solution and residual
!
!-----------------------------------------------------------------------

      !$OMP PARALLEL DO PRIVATE(iblock, i,j)
      do iblock=1,numBlocks

         do j=1,ny
         do i=1,nx
            S(i,j,iblock) =  Z(i,j,iblock) + cgBeta *S(i,j,iblock)
            Q(i,j,iblock) = AZ(i,j,iblock) + cgBeta *Q(i,j,iblock)
            X(i,j,iblock) =  X(i,j,iblock) + cgAlpha*S(i,j,iblock)
            R(i,j,iblock) =  R(i,j,iblock) - cgAlpha*Q(i,j,iblock)
         end do
         end do

         !--- recompute residual as b-Ax for convergence check
         if (mod(m,convergenceCheckFreq) == 0) then

            !--- Reset residual using r = b - Ax
            !--- (r,r) for norm of residual
            call btropOperator(Z,X,iblock)
            do j=1,ny
            do i=1,nx
               R(i,j,iblock) = B(i,j,iblock) - Z(i,j,iblock)
               WORK0(i,j,iblock) = R(i,j,iblock)*R(i,j,iblock)
            end do
            end do

         endif
      end do
      !$OMP END PARALLEL DO

!-----------------------------------------------------------------------
!
!     test for convergence if it is time
!
!-----------------------------------------------------------------------

      if (mod(m,convergenceCheckFreq) == 0) then

         !--- update ghost cells for next iteration
         call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                                POP_fieldKindScalar, errorCode)

         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversChronGear: error updating residual halo in convrg')
            return
         endif

         !--- residual norm for convergence 
         rr = POP_GlobalSum(work0, POP_distrbTropic,        &! (r,r)
                                   POP_gridHorzLocCenter,   &
                                   errorCode, mMask = mMaskTropic)


         if (errorCode /= POP_Success) then
            call POP_ErrorSet(errorCode, &
               'POP_SolversChronGear: error computing convergence dot prod')
            return
         endif

         if (rr < convergenceCriterion) then
            numIterations = m
            exit iterationLoop
         endif

      endif

   end do iterationLoop

   rmsResidual = sqrt(rr*residualNorm)

   if (numIterations == maxIterations) then
      if (convergenceCriterion /= 0.0_POP_r8) then

         call POP_ErrorSet(errorCode, &
            'POP_SolversChronGear: solver not converged')
         return
      endif
   endif

!-----------------------------------------------------------------------
!EOC

 end subroutine ChronGear

!***********************************************************************
!BOP
! !IROUTINE: preconditioner
! !INTERFACE:

 subroutine preconditioner(PX,X,bid)

! !DESCRIPTION:
!  This function applies a precomputed preconditioner as a nine-point
!  stencil operator.
!
! !REVISION HISTORY:
!  add EVP preconditioning interface by Yong Hu et al., Tsinghua University

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: & 
      X                     ! array to be operated on 

   integer (POP_i4), intent(in) :: &
      bid                    ! local block address for this block

! !OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(out) :: &
      PX                  ! nine point operator result

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------
   real (POP_r8), dimension(EvpXbs+2,EvpYbs+2,EvpXnb*EvpYnb):: &
      f               ! reshaped X

   integer (POP_i4) :: &
      i,j,js,je,is,ie,lm,ln,l,ib               ! dummy counters

!-----------------------------------------------------------------------

   PX(:,:,bid) = 0.0_POP_r8
   if (trim(preconditionerChoice) == precondChoiceFile) then
       do j=2,size(X,dim=2)-1
       do i=2,size(X,dim=1)-1
          PX(i,j,bid) = precondNE    (i,j,bid)*X(i+1,j+1,bid) + &
                        precondNW    (i,j,bid)*X(i-1,j+1,bid) + &
                        precondSE    (i,j,bid)*X(i+1,j-1,bid) + &
                        precondSW    (i,j,bid)*X(i-1,j-1,bid) + &
                        precondNorth (i,j,bid)*X(i  ,j+1,bid) + &
                        precondSouth (i,j,bid)*X(i  ,j-1,bid) + &
                        precondEast  (i,j,bid)*X(i+1,j  ,bid) + &
                        precondWest  (i,j,bid)*X(i-1,j  ,bid) + &
                        precondCenter(i,j,bid)*X(i  ,j  ,bid)
       end do
       end do
!-----------------------------------------------------------------------
!
!  EVP preconditioning 
!
!-----------------------------------------------------------------------

   else if (trim(preconditionerChoice) == precondChoiceEvp) then
       f = 0.0_POP_r8 
       do j = 1, EvpYnb
         js = EvpYbidx(j)
         je = EvpYbidx(j+1) +1
         lm = (je-js) +1
         do i = 1, EvpXnb
           is = EvpXbidx(i) 
           ie = EvpXbidx(i+1) +1
           ln = (ie-is) +1
           l  = ln + lm -5
           ib = (j-1)*EvpXnb+i
           f(2:ln-1,2:lm-1,ib) = X(is+1:ie-1,js+1:je-1,bid)
           if (landIndx(i,j,bid) == 1 ) then 

             ! diagonal preconditioning for blocks containing land potins
             PX(is+1:ie-1,js+1:je-1,bid) = & 
             f(2:ln-1,2:lm-1,ib)*InvEvpCenterWgt(2:ln-1,2:lm-1,ib,bid)

           else if (landIndx(i,j,bid) == 0 ) then 

             ! EVP solver on sub-blocks
             call ExplicitEvp(EvpCenterWgt(1:ln,1:lm,ib,bid),&
                  EvpNeWgt(1:ln,1:lm,ib,bid),InvEvpNeWgt(1:ln,1:lm,ib,bid),&
                  EvpRinv(1:l,1:l,ib,bid),PX(is+1:ie-1,js+1:je-1,bid),f(1:ln,1:lm,ib),ln,lm)

           else 
             write(POP_stdout,'(a35,3I5.3)') 'EVP Error: unpreconditioned block ',&
                                             i,j,landIndx(i,j,bid)
           endif 

         end do 
       end do 
  endif 

!-----------------------------------------------------------------------
!EOC

 end subroutine preconditioner

!***********************************************************************
!BOP
! !IROUTINE: btropOperator
! !INTERFACE:

 subroutine btropOperator(AX,X,bid)

! !DESCRIPTION:
!  This routine applies the nine-point stencil operator for the
!  barotropic solver.  It takes advantage of some 9pt weights being 
!  shifted versions of others.
!
! !REVISION HISTORY:
!  same as module

! !INPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(in) :: & 
      X                  ! array to be operated on 

   integer (POP_i4), intent(in) :: &
      bid                    ! local block address for this block

! !OUTPUT PARAMETERS:

   real (POP_r8), dimension(:,:,:), intent(out) :: &
      AX                     ! nine point operator result (Ax)

!EOP
!BOC
!-----------------------------------------------------------------------
!
!  local variables
!
!-----------------------------------------------------------------------

   integer (POP_i4) :: &
      i,j                ! dummy counters

!-----------------------------------------------------------------------

   AX(:,:,bid) = 0.0_POP_r8

   do j=2,size(X,dim=2)-1
   do i=2,size(X,dim=1)-1
      AX(i,j,bid) = btropWgtCenter(i  ,j  ,bid)*X(i  ,j  ,bid) + &
                    btropWgtNorth (i  ,j  ,bid)*X(i  ,j+1,bid) + &
                    btropWgtNorth (i  ,j-1,bid)*X(i  ,j-1,bid) + &
                    btropWgtEast  (i  ,j  ,bid)*X(i+1,j  ,bid) + &
                    btropWgtEast  (i-1,j  ,bid)*X(i-1,j  ,bid) + &
                    btropWgtNE    (i  ,j  ,bid)*X(i+1,j+1,bid) + &
                    btropWgtNE    (i  ,j-1,bid)*X(i+1,j-1,bid) + &
                    btropWgtNE    (i-1,j  ,bid)*X(i-1,j+1,bid) + &
                    btropWgtNE    (i-1,j-1,bid)*X(i-1,j-1,bid)
   end do
   end do

!-----------------------------------------------------------------------
!EOC

 end subroutine btropOperator


 subroutine EvpPre(cc,ne,evpcc,evpne,rinv,landIndx,n,m,nn,mm,ndi,mdi,nb,mb,bid)

! !DESCRIPTION:
! !prepare EVP preconditioning and save reshaped array for efficiency 
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University
! !INPUT/OUTPUT PARAMETERS:
   integer(POP_i4),intent(in) :: n,m  ! total block size
   integer(POP_i4),intent(in) :: nn,mm  ! small block ideal size
   real(POP_r8),dimension(n,m),intent(in) :: cc,ne
   integer(POP_i4),intent(in) :: nb, mb  ! blocks on x and y direction 
   integer(POP_i4),intent(in) :: ndi(nb+1),mdi(mb+1) ! bound index 

! !INPUT/OUTPUT PARAMETERS:
   real(POP_r8),dimension(nn+2,mm+2,nb*mb),intent(inout) :: evpcc,evpne
   real(POP_r8),dimension(nn+mm-1,nn+mm-1,nb*mb),intent(inout):: rinv
   integer(POP_i4),dimension(nb,mb),intent(inout):: landIndx
   integer(POP_i4),intent(in):: bid  ! bartropic blocks id

   ! LOCAL  VARIABLES
   integer (POP_i4) :: &
      is,js,ie,je,          &! start and end index for EVP sub block
      lm,ln,l,              &! EVP sub block size
      i,j,ib,               &! index of sub block
      iland                 ! index of sub block

   type (block) :: &
      this_block   ! block information for this block

   this_block = get_block(blocks_tropic(bid),bid)
   do j = 1, mb
     js = mdi(j)-1
     je = mdi(j+1) 
     lm = (je-js) +1
     do i = 1, nb
       is = ndi(i)-1
       ie = ndi(i+1)
       ln = (ie-is) +1
       ib = (j-1)*nb+i
       ! Reshape coefficents into EVP sub-blocks  for efficiency
       evpcc(1:ln,1:lm,ib) =  cc(is:ie,js:je)
       evpne(1:ln,1:lm,ib) =  ne(is:ie,js:je)

       iland = 0 
       if (minval(abs(ne(is+1:ie-1,js+1:je-1))) == 0.0_POP_r8) then 
         iland = 1
       endif 
       if (is+2 <this_block%ib .or. ie-1 > this_block%ie) then
         iland = 2
       endif 
       if (js+2 <this_block%jb .or. je-1 > this_block%je) then
         iland = 3
       endif 

       if( iland > 0 ) then 
         ! mark land sub-blocks
         landIndx(i,j) = 1
         rinv(:,:,ib) = 0.0_POP_r8
       else 
         ! EVP preprocessing ocean sub-blocks
         landIndx(i,j) = 0
         l  = ln + lm -5
         call ExplicitBlockEvpPre(cc(is:ie,js:je),ne(is:ie,js:je), &
                                  rinv(1:l,1:l,ib),ln,lm)
       endif 
     end do 
   end do 
   write(POP_stdout,'(a8,I5,a15,I5,a15,I5)') 'PROC',POP_myTask, & 
                    ' EVP blocks :',nb*mb, ' land blocks :', sum(landIndx)

  end subroutine EvpPre
  
  subroutine ExplicitBlockEvpPre(cc,ne,rinv,n,m)
! !DESCRIPTION:
!  This routine implements the preprocessing of explicit EVP method for
!  solve a nine point elliptic equation on sub blocks.
!  Four coefficients related to north, south, east and west are small compared 
!  with the other five coefficients, ignored here to reduce computation.
!
!  References:
!     Roache, P. J. (1995). Elliptic marching methods and domain
!        decomposition (Vol.5). CRC press.
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University
 
 ! !INPUT PARAMETERS:
   integer,intent(in) :: n,m                      !sub-block size [n,m]
   real(POP_r8),dimension(n,m),intent(in) :: cc   !center coefficients
   real(POP_r8),dimension(n,m),intent(in) :: ne   !four corners coefficients

 ! !OUTPUT PARAMETERS:
   real(POP_r8),dimension(n+m-5,n+m-5),intent(inout) :: rinv !preconditioning matrix

 ! LOCAL VARIABLES
   integer :: i,j,k,ii
   integer :: nm                          ! length of error and final vectors
   integer (POP_i4)::  errorCode          ! returned error code
   real(POP_r8),dimension(n,m) :: y       ! temporary array

   real(POP_r8),dimension(n +m -5,n +m -5) :: rin,WORK
   real(POP_r8):: maxvalr
   real(POP_r8),dimension(n +m -5) :: r

   nm = n +m -5
   y(:,:) = 0.0_POP_r8

   ! five points marching from initial error vectors (west)
   do ii = 1,m-2
     y(2,m-ii) = 1.0_POP_r8
     do j = 2, m-1
       do i = 2, n-1
         y(i+1,j+1)  = (- cc(i,j)     * y(i,j )     & 
                -  ne(i,j-1)   * y(i+1,j-1)  &
                -  ne(i-1,j)   * y(i-1,j+1)  & 
                -  ne(i-1,j-1) * y(i-1,j-1) ) /ne(i,j) 
       end do
     end do

     ! get the final error vector f
     do i = 1,n-2
       rin(ii,i) = -y(i+2,m)
     end do 

     do j = 1,m-3
       rin(ii,n-2+j) = -y(n,m-j)
     end do 

     y(2,m-ii) = 0.0_POP_r8
   end do 
   
   ! five points marching from inital error vectors (south)
   do ii = 1,n-3
     y(ii+2,2) = 1.0_POP_r8
     do j = 2, m-1
       do i = 2, n-1
         y(i+1,j+1)  = (- cc(i,j)     * y(i,j )     & 
                -  ne(i,j-1)   * y(i+1,j-1)  &
                -  ne(i-1,j)   * y(i-1,j+1)  & 
                -  ne(i-1,j-1) * y(i-1,j-1) ) /ne(i,j) 
       end do
     end do
     
     ! get the final error vector f
     do i = 1,n-2
       rin(m-2+ii,i) = -y(i+2,m)
     end do 

     do j = 1,m-3
       rin(m-2+ii,n-2+j) = -y(n,m-j)
     end do 

     y(ii+2,2) = 0.0_POP_r8
   end do 

   WORK(:,:) = rin(:,:)
   call inverse(WORK,rinv,nm)

   !! check pre rinv 
   WORK(:,:) = 0.0_POP_r8
   do j = 1,nm
     do i = 1,nm
       do k = 1,nm
         WORK(i,j) = WORK(i,j) + rinv(i,k)*rin(k,j)
       end do 
       if (i == j ) then 
         WORK(i,j) = WORK(i,j) -1.0
       endif 
     end do 
   end do 
   maxvalr = maxval(abs(WORK))
   !write(POP_stdout,*) "MAXVAL(RINV*RIN) = ", maxvalr
   if (maxvalr > 1.0e-8 ) then 
     write(POP_stdout,*) 'maxvalr ', maxvalr
     call POP_ErrorSet(errorCode, &
          'POP_EXPLICITPRE: error in computing the inverse, error > 1.0e-8 ;&
           &Check EVP sub-block size!')
     return
   endif 

  end subroutine 

  subroutine ExplicitEvp(cc,ne,ine,rinv,tu,f,n,m)
! !DESCRIPTION:
!  This routine implements the EVP method to explicitly solve 
!  a nine point elliptic equation on sub blocks.
!  Four coefficients related to north, south, east and west are 
!  small compared with the rest five coefficients, thus ignored 
!  here to reduce computation
!
!  References:
!     Roache, P. J. (1995). Elliptic marching methods and domain 
!        decomposition (Vol.5). CRC press.
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University
 
 ! !INPUT PARAMETERS:
   integer(POP_i4),intent(in) :: n,m                    !sub-block size [n,m]
   real(POP_r8),dimension(n,m),intent(in) :: cc,&! center weight
                                             ne,&! northeast weight
                                             ine ! inverse of ne
   real(POP_r8),dimension(n,m),intent(in) :: f   ! residual 
   real(POP_r8),dimension(n+m-5,n+m-5),intent(in) :: rinv
                                                 ! preconditioning matrx

 ! !OUTPUT PARAMETERS:
   real(POP_r8),dimension(n-2,m-2),intent(inout) :: tu ! solution
 
 
   !LOCAL VARIABLES
   integer(POP_i4) :: i,j,k       ! local counters
   integer(POP_i4) :: nm          ! length of initial and final error
   real(POP_r8),dimension(n,m) :: y ! temporary array
   real(POP_r8),dimension(n+m-5) :: r !final error vector
 
   nm = n+m-5
   y(:,:) = 0.0_POP_r8
   y(2:n-1,2:m-1) = tu(:,:) 
   
   ! marching from intial error vectors (west and south)
   do j = 2, m-1
     do i = 2, n-1
         y(i+1,j+1)  = (f(i,j)- cc(i,j)     * y(i,j )     & 
                -  ne(i,j-1)   * y(i+1,j-1)  &
                -  ne(i-1,j)   * y(i-1,j+1)  & 
                -  ne(i-1,j-1) * y(i-1,j-1) ) *ine(i,j) 
     end do
   end do
 
   ! get final error vectors (east and north)
   r(1:n-2) = y(3:n,m)
   r(n-1:n+m-5) = y(n,m-1:3:-1) 
 
   !compute intial error 
   do j = 1,m-2
       do k = 1,nm
         y(2,m-j)  = y(2,m-j) + rinv(k,j)*r(k)
       end do 
   end do 
   do i = 1,n-3
       do k = 1,nm
         y(i+2,2)  = y(i+2,2) + rinv(k,m-2+i)*r(k)
       end do 
   end do 
 
 
   !marching again based on the adjusted intial vector
   do j = 2, m-2
     do i = 2, n-2
         y(i+1,j+1)  = (f(i,j)- cc(i,j)     * y(i,j )     & 
                -  ne(i,j-1)   * y(i+1,j-1)  &
                -  ne(i-1,j)   * y(i-1,j+1)  & 
                -  ne(i-1,j-1) * y(i-1,j-1) ) *ine(i,j) 
       end do
   end do

   !copy results 
   tu(1:n-2,1:m-2) = y(2:n-1,2:m-1) 
 
  end subroutine 
 
 !***********************************************************************
   subroutine PcsiLanczos(nx, ny, nb, maxlanczosstep, u, v, errorCode)
 
! !DESCRIPTION:
!  This routine implements the Lanczos based method to solve  
!  the maximum and minimum eigenvalues of the coefficient matrix $A$.
!  Or $AM$ if a preconditioner $M^{-1}$ is provided.
!
!  References:
!     Hu, Y., Huang, X., Wang, X., Fu, H., Xu, S., Ruan, H., ... & Yang, G. (2013). A
!        scalable barotropic mode solver for the parallel ocean program. In Euro-Par 2013
!        Parallel Processing (pp. 739-750). Springer Berlin Heidelberg.
!     Paige, C. C. (1980). Accuracy and effectiveness of the Lanczos algorithm for the
!        symmetric eigenproblem. Linear algebra and its applications, 34, 235-258.
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University
 
 ! !INPUT PARAMETERS:
 
   integer (POP_i4),intent(in) :: &
      nx, ny, nb         ! block size and numblocktropic
   integer (POP_i4),intent(in) :: &
      maxlanczosstep     ! max lanczos steps
 
   real (POP_r8), intent(inout) :: &
      u, v             !  max and min eigenvalues
 
   integer (POP_i4), intent(out) :: &
      errorCode          ! returned error code
 
 ! LOCAL VARIABLES
   real (POP_r8), dimension(nx, ny,nb) :: &
      R,                  &! r 
      S,                  &! preconditioned r : Mr
      Q,                  &! normalized r : r/||r||_A
      Q1,                 &! previous step q
      P,                  &! preconditioned q : Mq
      A0R,                &! inverse of diagonal of A
      WORK,WORK1           ! temporary vectors
 
   real (POP_r8), dimension(maxlanczosstep) :: &
      vcsa, vcsb                ! diagonal and off-diagonal 
                                ! elements of tridiagonal matrix $T$
 
   real (POP_r8), dimension(:), allocatable :: &
      mcsa, mcsb                ! temporary vectors saving 

 
   integer (POP_i4) :: &
      i, j, m,                 &! local iteration counter
      info,istat,              &! local flags 
      iblock                    ! local block counter
 
   real (POP_r8) :: &
        csa,                   &! inner product alpha = (p,r)
        csb,                   &! inner product beta = (r,s)
        csc                     ! vector magnitude  ||r0||_A

   real (POP_r8) :: & 
        mineig   

 
   errorCode = POP_Success
   
   ! compute diagonal inverse for diagonal preconditioning
   do iblock=1,nb
      do j=1,ny
      do i=1,nx
        if (btropWgtCenter(i,j,iblock) /= 0.0_POP_r8) then
           A0R(i,j,iblock) = 1.0_POP_r8 /btropWgtCenter(i,j,iblock)
        else
           A0R(i,j,iblock) = 0.0_POP_r8
        endif
      end do
      end do
   end do ! block loop
 
   ! set random vector R, intialize temporary vectors P, Q
   do iblock=1,nb
      do j=1,ny
      do i=1,nx
        R(i,j,iblock) = 1.0_POP_r8
        Q(i,j,iblock) = 0.0_POP_r8
        Q1(i,j,iblock) = 0.0_POP_r8
      end do
      end do
   end do ! block loop
   
   ! preconditioning s = M^{-1}r 
   do iblock=1,nb
        if (usePreconditioner) then
            call preconditioner(S,R,iblock)
        else
            do j=1,ny
            do i=1,nx
                S(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
            end do 
            end do 
        endif 
        do j=1,ny
        do i=1,nx
            WORK(i,j,iblock) = S(i,j,iblock)*R(i,j,iblock)
        end do
        end do
   end do ! block loop
 
   ! (R,S)  $A$ and $M$  is negative defined 
   csc =- POP_GlobalSum(WORK, POP_distrbTropic, &
              POP_gridHorzLocCenter,   &
              errorCode, mMask = mMaskTropic)

   ! q = R/||R||_{AM}  normalize R based on measurement of $AM^{-1}$
   if (csc .gt. 0.0_POP_r8) then 
      do iblock=1,nb
      do j=1,ny
      do i=1,nx
            Q(i,j,iblock) = (1/sqrt(csc))*R(i,j,iblock)
      end do 
      end do 
      end do ! block loop
   else 
      ! either random vector R == 0 or A is sigular 
      call POP_ErrorSet(errorCode, &
         'POP_PcsiLanczos: error estimating in lanczos: b == 0 !!!')
      return
   endif 
 
   call POP_HaloUpdate(Q, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)
 
   csb= 0.0_POP_r8
   u= 0.0_POP_r8
   v= 0.0_POP_r8
   mineig= 1.0_POP_r8
 
!-----------------------------------------------------------------------
!
!  lanczos iterations
!
!-----------------------------------------------------------------------
   lanczos_iter: do m = 1, maxlanczosstep

   !  preconditioning q
   do iblock=1,nb
        if (usePreconditioner) then
            call preconditioner(P,Q,iblock)
        else
            do j=1,ny
            do i=1,nx
                P(i,j,iblock) = Q(i,j,iblock)*A0R(i,j,iblock)
            end do 
            end do 
        endif 
   end do ! block loop
    call POP_HaloUpdate(P, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)

   ! r = Mp - beta_{j-1} q_{j-1}
   do iblock=1,nb
        call btropOperator(WORK1,P,iblock)
        
        do j=1,ny
        do i=1,nx
            R(i,j,iblock) =  WORK1(i,j,iblock)-csb*Q1(i,j,iblock)
            WORK(i,j,iblock) = P(i,j,iblock)*R(i,j,iblock)
        end do
        end do
   end do ! block loop
 
    csa = -POP_GlobalSum(WORK, POP_distrbTropic, &
           POP_gridHorzLocCenter,   &
           errorCode, mMask = mMaskTropic)   ! (r,r)
   
   ! r = r - alpha*q 
   do iblock=1,nb
      do j=1,ny
      do i=1,nx
        R(i,j,iblock) = R(i,j,iblock)-csa*Q(i,j,iblock)
      end do
      end do 
   end do 
   call POP_HaloUpdate(R, POP_haloTropic, POP_gridHorzLocCenter, &
                          POP_fieldKindScalar, errorCode)
   ! preconditioning R 
   do iblock=1,nb
        if (usePreconditioner) then
            call preconditioner(S,R,iblock)
        else
            do j=1,ny
            do i=1,nx
                S(i,j,iblock) = R(i,j,iblock)*A0R(i,j,iblock)
            end do 
            end do 
        endif 
        do j=1,ny
        do i=1,nx
          WORK(i,j,iblock) = S(i,j,iblock)*R(i,j,iblock)
        end do
        end do 
   end do ! block loop
 
   csc = -POP_GlobalSum(WORK, POP_distrbTropic, &
                POP_gridHorzLocCenter,   &
                errorCode, mMask = mMaskTropic)   ! (r,s)
   csb = sqrt(csc)
 
   vcsa(m) = csa
   vcsb(m) = csb
   

   ! Gershgorin circle theorem to estimate the largest eigenvaule 
   if (m ==1 ) then
    u = vcsa(1) + vcsb(1)
   else 
    u = max(u, vcsa(m) + vcsb(m) + vcsb(m-1))
   endif

   ! normalize r  : q = r/||r||_A
   if (csb /= 0.0_POP_r8) then 
      do iblock=1,nb
         do j=1,ny
         do i=1,nx
          Q1(i,j,iblock) = Q(i,j,iblock)
          Q(i,j,iblock) = (1/csb)*R(i,j,iblock)
          end do 
          end do 
       end do ! block loop
   else 
      call POP_ErrorSet(errorCode, &
         'POP_PcsiLanczos: error estimating in lanczos: b == 0 !!!')
      return
   endif 
   
!-----------------------------------------------------------------------
!
!  compute eigenvalues of the tridiagonal matrix T every 10 steps
!
!-----------------------------------------------------------------------
    
   if ((mod(m,10) == 0) .or. (m ==maxlanczosstep)) then
      if (POP_myTask == POP_masterTask ) then

        allocate(mcsa(m), mcsb(m), stat = istat)
        if (istat > 0) then
           call POP_ErrorSet(errorCode, &
              'POP_PcsiLanczos: error allocating Lanczos Tridiagonal')
           return
        endif

        do i = 1, m-1
            mcsa(i) = vcsa(i) 
            mcsb(i+1) = vcsb(i)
        end do
        mcsa(m) = vcsa(m)
        mcsb(1) = 0.0
            
        info = 0
        ! compute smallest eigenvalue
        call ratqr(m, 1.0e-8_POP_r8, mcsa,mcsb,v,info)
        if (info /= 0) then 
           call POP_ErrorSet(errorCode, &
              'POP_PcsiLanczos: error estimating smallest eigenvalue !!!')
           return
        endif

        write(POP_stdout,'(a18,i3,a8,2e15.7)') "Lanczos steps ", m, " eigs:", v, u

        deallocate(mcsa, mcsb,stat = istat)
        if (istat > 0) then
           call POP_ErrorSet(errorCode, &
              'POP_PcsiLanczos: error deallocating Lanczos Tridiagonal')
           return
        endif
      endif 

      call POP_Broadcast(v,POP_masterTask,errorCode)

      ! check convergence of the smallest eigenvalue
      if (abs(1-v/mineig ) <  LanczosconvergenceCriterion) then 
          exit lanczos_iter
      endif 

      mineig = v  ! save previous value 
   endif 
    
   enddo lanczos_iter
 

!-----------------------------------------------------------------------
!EOC

 end subroutine PcsiLanczos

 subroutine EvpBlockPartition(m,mm,mb,mdi,errorCode)
! !DESCRIPTION:
!  This routine implements the strategy to divide the local barotropic block 
!  into smaller blocks in one direction.
!  EVP can not handle block larger than [12x12] while a small block size has
!  an adverse effect on the convergence rate.
!
!
! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University

 ! !INPUT PARAMETERS:
   integer(POP_i4), intent(in) :: m              ! size of original block
   integer(POP_i4), intent(in) :: mm             ! user-defined size

 ! !OUTPUT PARAMETERS:
   integer(POP_i4), dimension(:),allocatable, intent(inout) :: &
                                  mdi  !start index of each small block
   integer(POP_i4), intent(inout) :: mb          ! number of blocks
   integer(POP_i4), intent(out) :: &
      errorCode          ! returned error code
  
 ! !LOCAL  VARIABLE
   integer :: i, istat    

   mb = (m-3)/mm +1 
   allocate(mdi(mb+1), stat = istat)
   if (istat > 0) then
      call POP_ErrorSet(errorCode, &
         'POP_EvpBlockPartition: error allocating EVP subBlock Index')
      return
   endif

   mdi(1) = 2 

   if ( mb == 1 ) then 
     ! block size is not larger than sub-block size 
     mdi(mb+1) = m  
   else 
     ! more than one sub block
     do i = 1, mb-2
       mdi(i+1) = 2+i*mm
     end do 

     ! in case mod(m-2, mm) /= 0, make sure the last block not too small
     mdi(mb) = (mdi(mb-1) +m )/2 
     mdi(mb+1) = m
   endif 
 end subroutine EvpBlockPartition

 subroutine inverse(a,c,n)
! !DESCRIPTION:
! This routine implemetns the Inverse matrix method 
! based on Doolittle LU factorization for Ax=b.
! Follow : Alex G. December 2009
!-----------------------------------------------------------

! !REVISION HISTORY:
!  this routine implemented by Yong Hu, et al., Tsinghua University

 ! !INPUT PARAMETERS:
  integer(POP_i4),intent(in) ::  n    !dimension
  real(POP_r8),intent(inout) :: a(n,n)!array of coefficients for matrix A

 ! !OUTPUT PARAMETERS:
  real(POP_r8),intent(inout) :: c(n,n) !inverse matrix of A

  
 ! !LOCAL VARIABLES
  real(POP_r8) :: L(n,n), U(n,n), b(n), d(n), x(n)
  real(POP_r8) :: coeff
  integer::  i, j, k

  ! step 0: initialization for matrices L and U and b
  L(:,:)=0.0
  U(:,:)=0.0
  b(:)=0.0

  ! step 1: forward elimination
  do k=1, n-1
    do i=k+1,n
      coeff=a(i,k)/a(k,k)
      L(i,k) = coeff
      do j=k+1,n
        a(i,j) = a(i,j)-coeff*a(k,j)
      end do
    end do
  end do

  ! Step 2: prepare L and U matrices.
  do i=1,n
    L(i,i) = 1.0
  end do

  do j=1,n
    do i=1,j
      U(i,j) = a(i,j)
    end do
  end do

  ! Step 3: compute columns of the inverse matrix C
  do k=1,n
    b(k)=1.0
    d(1) = b(1)

    do i=2,n
      d(i)=b(i)
      do j=1,i-1
        d(i)=d(i)-L(i,j)*d(j)
      end do
    end do

    x(n)=d(n)/U(n,n)
    do i = n-1,1,-1
      x(i) = d(i)
      do j=n,i+1,-1
        x(i)=x(i)-U(i,j)*x(j)
      end do
      x(i) = x(i)/u(i,i)
    end do

    ! fill the solutions  x(n) into column  k  of  C
    do  i=1,n
      c(i,k) = x(i)
    end do
    b(k)=0.0
  end do 

 end subroutine inverse

 subroutine  ratqr(n, eps1, d, e, mineig,ierr )
! !DESCRIPTION:
!  This subroutine finds the algebraically smallest 
!  eigenvalue of a positive definite symmetric tridiagonal matrix by the
!  rational QR method with Newton corrections.
!  Follow EISPACK lib subroutine RATQR 
!
! ! INPUT VARIABLES
   integer(POP_i4), intent(in) ::  n    ! matrix order
   real(POP_r8),intent(in) :: d(n)      ! diagonal elements
   real(POP_r8),intent(in) :: e(n)      ! off-diagonal elements, e(1) is arbitrary
   real(POP_r8),intent(in) :: eps1      ! convergence tolerance

!  ! OUTPUT VARIABLES
   integer(POP_i4), intent(inout) ::  ierr  ! status flag 
   real(POP_r8), intent(inout) ::  mineig   !smallest eigevalue

!  ! LOCAL  VARIABLES
   real(POP_r8), dimension(n) ::  e2, bd, w
   real(POP_r8) :: f, ep, delta, err
   integer(POP_i4) i, ii ! local counters
   real(POP_r8) p, q, qp,r,s,tot

   ierr = 0
   w(1:n) = d(1:n)
   err = 0.0_POP_r8 
   s = 0.0_POP_r8 
   tot = w(1)
   q = 0.0_POP_r8 

   do i = 1, n
      p = q
      bd(i) = e(i)*e(i)
      q = 0.0_POP_r8 

      if ( i /= n ) then
        q = abs ( e(i+1) )
      endif

      tot = min ( w(i) - p - q, tot )
   end do
   bd(1) = 0.0_POP_r8 

   if ( tot < 0.0_POP_r8  ) then
     tot = 0.0_POP_r8 
   else 
     w(1:n) = w(1:n) - tot
   endif

!  QR transformation.

   do while (.true.)
     tot = tot + s
     delta = w(n) - s
     i = n
     if ( delta <= eps1 ) then
       exit
     endif 

     f = bd(n) / delta
     qp = delta + f
     p = 1.0

     do ii = 1, n - 1
       i = n - ii
       q = w(i) - s - f
       r = q / qp
       p = p * r + 1.0
       ep = f * r
       w(i+1) = qp + ep
       delta = q - ep

       ! check convergence 
       if ( delta <= eps1 ) exit

       f = bd(i) / q
       qp = delta + f
       bd(i+1) = qp * ep
     end do
     
     ! check convergence 
     if ( delta <= eps1 ) exit

     w(1) = qp
     s = qp / p

     !  Set error: irregular end of iteration.
     if ( tot + s <= tot ) then
       ierr = 1
       return
     endif

   end do 

   w(1) = tot
   err = err + abs ( delta)
   bd(1) = err
   mineig = w(1)

   return
 end subroutine ratqr

 end module POP_SolversMod

!|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||