! (C) Copyright 2008- ECMWF.
! (C) Copyright 2008- Meteo-France.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL)


!**** *GPNORM_TRANS_GPU* - calculate grid-point norms

!     Purpose.
!     --------
!        calculate grid-point norms using a 2 stage (NPRTRV,NPRTRW) communication rather
!        than an approach using a more expensive global gather collective communication

!**   Interface.
!     ----------
!     CALL GPNORM_TRANS(...)

!     Explicit arguments :
!     --------------------
!     PGP(:,:,:) - gridpoint fields (input)
!                  PGP is  dimensioned (NPROMA,KFIELDS,NGPBLKS) where
!                  NPROMA is the blocking factor, KFIELDS the total number
!                  of fields and NGPBLKS the number of NPROMA blocks.
!     KFIELDS     - number of fields (input)
!                   (these do not have to be just levels)
!     KPROMA      - required blocking factor (input)
!     PAVE        - average (output)
!     PMIN        - minimum (input/output)
!     PMAX        - maximum (input/output)
!     LDAVE_ONLY  - T : PMIN and PMAX already contain local MIN and MAX
!     KRESOL      -  resolution tag (optional)
!                    default assumes first defined resolution
!

!     Author.
!     -------
!        George Mozdzynski *ECMWF*

!     Modifications.
!     --------------
!        Original : 19th Sept 2008
!        R. El Khatib 07-08-2009 Optimisation directive for NEC

!     ------------------------------------------------------------------

USE PARKIND1        ,ONLY : JPIM     ,JPRB , JPRD
USE PARKIND_ECTRANS ,ONLY : JPRBT

!ifndef INTERFACE

USE TPM_GEN         ,ONLY : NOUT
USE TPM_DIM         ,ONLY : R
USE TPM_TRANS       ,ONLY : LGPNORM, NGPBLKS, NPROMA
USE TPM_DISTR       ,ONLY : D, NPRCIDS, NPRTRV, NPRTRW, MYSETV, MYSETW, NPROC, D_NSTAGTF,D_NPTRLS, MYPROC
USE TPM_GEOMETRY    ,ONLY : G,G_NLOEN,G_NLOEN_MAX
USE TPM_FIELDS      ,ONLY : F_RW
USE SET_RESOL_MOD   ,ONLY : SET_RESOL
USE TRGTOL_MOD      ,ONLY : TRGTOL
USE SET2PE_MOD      ,ONLY : SET2PE
USE MPL_MODULE      ,ONLY : MPL_RECV, MPL_SEND, JP_BLOCKING_STANDARD
USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS
USE YOMHOOK         ,ONLY : LHOOK,   DR_HOOK,  JPHOOK

!endif INTERFACE

IMPLICIT NONE

! Declaration of arguments

REAL(KIND=JPRB)   ,INTENT(IN)    :: PGP(:,:,:)
REAL(KIND=JPRB)   ,INTENT(OUT)   :: PAVE(:)
REAL(KIND=JPRB)   ,INTENT(INOUT) :: PMIN(:)
REAL(KIND=JPRB)   ,INTENT(INOUT) :: PMAX(:)
INTEGER(KIND=JPIM),INTENT(IN)    :: KFIELDS
INTEGER(KIND=JPIM),INTENT(IN)    :: KPROMA
LOGICAL           ,INTENT(IN)    :: LDAVE_ONLY
INTEGER(KIND=JPIM),OPTIONAL, INTENT(IN)  :: KRESOL

!ifndef INTERFACE

! Local variables
REAL(KIND=JPHOOK) :: ZHOOK_HANDLE
INTEGER(KIND=JPIM) :: IUBOUND(4)
INTEGER(KIND=JPIM) :: IVSET(KFIELDS)
INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETS(:)
INTEGER(KIND=JPIM),ALLOCATABLE :: IVSETG(:,:)
!GPU
REAL(KIND=JPRBT) :: V
REAL(KIND=JPRBT),ALLOCATABLE,SAVE :: ZGTF(:)
REAL(KIND=JPRD),ALLOCATABLE :: ZAVE(:,:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGL(:,:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGL(:,:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMIN(:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMAX(:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMINGPN(:)
REAL(KIND=JPRBT),ALLOCATABLE :: ZMAXGPN(:)
REAL(KIND=JPRD),ALLOCATABLE :: ZAVEG(:,:)
REAL(KIND=JPRB),ALLOCATABLE :: ZMING(:)
REAL(KIND=JPRB),ALLOCATABLE :: ZMAXG(:)
REAL(KIND=JPRD),ALLOCATABLE :: ZSND(:)
REAL(KIND=JPRD),ALLOCATABLE :: ZRCV(:)
INTEGER(KIND=JPIM) :: J,JGL,IGL,JL,JF,IF_GP,IF_SCALARS_G,IF_FS,JSETV,JSETW,IWLATS,JMAX
INTEGER(KIND=JPIM) :: IPROC,ITAG,ILEN,ILENR,IBEG,IEND,IND
!INTEGER(KIND=JPIM) :: iunit

!     ------------------------------------------------------------------
IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',0,ZHOOK_HANDLE)

! Set current resolution
CALL SET_RESOL(KRESOL)

! Set defaults

NPROMA = KPROMA
NGPBLKS = (D%NGPTOT-1)/NPROMA+1

! Consistency checks

IUBOUND(1:3)=UBOUND(PGP)
IF(IUBOUND(1) < NPROMA) THEN
  WRITE(NOUT,*)'GPNORM_TRANS:FIRST DIM. OF PGP TOO SMALL ',IUBOUND(1),NPROMA
  CALL ABORT_TRANS('GPNORM_TRANS:FIRST DIMENSION OF PGP TOO SMALL ')
ENDIF
IF(IUBOUND(2) < KFIELDS) THEN
  WRITE(NOUT,*)'GPNORM_TRANS:SEC. DIM. OF PGP TOO SMALL ',IUBOUND(2),KFIELDS
  CALL ABORT_TRANS('GPNORM_TRANS:SECOND DIMENSION OF PGP TOO SMALL ')
ENDIF
IF(IUBOUND(3) < NGPBLKS) THEN
  WRITE(NOUT,*)'GPNORM_TRANS:THIRD DIM. OF PGP TOO SMALL ',IUBOUND(3),NGPBLKS
  CALL ABORT_TRANS('GPNORM_TRANS:THIRD DIMENSION OF PGP TOO SMALL ')
ENDIF


IF_GP=KFIELDS
IF_SCALARS_G=0

IF_FS=0
DO J=1,KFIELDS
  IVSET(J)=MOD(J-1,NPRTRV)+1
  IF(IVSET(J)==MYSETV)THEN
    IF_FS=IF_FS+1
  ENDIF
ENDDO
IF (.NOT. ALLOCATED(ZAVE)) THEN
  ALLOCATE(ZAVE(IF_FS,R%NDGL))
  ALLOCATE(ZMINGL(IF_FS,R%NDGL))
  ALLOCATE(ZMAXGL(IF_FS,R%NDGL))
  ALLOCATE(ZMINGPN(IF_FS))
  ALLOCATE(ZMAXGPN(IF_FS))
  
  ZAVE = 0._JPRBT
  ZMINGL = 0._JPRBT
  ZMAXGL = 0._JPRBT
  ZMINGPN = 0._JPRBT
  ZMAXGPN = 0._JPRBT
#ifdef ACCGPU
  !$ACC ENTER DATA COPYIN(ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN)
#endif
  IF (.NOT. ALLOCATED(ZGTF)) THEN
  ALLOCATE(ZGTF(IF_FS*D%NLENGTF))
  WRITE(NOUT,*)'ZGTF :',SIZE(ZGTF)
#ifdef ACCGPU
  !$ACC ENTER DATA CREATE(ZGTF)
#endif
  ENDIF
ENDIF

ALLOCATE(IVSETS(NPRTRV))
IVSETS(:)=0
DO J=1,KFIELDS
  IVSETS(IVSET(J))=IVSETS(IVSET(J))+1
ENDDO
ALLOCATE(IVSETG(NPRTRV,MAXVAL(IVSETS(:))))
IVSETG(:,:)=0
IVSETS(:)=0
DO J=1,KFIELDS
  IVSETS(IVSET(J))=IVSETS(IVSET(J))+1
  IVSETG(IVSET(J),IVSETS(IVSET(J)))=J
ENDDO


! done in setup_trans
LGPNORM=.TRUE.
!!FIXME
!!CALL TRGTOL_CUDAAWARE(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP)
LGPNORM=.FALSE.

! ZGTF is now on GPU

IBEG=1
IEND=D%NDGL_FS

CALL GSTATS(1429,0)
IF( IF_FS > 0 )THEN

#ifdef ACCGPU
 !$ACC DATA &
 !$ACC& COPY(F_RW) &
 !$ACC& COPY(D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) &
 !$ACC& PRESENT(ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN)
#endif
#ifdef OMPGPU
 !$OMP TARGET DATA MAP(TO:F_RW,D,D_NSTAGTF,D_NPTRLS,G_NLOEN,G_NLOEN_MAX) &
 !$OMP&          MAP(PRESENT,ALLOC:ZGTF,ZAVE,ZMINGL,ZMAXGL,ZMINGPN,ZMAXGPN)
 !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
#ifdef ACCGPU
    !$ACC KERNELS
#endif
    DO JF=1,IF_FS
      V = ZGTF(IF_FS*D_NSTAGTF(1)+(JF-1)*(D%NSTAGTF(2)-D%NSTAGTF(1)))
      ZMINGL(JF,IBEG:IEND)=HUGE(1_JPRBT)
      ZMAXGL(JF,IBEG:IEND)=-HUGE(1_JPRBT)
    ENDDO
#ifdef ACCGPU
    !$ACC END KERNELS
#endif

! FIRST DO SUMS IN EACH FULL LATITUDE

#ifdef OMPGPU
  !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
#ifdef ACCGPU
  !$ACC KERNELS
#endif
  DO JGL=1,D%NDGL_FS
    IGL = D_NPTRLS(MYSETW) + JGL - 1
    DO JF=1,IF_FS
      ZAVE(JF,JGL)=0.0_JPRBT
#ifdef ACCGPU
      !$ACC LOOP
#endif
      DO JL=1,G_NLOEN(IGL)
        V = ZGTF(IF_FS*D%NSTAGTF(JGL)+(JF-1)*(D%NSTAGTF(JGL+1)-D%NSTAGTF(JGL))+JL)
        ZAVE(JF,JGL)=ZAVE(JF,JGL)+V
        ZMINGL(JF,JGL)=MIN(ZMINGL(JF,JGL),V)
        ZMAXGL(JF,JGL)=MAX(ZMAXGL(JF,JGL),V)
      ENDDO
    ENDDO
  ENDDO
#ifdef ACCGPU
  !$ACC END KERNELS
#endif

#ifdef OMPGPU
  !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
#ifdef ACCGPU
  !$ACC KERNELS
#endif
  DO JF=1,IF_FS
    ZMINGPN(JF)=MINVAL(ZMINGL(JF,IBEG:IEND))
    ZMAXGPN(JF)=MAXVAL(ZMAXGL(JF,IBEG:IEND))
  ENDDO
#ifdef ACCGPU
  !$ACC END KERNELS
#endif

#ifdef OMPGPU
  !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
#ifdef ACCGPU
  !$ACC KERNELS
#endif
  DO JGL=IBEG,IEND
    IGL = D_NPTRLS(MYSETW) + JGL - 1
    DO JF=1,IF_FS
      ZAVE(JF,JGL)=ZAVE(JF,JGL)*F_RW(IGL)/G_NLOEN(IGL)
      !write(iunit,*) 'aver inside ',JF,IF_FS,IGL,ZAVE(JF,JGL), F_RW(IGL), G_NLOEN(IGL),ZMINGPN(JF),ZMAXGPN(JF)
    ENDDO
  ENDDO
#ifdef ACCGPU
  !$ACC END KERNELS
#endif

#ifdef OMPGPU
!$OMP END TARGET DATA
#endif
#ifdef ACCGPU
!$ACC END DATA
#endif

#ifdef ACCGPU
!$ACC UPDATE HOST(ZAVE)
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE FROM(ZAVE)
#endif
#ifdef ACCGPU
!$ACC UPDATE HOST(ZMINGPN)
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE FROM(ZMINGPN)
#endif
#ifdef ACCGPU
!$ACC UPDATE HOST(ZMAXGPN)
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE FROM(ZMAXGPN)
#endif
#ifdef ACCGPU
!$ACC WAIT
#endif
#ifdef OMPGPU
!$OMP BARRIER
#endif

ENDIF
CALL GSTATS(1429,1)

! from here rest on CPU

! IT IS IMPORTANT THAT SUMS ARE NOW DONE IN LATITUDE ORDER
ALLOCATE(ZAVEG(R%NDGL,KFIELDS))
ALLOCATE(ZMING(KFIELDS))
ALLOCATE(ZMAXG(KFIELDS))

ZAVEG(:,:)=0.0_JPRD
DO JF=1,IF_FS
  DO JGL=IBEG,IEND
    IGL = D%NPTRLS(MYSETW) + JGL - 1
    ZAVEG(IGL,IVSETG(MYSETV,JF))=ZAVEG(IGL,IVSETG(MYSETV,JF))+ZAVE(JF,JGL)
  ENDDO
ENDDO

IF(LDAVE_ONLY)THEN
  ZMING(:)=PMIN(:)
  ZMAXG(:)=PMAX(:)
ELSE
  DO JF=1,IF_FS
    ZMING(IVSETG(MYSETV,JF))=ZMINGPN(JF)
    ZMAXG(IVSETG(MYSETV,JF))=ZMAXGPN(JF)
  ENDDO
ENDIF

! RECEIVE ABOVE FROM OTHER NPRTRV SETS FOR SAME LATS BUT DIFFERENT FIELDS
ITAG=123

CALL GSTATS(815,0)

IF( MYSETV==1 )THEN

  DO JSETV=2,NPRTRV
    IF(LDAVE_ONLY)THEN
      ILEN=D%NDGL_FS*IVSETS(JSETV)+2*KFIELDS
    ELSE
      ILEN=(D%NDGL_FS+2)*IVSETS(JSETV)
    ENDIF
    IF(ILEN > 0)THEN
      ALLOCATE(ZRCV(ILEN))
      CALL SET2PE(IPROC,0,0,MYSETW,JSETV)
      CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,&
        &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:V')
      IF(ILENR /= ILEN)THEN
        CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN')
      ENDIF
      IND=0
      DO JF=1,IVSETS(JSETV)
        DO JGL=IBEG,IEND
          IGL = D%NPTRLS(MYSETW) + JGL - 1
          IND=IND+1
          ZAVEG(IGL,IVSETG(JSETV,JF))=ZRCV(IND)
        ENDDO
        IF(.NOT.LDAVE_ONLY)THEN
          IND=IND+1
          ZMING(IVSETG(JSETV,JF))=ZRCV(IND)
          IND=IND+1
          ZMAXG(IVSETG(JSETV,JF))=ZRCV(IND)
        ENDIF
      ENDDO
      IF(LDAVE_ONLY)THEN
        DO JF=1,KFIELDS
          IND=IND+1
          ZMING(JF)=MIN(ZMING(JF),REAL(ZRCV(IND),JPRB))
          IND=IND+1
          ZMAXG(JF)=MAX(ZMAXG(JF),REAL(ZRCV(IND),JPRB))
        ENDDO
      ENDIF
      DEALLOCATE(ZRCV)
    ENDIF
  ENDDO

ELSE

  IF(LDAVE_ONLY)THEN
    ILEN=D%NDGL_FS*IVSETS(MYSETV)+2*KFIELDS
  ELSE
    ILEN=(D%NDGL_FS+2)*IVSETS(MYSETV)
  ENDIF
  IF(ILEN > 0)THEN
    CALL SET2PE(IPROC,0,0,MYSETW,1)
    ALLOCATE(ZSND(ILEN))
    IND=0
    DO JF=1,IF_FS
      DO JGL=IBEG,IEND
        IGL = D%NPTRLS(MYSETW) + JGL - 1
        IND=IND+1
        ZSND(IND)=ZAVEG(IGL,IVSETG(MYSETV,JF))
       ENDDO
      IF(.NOT.LDAVE_ONLY)THEN
        IND=IND+1
        ZSND(IND)=ZMING(IVSETG(MYSETV,JF))
        IND=IND+1
        ZSND(IND)=ZMAXG(IVSETG(MYSETV,JF))
      ENDIF
    ENDDO
    IF(LDAVE_ONLY)THEN
      DO JF=1,KFIELDS
        IND=IND+1
        ZSND(IND)=PMIN(JF)
        IND=IND+1
        ZSND(IND)=PMAX(JF)
      ENDDO
    ENDIF
    CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,&
      &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V')
    DEALLOCATE(ZSND)
  ENDIF

ENDIF

! FINALLY RECEIVE CONTRIBUTIONS FROM OTHER NPRTRW SETS

IF( MYSETV == 1 )THEN

  IF( MYSETW == 1 )THEN

    DO JSETW=2,NPRTRW
      IWLATS=D%NULTPP(JSETW)
      IBEG=1
      IEND=IWLATS
      IF(LDAVE_ONLY)THEN
        ILEN=IWLATS*KFIELDS+2*KFIELDS
      ELSE
        ILEN=(IWLATS+2)*KFIELDS
      ENDIF
      IF(ILEN > 0 )THEN
        ALLOCATE(ZRCV(ILEN))
        CALL SET2PE(IPROC,0,0,JSETW,1)
        CALL MPL_RECV(ZRCV(:),KSOURCE=NPRCIDS(IPROC),KTAG=ITAG,&
          &KMP_TYPE=JP_BLOCKING_STANDARD,KOUNT=ILENR,CDSTRING='GPNORM_TRANS:W')
        IF(ILENR /= ILEN)THEN
          CALL ABOR1('GPNORM_TRANS:ILENR /= ILEN')
        ENDIF
        IND=0
        DO JF=1,KFIELDS
          DO JGL=IBEG,IEND
            IGL = D%NPTRLS(JSETW) + JGL - 1
            IND=IND+1
            ZAVEG(IGL,JF)=ZRCV(IND)
          ENDDO
          IF(.NOT.LDAVE_ONLY)THEN
            IND=IND+1
            ZMING(JF)=MIN(ZMING(JF),ZRCV(IND))
            IND=IND+1
            ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND))
          ENDIF
        ENDDO
        IF(LDAVE_ONLY)THEN
          DO JF=1,KFIELDS
            IND=IND+1
            ZMING(JF)=MIN(ZMING(JF),ZRCV(IND))
            IND=IND+1
            ZMAXG(JF)=MAX(ZMAXG(JF),ZRCV(IND))
          ENDDO
        ENDIF
        DEALLOCATE(ZRCV)
      ENDIF
    ENDDO

  ELSE

    IF(LDAVE_ONLY)THEN
      ILEN=D%NDGL_FS*KFIELDS+2*KFIELDS
    ELSE
      ILEN=(D%NDGL_FS+2)*KFIELDS
    ENDIF
    IF(ILEN > 0)THEN
      CALL SET2PE(IPROC,0,0,1,1)
      ALLOCATE(ZSND(ILEN))
      IND=0
      DO JF=1,KFIELDS
        DO JGL=IBEG,IEND
          IGL = D%NPTRLS(MYSETW) + JGL - 1
          IND=IND+1
          ZSND(IND)=ZAVEG(IGL,JF)
        ENDDO
        IF(.NOT.LDAVE_ONLY)THEN
          IND=IND+1
          ZSND(IND)=ZMING(JF)
          IND=IND+1
          ZSND(IND)=ZMAXG(JF)
        ENDIF
      ENDDO
      IF(LDAVE_ONLY)THEN
        DO JF=1,KFIELDS
          IND=IND+1
          ZSND(IND)=ZMING(JF)
          IND=IND+1
          ZSND(IND)=ZMAXG(JF)
        ENDDO
      ENDIF
      CALL MPL_SEND(ZSND(:),KDEST=NPRCIDS(IPROC),KTAG=ITAG,&
        &KMP_TYPE=JP_BLOCKING_STANDARD,CDSTRING='GPNORM_TRANS:V')
      DEALLOCATE(ZSND)
    ENDIF

  ENDIF

ENDIF

CALL GSTATS(815,1)

IF( MYSETW == 1 .AND. MYSETV == 1 )THEN

  PAVE(:)=0.0_JPRB
  DO JGL=1,R%NDGL
    PAVE(:)=PAVE(:)+REAL(ZAVEG(JGL,:),JPRB)
  ENDDO

  PMIN(:)=ZMING(:)
  PMAX(:)=ZMAXG(:)

ENDIF

!DEALLOCATE(ZGTF)
!DEALLOCATE(ZAVE)
!DEALLOCATE(ZMIN)
!DEALLOCATE(ZMAX)
DEALLOCATE(ZAVEG)
DEALLOCATE(ZMING)
DEALLOCATE(ZMAXG)
DEALLOCATE(IVSETS)
DEALLOCATE(IVSETG)

IF (LHOOK) CALL DR_HOOK('GPNORM_TRANS',1,ZHOOK_HANDLE)

!     ------------------------------------------------------------------

!endif INTERFACE


END SUBROUTINE GPNORM_TRANS_GPU
