OpenACC and CUDA-aware MPI

Problem description

Mainly, I want to keep the data on the device throughout the whole while loop. The problem appears when I add #pragma acc host_data use_device(err) around the call MPI_Allreduce (&err,&err,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);.

The error is that the reduced err is invalid, so the code exits after executing just one step of the loop.

After MPI_Allreduce(), err is still equal to zero, even when I use #pragma acc update self(err).

I compile with mpicc -acc -ta=tesla:managed -Minfo=accel -w jacobi.c and run with mpirun -np 2 -mca pml ^ucx ./a.out

Can you help me find the error?

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PARALLEL
#define NX_GLOB     128   /* Global number of interior points */
#define NY_GLOB     128  /* Global number of interior points */
#define NGHOST   1
#define NDIM     2

#ifdef PARALLEL
  #include <mpi.h>
  MPI_Comm MPI_COMM_CART;
#endif

typedef struct MPI_Decomp_{
  int nprocs[NDIM];     /*  Number of processors in each dimension */
  int periods[NDIM];    /*  periodicity flag in each dimension     */
  int coords[NDIM];     /*  Cartesian coordinate in the MPI topology */
  int gsize[NDIM];      /*  Global domain size (no ghosts)  */
  int lsize[NDIM];      /*  Local domain size (no ghosts)   */
  int start[NDIM];      /*  Local start index in each dimension           */
  int procL[NDIM];      /*  Rank of left-lying  process in each direction */
  int procR[NDIM];      /*  Rank of right-lying process in each direction */
  int rank;             /*  Local process rank */
  int size;             /*  Communicator size  */
} MPI_Decomp;



void BoundaryConditions(double **,double *,double *,int,int,MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **,MPI_Decomp *);
double **Allocate_2DdblArray(int,int);
int    **Allocate_2DintArray(int,int);
void   Show_2DdblArray(double **,const char *);
void   Show_2DintArray(int **,const char *);

int nx_tot,ny_tot;

int main(int argc,char ** argv)
{
  int    nx,i,ibeg,iend;
  int    ny,j,jbeg,jend;
  int    k,rank=0,size=1;
  double xbeg = 0.0,xend = 1.0;
  double ybeg = 0.0,yend = 1.0;
  double dx = (xend - xbeg)/(NX_GLOB + 1);
  double dy = (yend - ybeg)/(NY_GLOB + 1);
  double *xg,*yg,*x,*y,**phi,**phi0;
  double err,tol;
  MPI_Decomp  mpi_decomp;
  double err_glob;
  int procL[NDIM] = {-1,-1};
  int procR[NDIM] = {-1,-1};

/* --------------------------------------------------------
   0. Initialize the MPI execution environment
   -------------------------------------------------------- */

#ifdef PARALLEL
  MPI_Datatype row_type,col_type;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  DomainDecomposition(&mpi_decomp);
  nx = mpi_decomp.lsize[0];
  ny = mpi_decomp.lsize[1];
#else
  mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
  mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
  mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
  mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif

/* --------------------------------------------------------
   1. Set local grid indices
   -------------------------------------------------------- */

  ibeg   = NGHOST;
  iend   = ibeg + nx - 1;
  nx     = iend - ibeg + 1;
  nx_tot = nx + 2*NGHOST;

  jbeg   = NGHOST;
  jend   = jbeg + ny - 1;
  ny     = jend - jbeg + 1;
  ny_tot = ny + 2*NGHOST;

/* --------------------------------------------------------
   2. Generate global and local grids
   -------------------------------------------------------- */

  xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
  yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
  for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
  for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
  #ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
  #else
  x = xg;
  y = yg;
  #endif

/* --------------------------------------------------------
   3. Allocate memory on local processor and
      assign initial conditions.
   -------------------------------------------------------- */

  phi  = Allocate_2DdblArray(ny_tot,nx_tot);
  phi0 = Allocate_2DdblArray(ny_tot,nx_tot);

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi0[j][i] = 0.0;
  }}

#ifdef PARALLEL
  MPI_Type_contiguous (nx_tot,MPI_DOUBLE,&row_type);
  MPI_Type_vector     (ny_tot,1,nx_tot,MPI_DOUBLE,&col_type);
  MPI_Type_commit (&row_type);
  MPI_Type_commit (&col_type);
#endif

/* --------------------------------------------------------
   4. Main iteration cycle
   -------------------------------------------------------- */

  tol = 1.e-5;
  err = 1.0;
  k   = 0;

  #pragma acc enter data copyin(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot],x[:NX_GLOB+2*NGHOST],y[:NX_GLOB+2*NGHOST],err,err_glob)
  while  (err > tol){

  /* -- 4a. Set boundary conditions first -- */

    BoundaryConditions(phi0,x,y,nx,ny,&mpi_decomp);

  /* -- 4b. Jacobi's method and residual (interior points) -- */

    err = 0.0;

    #pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = 0.25*(  phi0[j][i-1] + phi0[j][i+1]
                        + phi0[j-1][i] + phi0[j+1][i] );

      err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
    }}


    #pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi0[j][i] = phi[j][i];
    }}

    #ifdef PARALLEL
    // double err_glob;
    #pragma acc host_data use_device(err,err_glob)
    {
    MPI_Allreduce (&err,&err_glob,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    }
    err = err_glob;
    #endif

    // #pragma acc update host(err)
    if (rank == 0){
      printf ("k = %d; err = %8.3e\n",k,err);
    }
    k++;
  }
  #pragma acc exit data copyout(phi[:ny_tot][:nx_tot],err_glob)

  WriteSolution (phi,&mpi_decomp);

  #ifdef PARALLEL
  MPI_Finalize();
  #endif
  return 0;
}

#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
 *
 *********************************************************************** */
{
  int dim,i;
  int rank,size;
  int *coords  = mpi_decomp->coords;
  int *gsize   = mpi_decomp->gsize;
  int *lsize   = mpi_decomp->lsize;
  int *nprocs  = mpi_decomp->nprocs;
  int *periods = mpi_decomp->periods;
  int *procL   = mpi_decomp->procL;
  int *procR   = mpi_decomp->procR;
  int *start   = mpi_decomp->start;
  int new_coords[NDIM];

/* --------------------------------------------------------
   1. Get rank & size
   -------------------------------------------------------- */

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  mpi_decomp->rank = rank;
  mpi_decomp->size = size;

/* --------------------------------------------------------
   2. Obtain number of processor along each dimension.
      Use maximally squared decomp.
   -------------------------------------------------------- */

  nprocs[0] = (int)sqrt(size);
  nprocs[1] = size/nprocs[0];
  if (nprocs[0]*nprocs[1] != size){
    if (rank == 0) printf ("! Cannot decompose\n");
    MPI_Finalize();
    exit(1);
  }
  if (rank == 0){
    printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
  }

  periods[0] = 0;
  periods[1] = 0;

/* --------------------------------------------------------
   3. Create Cartesian topology
   -------------------------------------------------------- */

  MPI_Cart_create(MPI_COMM_WORLD,NDIM,nprocs,periods,&MPI_COMM_CART);
  MPI_Cart_get(MPI_COMM_CART,NDIM,nprocs,periods,coords);

/* --------------------------------------------------------
   4. Fill structure members
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = NX_GLOB/nprocs[0];
  lsize[1] = NY_GLOB/nprocs[1];
  start[0] = coords[0]*lsize[0];
  start[1] = coords[1]*lsize[1];

/* --------------------------------------------------------
   5. Determine ranks of neighbour processors
   -------------------------------------------------------- */

  for (dim = 0; dim < NDIM; dim++) {
    for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];

    new_coords[dim] = coords[dim] + 1;
    if (new_coords[dim] < nprocs[dim]) {
      MPI_Cart_rank ( MPI_COMM_CART,new_coords,&(procR[dim]) );
    } else {
      procR[dim] = MPI_PROC_NULL;
    }

    new_coords[dim] = coords[dim] - 1;
    if (new_coords[dim] >= 0) {
     MPI_Cart_rank ( MPI_COMM_CART,new_coords,&(procL[dim]) );
    } else {
      procL[dim] = MPI_PROC_NULL;
    }
  }

/* --------------------------------------------------------
   6. Print processor information.
      (Use MPI_Bcast() to print in sequence)
   -------------------------------------------------------- */

  int proc,go;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go,1,MPI_INT,0,MPI_COMM_WORLD);
    if (rank == go) {
      printf ("[Rank %d]\n",rank);
      printf ("  coords = [%d,%d],lsize = [%d,%d]\n",coords[0],coords[1],lsize[0],lsize[1]);
      for (dim = 0; dim < NDIM; dim++){
        printf ("  (procL,procR)[%d] = %d,%d\n",dim,procL[dim],procR[dim]);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  return;
}
#endif

/* ********************************************************************* */
void BoundaryConditions(double **phi,double *x,double *y,int nx,int ny,MPI_Decomp *mpi_decomp)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  int *procL = mpi_decomp->procL;
  int *procR = mpi_decomp->procR;
#ifdef PARALLEL
  int rank = mpi_decomp->rank;
  int size = mpi_decomp->size;
  double send_buf[NX_GLOB + 2*NGHOST];
  double recv_buf[NX_GLOB + 2*NGHOST];

/*  Used for testing
    for (j = 0; j <= jend+1; j++){
    for (i = 0; i <= iend+1; i++){
      phi[j][i] = -1;
    }}

    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = rank;
    }}
*/

  #pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST],recv_buf[:NX_GLOB+2*NGHOST])

  // Left buffer
  i = ibeg;

  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  {
  MPI_Sendrecv (send_buf,jend+1,MPI_DOUBLE,procL[0],0,recv_buf,jend+1,MPI_DOUBLE,procL[0],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];

  // Right buffer
  i = iend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,jend+1,MPI_DOUBLE,procR[0],0,recv_buf,jend+1,MPI_DOUBLE,procR[0],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];


  // Bottom buffer
  j = jbeg;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  // #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,iend+1,MPI_DOUBLE,procL[1],0,recv_buf,iend+1,MPI_DOUBLE,procL[1],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];

  // Top buffer
  j = jend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,iend+1,MPI_DOUBLE,procR[1],0,recv_buf,iend+1,MPI_DOUBLE,procR[1],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];

  #pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST],recv_buf[:NX_GLOB+2*NGHOST])

#endif

/* -- Left -- */

  if (procL[0] < 0){
    i = ibeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],y[:NY_GLOB+2*NGHOST])
    for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
  }

/* -- Right -- */

  if (procR[0] < 0){
    i = iend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],y[:NY_GLOB+2*NGHOST])
    for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
  }

/* -- Bottom -- */

  if (procL[1] < 0){
    j = jbeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],x[:NX_GLOB+2*NGHOST])
    for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
  }

/* -- Top -- */

  if (procR[1] < 0){
    j = jend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],x[:NX_GLOB+2*NGHOST])
    for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
  }

  return;

#ifdef PARALLEL
// Print
  MPI_Barrier(MPI_COMM_WORLD);
  int go,proc;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go,1,MPI_INT,0,MPI_COMM_WORLD);

    if (rank == go) {
      printf ("Boundary [Rank %d]\n",rank);
      for (j = jend+1; j >= 0; j--){
        for (i = 0; i <= iend+1; i++){
          printf ("%6.2f  ",phi[j][i]);
        }
        printf ("\n");
      }
    }
  }

MPI_Finalize();
exit(0);
#endif
}

/* ********************************************************************* */
void WriteSolution (double **phi,MPI_Decomp *md)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + md->lsize[0] - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + md->lsize[1] - 1;

  static int nfile = 0;
  char fname[32];

  sprintf (fname,"laplace2D_MPIACC.txt",nfile);

/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
  phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
  phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
  MPI_File fh;
  MPI_Datatype type_local,type_domain;
  int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  int gsize[2],lsize[2],start[2];

/* --------------------------------------------------------
   1. Create a local array type without the ghost zones
      This datatype will be passed to MPI_File_write()
   -------------------------------------------------------- */

  gsize[0] = md->lsize[0] + 2*NGHOST;
  gsize[1] = md->lsize[1] + 2*NGHOST;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = NGHOST;
  start[1] = NGHOST;

  MPI_Type_create_subarray (NDIM,gsize,lsize,start,MPI_ORDER_FORTRAN,MPI_DOUBLE,&type_local);
  MPI_Type_commit (&type_local);

/* --------------------------------------------------------
   2. Create the subarry in the global domain.
      This datatype is used to set the file view.
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = lsize[0]*md->coords[0];  // equal to md->start[0]
  start[1] = lsize[1]*md->coords[1];  // equal to md->start[1]

  MPI_Type_create_subarray (NDIM,gsize,lsize,start,MPI_ORDER_FORTRAN,MPI_DOUBLE,&type_domain);
  MPI_Type_commit (&type_domain);

/* --------------------------------------------------------
   3. Write to disk
   -------------------------------------------------------- */

  MPI_File_delete(fname,MPI_INFO_NULL);
  MPI_File_open(MPI_COMM_CART,fname,amode,MPI_INFO_NULL,&fh);
  MPI_File_set_view(fh,0,MPI_DOUBLE,type_domain,"native",MPI_INFO_NULL);
  MPI_File_write_all(fh,phi[0],1,type_local,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  MPI_Type_free (&type_local);
  MPI_Type_free (&type_domain);
#else
  FILE *fp;
  printf ("> Writing %s\n",fname);
  fp = fopen(fname,"wb");

  for (j = jbeg; j <= jend; j++){
    fwrite (phi[j] + ibeg,sizeof(double),md->lsize[0],fp);
  }

  fclose(fp);
#endif

  nfile++;
}

/* ********************************************************************* */
double **Allocate_2DdblArray(int nx,int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  double **buf;

  buf    = (double **)malloc (nx*sizeof(double *));
  buf[0] = (double *) malloc (nx*ny*sizeof(double));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx,int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  int **buf;

  buf    = (int **)malloc (nx*sizeof(int *));
  buf[0] = (int *) malloc (nx*ny*sizeof(int));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}


/* ********************************************************************* */
void Show_2DdblArray(double **A,const char *string)
/*
 *********************************************************************** */
{
  int i,j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f  ",A[i][j]);
    }
    printf ("\n");
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A,const char *string)
/*
 *********************************************************************** */
{
  int i,j;

  printf ("%s\n",string);
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");

  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d  ",A[i][j]);
    }
    printf ("\n");
  }

  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}

Solution

Thanks for updating the example. There are a few issues here.

First, regarding "err" and "err_glob": at the beginning of the loop you set "err = 0" on the host but never update it on the device. Then, after the MPI_Allreduce call, you set "err = err_glob" on the host again, so "err_glob" needs to be updated on the host first.
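
To make that concrete, here is a stripped-down, self-contained sketch of just this pattern (my own minimal example rather than code from the program above; it assumes a CUDA-aware MPI and an OpenACC compiler, e.g. mpicc -acc example.c):

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
  int rank, i;
  double err, err_glob;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  #pragma acc enter data create(err, err_glob)

  err = 0.0;
  #pragma acc update device(err)   /* mirror the host assignment on the device */

  #pragma acc parallel loop reduction(+:err)
  for (i = 0; i < 1000; i++) err += 1.e-3;   /* stand-in for the Jacobi residual */
  /* The reduction result now lives in the device copy of err; the host copy is stale. */

  #pragma acc host_data use_device(err, err_glob)
  {
  MPI_Allreduce(&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);  /* device addresses */
  }

  #pragma acc update host(err_glob)   /* fetch the reduced value before using it on the host */
  err = err_glob;

  if (rank == 0) printf("global err = %g\n", err);

  #pragma acc exit data delete(err, err_glob)
  MPI_Finalize();
  return 0;
}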

The second issue is that the code was getting partially wrong values for "y" when run with multiple ranks. The problem is that you are copying with the global sizes rather than the local sizes of "x" and "y", so when "y" is copied it overlaps with "x" because of the offsets. I fixed this by copying "xg" and "yg" to the device instead.
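
In the listing below, the only change for this is that the device copy is made from the base arrays xg and yg over their full global extents, while x and y stay as host-side offset pointers into that present data (excerpt from the code below, with a comment added):

  /* Copy the full global coordinate arrays; x and y are just offset pointers into them,
     so copying x[:N] and y[:N] with global sizes would overlap on the device. */
  #pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
  #ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
  #else
  x = xg;
  y = yg;
  #endif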

As for the performance relative to the CPU, the main problem here is that the sizes are small, so the code under-utilizes the GPU. I increased the GLOB sizes to 4096 and saw much better relative performance, although the code then converges faster.

I also took the liberty of adding some boilerplate code that I use for rank-to-device assignment, so the code can take advantage of multiple GPUs.
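
That boilerplate amounts to the following self-contained sketch (my own condensed version of the block in the listing below; it assumes MPI-3 for MPI_Comm_split_type and the OpenACC runtime API):

#include <stdio.h>
#include <mpi.h>
#include <openacc.h>

int main(int argc, char **argv)
{
  MPI_Comm shmcomm;
  int local_rank, num_devices, gpuId;
  acc_device_t device_type;

  MPI_Init(&argc, &argv);

  /* Node-local rank: split the world communicator by shared-memory node */
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
  MPI_Comm_rank(shmcomm, &local_rank);

  /* Device number = local rank mod number of devices on the node */
  device_type = acc_get_device_type();
  num_devices = acc_get_num_devices(device_type);
  if (num_devices > 0) {
    gpuId = local_rank % num_devices;
    acc_set_device_num(gpuId, device_type);
    acc_init(device_type);
    printf("local rank %d -> device %d of %d\n", local_rank, gpuId, num_devices);
  }

  MPI_Comm_free(&shmcomm);
  MPI_Finalize();
  return 0;
}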

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PARALLEL
#define NX_GLOB     128   /* Global number of interior points */
#define NY_GLOB     128  /* Global number of interior points */
#define NGHOST   1
#define NDIM     2

#ifdef PARALLEL
  #include <mpi.h>
  MPI_Comm MPI_COMM_CART;
#endif

#ifdef _OPENACC
  #include <openacc.h>
#endif

typedef struct MPI_Decomp_{
  int nprocs[NDIM];     /*  Number of processors in each dimension */
  int periods[NDIM];    /*  Periodicity flag in each dimension     */
  int coords[NDIM];     /*  Cartesian coordinate in the MPI topology */
  int gsize[NDIM];      /*  Global domain size (no ghosts)  */
  int lsize[NDIM];      /*  Local domain size (no ghosts)   */
  int start[NDIM];      /*  Local start index in each dimension           */
  int procL[NDIM];      /*  Rank of left-lying  process in each direction */
  int procR[NDIM];      /*  Rank of right-lying process in each direction */
  int rank;             /*  Local process rank */
  int size;             /*  Communicator size  */
} MPI_Decomp;



void BoundaryConditions(double **,double *,double *,int,int,MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **,MPI_Decomp *);
double **Allocate_2DdblArray(int,int);
int    **Allocate_2DintArray(int,int);
void   Show_2DdblArray(double **,const char *);
void   Show_2DintArray(int **,const char *);

int nx_tot,ny_tot;

int main(int argc,char ** argv)
{
  int    nx,i,ibeg,iend;
  int    ny,j,jbeg,jend;
  int    k,rank=0,size=1;
  int xsize,ysize;
  double xbeg = 0.0,xend = 1.0;
  double ybeg = 0.0,yend = 1.0;
  double dx = (xend - xbeg)/(NX_GLOB + 1);
  double dy = (yend - ybeg)/(NY_GLOB + 1);
  double *xg,*yg,*x,*y,**phi,**phi0;
  double err,tol;
  MPI_Decomp  mpi_decomp;
  double err_glob;
  int procL[NDIM] = {-1,-1};
  int procR[NDIM] = {-1,-1};

/* --------------------------------------------------------
   0. Initialize the MPI execution environment
   -------------------------------------------------------- */

#ifdef PARALLEL
  MPI_Datatype row_type,col_type;

  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  DomainDecomposition(&mpi_decomp);
  nx = mpi_decomp.lsize[0];
  ny = mpi_decomp.lsize[1];
#else
  mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
  mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
  mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
  mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif

#ifdef _OPENACC
/* -------------------------------------------------------
   0. Set the device for each rank
   ------------------------------------------------------- */
  int device_type,num_devices;
  int gpuId;
  MPI_Comm shmcomm;
  int local_rank;

// Get the local rank number
  MPI_Comm_split_type(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,&shmcomm);
  MPI_Comm_rank(shmcomm,&local_rank);

// Device num = local rank mod number of devices on the node
  device_type = acc_get_device_type();
  num_devices = acc_get_num_devices(device_type);
  gpuId = local_rank % num_devices;
  acc_set_device_num(gpuId,device_type);
  acc_init(device_type);
#endif

/* --------------------------------------------------------
   1. Set local grid indices
   -------------------------------------------------------- */

  ibeg   = NGHOST;
  iend   = ibeg + nx - 1;
  nx     = iend - ibeg + 1;
  nx_tot = nx + 2*NGHOST;

  jbeg   = NGHOST;
  jend   = jbeg + ny - 1;
  ny     = jend - jbeg + 1;
  ny_tot = ny + 2*NGHOST;

/* --------------------------------------------------------
   2. Generate global and local grids
   -------------------------------------------------------- */

  xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
  yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
  for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
  for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;
#pragma acc enter data copyin(xg[:NX_GLOB+2*NGHOST],yg[:NY_GLOB+2*NGHOST])
  #ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
  #else
  x = xg;
  y = yg;
  #endif

/* --------------------------------------------------------
   3. Allocate memory on local processor and
      assign initial conditions.
   -------------------------------------------------------- */

  phi  = Allocate_2DdblArray(ny_tot,nx_tot);
  phi0 = Allocate_2DdblArray(ny_tot,nx_tot);

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi0[j][i] = 0.0;
  }}

#ifdef PARALLEL
  MPI_Type_contiguous (nx_tot,MPI_DOUBLE,&row_type);
  MPI_Type_vector     (ny_tot,1,nx_tot,MPI_DOUBLE,&col_type);
  MPI_Type_commit (&row_type);
  MPI_Type_commit (&col_type);
#endif

/* --------------------------------------------------------
   4. Main iteration cycle
   -------------------------------------------------------- */

  tol = 1.e-5;
  err = 1.0;
  k   = 0;

  //#pragma acc enter data copyin(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot],x[:NX_GLOB+2*NGHOST],y[:NX_GLOB+2*NGHOST])
  #pragma acc enter data copyin(phi[:ny_tot][:nx_tot],err,err_glob)
 while  (err > tol){

  /* -- 4a. Set boundary conditions first -- */

    BoundaryConditions(phi0,x,y,nx,ny,&mpi_decomp);

  /* -- 4b. Jacobi's method and residual (interior points) -- */

    err = 0.0;
#pragma acc update device(err)

    #pragma acc parallel loop collapse(2) reduction(+:err) present(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = 0.25*(  phi0[j][i-1] + phi0[j][i+1]
                        + phi0[j-1][i] + phi0[j+1][i] );

      err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
    }}


    #pragma acc parallel loop collapse(2) present(phi[:ny_tot][:nx_tot],phi0[:ny_tot][:nx_tot])
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi0[j][i] = phi[j][i];
    }}

    #ifdef PARALLEL
    // double err_glob;
    #pragma acc host_data use_device(err,err_glob)
    {
    MPI_Allreduce (&err,&err_glob,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
    }
    #pragma acc update host(err_glob)
    err = err_glob;
    #endif

    if (rank == 0){
      printf ("k = %d; err = %8.3e\n",k,err);
    }
    k++;
  }
  #pragma acc exit data copyout(phi[:ny_tot][:nx_tot],err_glob)

  WriteSolution (phi,&mpi_decomp);

  #ifdef PARALLEL
  MPI_Finalize();
  #endif
  return 0;
}

#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
 *
 *********************************************************************** */
{
  int dim,i;
  int rank,size;
  int *coords  = mpi_decomp->coords;
  int *gsize   = mpi_decomp->gsize;
  int *lsize   = mpi_decomp->lsize;
  int *nprocs  = mpi_decomp->nprocs;
  int *periods = mpi_decomp->periods;
  int *procL   = mpi_decomp->procL;
  int *procR   = mpi_decomp->procR;
  int *start   = mpi_decomp->start;
  int new_coords[NDIM];

/* --------------------------------------------------------
   1. Get rank & size
   -------------------------------------------------------- */

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  mpi_decomp->rank = rank;
  mpi_decomp->size = size;

/* --------------------------------------------------------
   2. Obtain number of processor along each dimension.
      Use maximally squared decomp.
   -------------------------------------------------------- */

  nprocs[0] = (int)sqrt(size);
  nprocs[1] = size/nprocs[0];
  if (nprocs[0]*nprocs[1] != size){
    if (rank == 0) printf ("! Cannot decompose\n");
    MPI_Finalize();
    exit(1);
  }
  if (rank == 0){
    printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
  }

  periods[0] = 0;
  periods[1] = 0;

/* --------------------------------------------------------
   3. Create Cartesian topology
   -------------------------------------------------------- */

  MPI_Cart_create(MPI_COMM_WORLD,NDIM,nprocs,periods,&MPI_COMM_CART);
  MPI_Cart_get(MPI_COMM_CART,NDIM,nprocs,periods,coords);

/* --------------------------------------------------------
   4. Fill structure members
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = NX_GLOB/nprocs[0];
  lsize[1] = NY_GLOB/nprocs[1];
  start[0] = coords[0]*lsize[0];
  start[1] = coords[1]*lsize[1];

/* --------------------------------------------------------
   5. Determine ranks of neighbour processors
   -------------------------------------------------------- */

  for (dim = 0; dim < NDIM; dim++) {
    for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];

    new_coords[dim] = coords[dim] + 1;
    if (new_coords[dim] < nprocs[dim]) {
      MPI_Cart_rank ( MPI_COMM_CART,new_coords,&(procR[dim]) );
    } else {
      procR[dim] = MPI_PROC_NULL;
    }

    new_coords[dim] = coords[dim] - 1;
    if (new_coords[dim] >= 0) {
     MPI_Cart_rank ( MPI_COMM_CART,new_coords,&(procL[dim]) );
    } else {
      procL[dim] = MPI_PROC_NULL;
    }
  }

/* --------------------------------------------------------
   6. Print processor information.
      (Use MPI_Bcast() to print in sequence)
   -------------------------------------------------------- */

  int proc,go;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go,1,MPI_INT,0,MPI_COMM_WORLD);
    if (rank == go) {
      printf ("[Rank %d]\n",rank);
      printf ("  coords = [%d,%d],lsize = [%d,%d]\n",coords[0],coords[1],lsize[0],lsize[1]);
      for (dim = 0; dim < NDIM; dim++){
        printf ("  (procL,procR)[%d] = %d,%d\n",dim,procL[dim],procR[dim]);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  return;
}
#endif

/* ********************************************************************* */
void BoundaryConditions(double **phi,double *x,double *y,int nx,int ny,MPI_Decomp *mpi_decomp)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + nx - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + ny - 1;

  int *procL = mpi_decomp->procL;
  int *procR = mpi_decomp->procR;
#ifdef PARALLEL
  int rank = mpi_decomp->rank;
  int size = mpi_decomp->size;
  double send_buf[NX_GLOB + 2*NGHOST];
  double recv_buf[NX_GLOB + 2*NGHOST];

/*  Used for testing
    for (j = 0; j <= jend+1; j++){
    for (i = 0; i <= iend+1; i++){
      phi[j][i] = -1;
    }}

    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = rank;
    }}
*/

  #pragma acc enter data create(send_buf[:NX_GLOB+2*NGHOST],recv_buf[:NX_GLOB+2*NGHOST])

  // Left buffer
  i = ibeg;

  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  {
  MPI_Sendrecv (send_buf,jend+1,MPI_DOUBLE,procL[0],0,recv_buf,jend+1,MPI_DOUBLE,procL[0],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  }
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];

  // Right buffer
  i = iend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,jend+1,MPI_DOUBLE,procR[0],0,recv_buf,jend+1,MPI_DOUBLE,procR[0],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];


  // Bottom buffer
  j = jbeg;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  // #pragma acc update self(send_buf[:NX_GLOB+2*NGHOST])
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,iend+1,MPI_DOUBLE,procL[1],0,recv_buf,iend+1,MPI_DOUBLE,procL[1],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];

  // Top buffer
  j = jend;
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],send_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  #pragma acc host_data use_device(send_buf,recv_buf)
  MPI_Sendrecv (send_buf,iend+1,MPI_DOUBLE,procR[1],0,recv_buf,iend+1,MPI_DOUBLE,procR[1],0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],recv_buf[:NX_GLOB+2*NGHOST])
  for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];

  #pragma acc exit data copyout(send_buf[:NX_GLOB+2*NGHOST],recv_buf[:NX_GLOB+2*NGHOST])

#endif

/* -- Left -- */

  if (procL[0] < 0){
    i = ibeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],y)
    for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
  }

/* -- Right -- */

  if (procR[0] < 0){
    i = iend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],y)
    for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
  }

/* -- Bottom -- */

  if (procL[1] < 0){
    j = jbeg-1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],x)
    for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
  }

/* -- Top -- */

  if (procR[1] < 0){
    j = jend+1;
    #pragma acc parallel loop present(phi[:ny_tot][:nx_tot],x)
    for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
  }

  return;

#ifdef PARALLEL
// Print
  MPI_Barrier(MPI_COMM_WORLD);
  int go,proc;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go,1,MPI_INT,0,MPI_COMM_WORLD);

    if (rank == go) {
      printf ("Boundary [Rank %d]\n",rank);
      for (j = jend+1; j >= 0; j--){
        for (i = 0; i <= iend+1; i++){
          printf ("%6.2f  ",phi[j][i]);
        }
        printf ("\n");
      }
    }
  }

MPI_Finalize();
exit(0);
#endif
}

/* ********************************************************************* */
void WriteSolution (double **phi,MPI_Decomp *md)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg   = NGHOST;
  int iend   = ibeg + md->lsize[0] - 1;

  int jbeg   = NGHOST;
  int jend   = jbeg + md->lsize[1] - 1;

  static int nfile = 0;
  char fname[32];

  sprintf (fname,"laplace2D_MPIACC.txt",nfile);

/*
for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
  phi[j][i] = -1;
}
for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
  phi[j][i] = md->rank;
}
*/
#ifdef PARALLEL
  MPI_File fh;
  MPI_Datatype type_local,type_domain;
  int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  int gsize[2],lsize[2],start[2];

/* --------------------------------------------------------
   1. Create a local array type without the ghost zones
      This datatype will be passed to MPI_File_write()
   -------------------------------------------------------- */

  gsize[0] = md->lsize[0] + 2*NGHOST;
  gsize[1] = md->lsize[1] + 2*NGHOST;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = NGHOST;
  start[1] = NGHOST;

  MPI_Type_create_subarray (NDIM,gsize,lsize,start,MPI_ORDER_FORTRAN,MPI_DOUBLE,&type_local);
  MPI_Type_commit (&type_local);

/* --------------------------------------------------------
   2. Create the subarry in the global domain.
      This datatype is used to set the file view.
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;

  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];

  start[0] = lsize[0]*md->coords[0];  // equal to md->start[0]
  start[1] = lsize[1]*md->coords[1];  // equal to md->start[1]

  MPI_Type_create_subarray (NDIM,gsize,lsize,start,MPI_ORDER_FORTRAN,MPI_DOUBLE,&type_domain);
  MPI_Type_commit (&type_domain);

/* --------------------------------------------------------
   3. Write to disk
   -------------------------------------------------------- */

  MPI_File_delete(fname,MPI_INFO_NULL);
  MPI_File_open(MPI_COMM_CART,fname,amode,MPI_INFO_NULL,&fh);
  MPI_File_set_view(fh,0,MPI_DOUBLE,type_domain,"native",MPI_INFO_NULL);
  MPI_File_write_all(fh,phi[0],1,type_local,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);
  MPI_Type_free (&type_local);
  MPI_Type_free (&type_domain);
#else
  FILE *fp;
  printf ("> Writing %s\n",fname);
  fp = fopen(fname,"wb");

  for (j = jbeg; j <= jend; j++){
    fwrite (phi[j] + ibeg,sizeof(double),md->lsize[0],fp);
  }

  fclose(fp);
#endif

  nfile++;
}

/* ********************************************************************* */
double **Allocate_2DdblArray(int nx,int ny)
/*
 * Allocate memory for a double precision array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  double **buf;

  buf    = (double **)malloc (nx*sizeof(double *));
  buf[0] = (double *) malloc (nx*ny*sizeof(double));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}
/* ********************************************************************* */
int **Allocate_2DintArray(int nx,int ny)
/*
 * Allocate memory for an integer-type array with
 * nx rows and ny columns
 *********************************************************************** */
{
  int i,j;
  int **buf;

  buf    = (int **)malloc (nx*sizeof(int *));
  buf[0] = (int *) malloc (nx*ny*sizeof(int));
  for (j = 1; j < nx; j++) buf[j] = buf[j-1] + ny;

  return buf;
}


/* ********************************************************************* */
void Show_2DdblArray(double **A,const char *string)
/*
 *********************************************************************** */
{
  int i,j;

  printf ("%s\n",string);
  printf ("------------------------------\n");
  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%8.2f  ",A[i][j]);
    }
    printf ("\n");
  }
  printf ("------------------------------\n");
}
/* ********************************************************************* */
void Show_2DintArray(int **A,const char *string)
/*
 *********************************************************************** */
{
  int i,j;

  printf ("%s\n",string);
  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");

  for (i = 0; i < nx; i++) {
    for (j = 0; j < ny; j++) {
      printf ("%03d  ",A[i][j]);
    }
    printf ("\n");
  }

  for (j = 0; j < ny; j++) printf ("-----");
  printf ("\n");
}