/*
 * Example for one-sided communication:
 * dynamic work sharing between MPI tasks using RMA windows.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#include "DistMatrix.h"

#define   SIZE   20
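
/*
 * Assumed interface of "DistMatrix.h" (not shown here), as it is used below:
 *   DistMatrix m               fields nx (global size), lx, ly (local
 *                              extents), dx (global column offset of the
 *                              local block), data (row-major local storage)
 *   INDEX(m, i, j)             element (i, j) of the local block
 *   newDistMatrix(&m, nx, ny)  allocate a column-block distributed matrix
 *   blockDistribute(n, p, id, &count, &offset)
 *                              block size and start offset of task id when
 *                              n items are distributed over p tasks
 *   deleteDistMatrix(m)        free the matrix
 */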

/* global variables = status of current task */
int       nTasks;           /* number of tasks */
int       myId;             /* my rank */


double compute(int i, int j) {
  /* dummy computation of matrix element m(i,j), i and j global indices; */
  /* the amount of work varies strongly between elements;                */
  /* returns i + j + eps, with a small eps from the delay loops below    */

  double t1, t2;
  double random, work;
  int    k;

  random = rand()/(double)RAND_MAX; /* between 0 and 1 */
  work = 2500*exp(4*j*random/SIZE);

  t1 = i + j;
  for (k=1; k<work; k++) {
       t1 = cos(t1);
  }
    
  t2 = i + j;
  for (k=1; k<work; k++) {
    t2 = sin(t2);
  }
  /* t1 has converged to the fixed point of cos (0.739085...), t2 towards 0 */
  return i + j + t1 + t2 - 0.7390851332151607;
}

void check(DistMatrix ar) {
  /* compare against the analytic global sum of a(i,j) = i+j;      */
  /* the computed entries carry a small eps, so the two sums agree */
  /* only approximately                                            */

  double  localsum, sum1, sum2;
  int     i, j, n;

  n = ar.nx;
  sum2 = n*n*n - n*n;   /* sum over 0 <= i,j < n of (i+j) = n*n*(n-1) */

  localsum = 0.0;
  for (i=0; i<ar.ly; i++) {
    for (j=0; j<ar.lx; j++) {
      localsum += INDEX(ar, i, j);
    }
  }

  MPI_Reduce(&localsum, &sum1, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  
  if (myId == 0) {
    printf("sum is       : %lf\n", sum1);
    printf("and should be: %lf\n", sum2);
  }
}

  
int get_next(int oldId, MPI_Win lockWin) {
  /* finds the next task that still has rows to work on, */
  /* searching cyclically starting after oldId;           */
  /* returns myId if no such task is found                */
  
  int   curId;
  int   last;

  /* start at next task */
  curId = (oldId + 1) % nTasks;
  while (curId != myId) {
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, curId, 0, lockWin);
    MPI_Get(&last, 1, MPI_INT, curId, 0, 1, MPI_INT, lockWin);
    MPI_Win_unlock(curId, lockWin);  /* the get is only complete after the unlock */
    if (last < SIZE) {               /* work left to do at curId */
      return curId;
    }
    curId = (curId + 1) % nTasks;
  }
  
  return curId;
}
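
/*
 * Hypothetical MPI-3 alternative (not used in this example): claim the
 * next row of task "id" with a single atomic fetch-and-add instead of
 * the lock / MPI_Get / MPI_Put sequence used below.  Sketch only; it
 * assumes a lockWin created exactly as in main() and it may increment
 * the counter past SIZE, which is harmless because rows >= SIZE are
 * never computed.
 */
int claim_next_row(int id, MPI_Win lockWin) {
  int one = 1;
  int row;

  MPI_Win_lock(MPI_LOCK_SHARED, id, 0, lockWin);
  MPI_Fetch_and_op(&one, &row, MPI_INT, id, 0, MPI_SUM, lockWin);
  MPI_Win_unlock(id, lockWin);   /* completes the fetch-and-op */
  return row;                    /* row >= SIZE means no work left at id */
}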



int main(int argc, char *argv[]) {

  DistMatrix     ar;
  MPI_Win        arWin, lockWin;   /* windows for array and array status */
  int *          lastp;  /* next row of local array to work on */
  int            curRow, nextRow, col, count, offset;
  int            freeId;
  double         *remoteRow;


  /*
   * initialize everything
   */
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &nTasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &myId);
  newDistMatrix(&ar, SIZE, SIZE);
  
  /* get memory for lockable last pointer */
  MPI_Alloc_mem(sizeof(int), MPI_INFO_NULL, &lastp); 

  /* create windows for remote access: lockWin exposes the row counter, */
  /* arWin exposes the local block of the distributed matrix            */
  MPI_Win_create(lastp, sizeof(int), sizeof(int), 
		 MPI_INFO_NULL, MPI_COMM_WORLD, &lockWin);
  MPI_Win_create(ar.data, ar.lx*ar.ly*sizeof(double), sizeof(double),
		 MPI_INFO_NULL, MPI_COMM_WORLD, &arWin);

  *lastp = 0;                   /* start with first row */
  MPI_Barrier(MPI_COMM_WORLD);  /* all windows and counters initialised before any RMA */
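
  /*
   * Work sharing scheme: *lastp is a shared row counter exposed through
   * lockWin.  The owner and any idle remote task claim rows by reading
   * and advancing this counter under an exclusive lock; rows computed
   * remotely are written back into the owner's block through arWin.
   */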

  /* 
   * do the local work
   */
  while (1) {
    /* get next line and update pointer */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, myId, 0, lockWin);
    curRow = *lastp;
    if (curRow >= SIZE) {  /* ready with local part */
      MPI_Win_unlock(myId, lockWin);
      break;
    }
    (*lastp)++;
    MPI_Win_unlock(myId, lockWin);

    /* compute the current row */
    for (col = 0; col < ar.lx; col++) {
      INDEX(ar, curRow, col) = compute(curRow, ar.dx + col);
    }
    printf("--- Task %2d: computed row %d locally\n", myId, curRow);
  }
  printf("Task %2d: ready with local work\n", myId);

  /*
   * do remote work
   */

  /* get space for one remote row; task 0 has the largest column count */
  blockDistribute(ar.nx, nTasks, 0, &count, &offset);
  if ( (remoteRow = (double *) malloc(count * sizeof(double))) == NULL) {
      perror("malloc");
      MPI_Finalize();
      exit(-1);
  }

  /* find other task with work to do */
  freeId = myId;
  while ( (freeId = get_next(freeId, lockWin)) != myId ) {
    /* do the work */
    while (1) {
      /* atomically claim the next row of freeId's block */
      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, freeId, 0, lockWin);
      MPI_Get(&curRow, 1, MPI_INT, freeId, 0, 1, MPI_INT, lockWin);
      MPI_Win_flush(freeId, lockWin);  /* complete the get before reading curRow (MPI-3) */
      if (curRow >= SIZE) {            /* no work left at freeId */
        MPI_Win_unlock(freeId, lockWin);
        break;                         /* look for the next busy task */
      }
      /* update the remote row counter */
      nextRow = curRow + 1;
      MPI_Put(&nextRow, 1, MPI_INT, freeId, 0, 1, MPI_INT, lockWin);
      MPI_Win_unlock(freeId, lockWin);

      /* do the work and put the results back to the remote task; */
      /* remember to use the index values (count, offset) of freeId! */
      blockDistribute(ar.nx, nTasks, freeId, &count, &offset);
      for (col = 0; col < count; col++) {
        remoteRow[col] = compute(curRow, offset + col);
      }
      printf("--- Task %2d: computed row %d from Task %d\n",
             myId, curRow, freeId);
      /* the put needs its own access epoch on arWin */
      MPI_Win_lock(MPI_LOCK_SHARED, freeId, 0, arWin);
      MPI_Put(remoteRow, count, MPI_DOUBLE,
              freeId, count*curRow, count, MPI_DOUBLE, arWin);
      MPI_Win_unlock(freeId, arWin);   /* completes the put */
    }
    printf("Task %2d: no more work at Task %d\n", myId, freeId);
  }

  MPI_Barrier(MPI_COMM_WORLD);  /* all computations done before check */
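
  /*
   * Assumption: with the "separate" RMA memory model, values written into
   * the local window by remote MPI_Put calls are only guaranteed to be
   * visible to local loads after a synchronisation on arWin.  A lock and
   * unlock of the own window portion is one portable way to obtain that
   * guarantee before check() reads the array.
   */
  MPI_Win_lock(MPI_LOCK_SHARED, myId, 0, arWin);
  MPI_Win_unlock(myId, arWin);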

  /*
   * check results
   */
  check(ar);

  /* 
   * free all resources
   */
  free(remoteRow);
  MPI_Win_free(&lockWin);
  MPI_Win_free(&arWin);
  deleteDistMatrix(ar);
  MPI_Free_mem(lastp);
  MPI_Finalize();
  return 0;
}