#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
/* Number of processes in MPI_COMM_WORLD; filled in by MPI_Comm_size() in main(). */
static int nproc;
/* Rank of this process in MPI_COMM_WORLD; filled in by MPI_Comm_rank() in main(). */
static int iproc;

/*
 * Write a caption line followed by an N x N integer matrix to stdout.
 *
 *   mesg  caption printed on its own line before the matrix
 *   N     number of rows and of columns
 *   a     matrix elements stored row by row (N * N ints)
 *
 * Every element is printed right-aligned in a field of width 4 and is
 * followed by one space; each row ends with a newline.
 */
void print_matrix (char *mesg, int N, int *a)
{
   int row;
   int col;

   printf("%s\n", mesg);

   for (row = 0; row < N; ++row)
   {
      /* First element of the current row in the flat, row-major array. */
      const int *cells = a + (size_t) row * (size_t) N;

      for (col = 0; col < N; ++col)
      {
         printf("%4d ", cells[col]);
      }
      putchar('\n');
   }
}

void transpose(int n, int *a, int *b)
{
   int i;
   int nl;
   MPI_Datatype svec;
   MPI_Datatype s_matrix;
   MPI_Datatype rvec;
   MPI_Datatype r_matrix;
   MPI_Request *send_request;
   MPI_Request *recv_request;
   MPI_Status *send_status;
   MPI_Status *recv_status;
   nl = n / nproc;

   MPI_Type_vector(nl, 1, n, MPI_INT, &svec);
   MPI_Type_hvector(nl, 1, sizeof(int), svec, &s_matrix);
   MPI_Type_commit(&s_matrix);
   MPI_Type_contiguous(nl, MPI_INT, &rvec);
   MPI_Type_hvector(nl, 1, sizeof(int)*n, rvec, &r_matrix);

   send_request = (MPI_Request *) malloc(nproc * sizeof(MPI_Request));
   recv_request = (MPI_Request *) malloc(nproc * sizeof(MPI_Request));
   send_status = (MPI_Status *) malloc(nproc * sizeof(MPI_Status));
   recv_status = (MPI_Status *) malloc(nproc * sizeof(MPI_Status));

   for ( i = 0 ; i < nproc ; i++ )
   {
      MPI_Isend(a + i * nl, 1, s_matrix, i, 0, MPI_COMM_WORLD,
         send_request +i);
   }

   for (i = 0 ; i < nproc ; i++)
   {
      MPI_Irecv(b + i * nl, 1, r_matrix, i, 0, MPI_COMM_WORLD,
         recv_request + i);
   }

   for (i = 0 ; i < nproc ; i++)
   {
      MPI_Wait (send_request + i, send_status + i);
   }

   for (i = 0 ; i < nproc ; i++)
   {
      MPI_Wait (recv_request + i, recv_status + i);
   }

   free(send_request);
   free(recv_request);
   free(send_status);
   free(recv_status);
}

/*
 * Parse `text` as the matrix order.
 *
 * Succeeds only for a complete decimal number that is positive and whose
 * square still fits in an int (the element counts N * N and N * NL are
 * passed around as int). Returns 1 and stores the value in *order on
 * success, 0 otherwise.
 */
static int parse_matrix_order(const char *text, int *order)
{
   char *end;
   long value;

   value = strtol(text, &end, 10);
   if (end == text || *end != '\0')
   {
      return 0;
   }
   if (value <= 0 || value > INT_MAX / value)
   {
      return 0;
   }
   *order = (int) value;
   return 1;
}

/*
 * Allocate `count` ints, or abort the whole MPI job with a message naming
 * the buffer. calloc is used because it rejects a count whose byte size
 * would overflow size_t.
 */
static int *alloc_ints_or_abort(const char *what, size_t count)
{
   int *p;

   p = calloc(count, sizeof *p);
   if (p == NULL)
   {
      fprintf(stderr, "rank %d: cannot allocate %s\n", iproc, what);
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
   }
   return p;
}

/*
 * Usage: transpose <N>
 *
 * Rank 0 fills an N x N matrix with 0 .. N*N-1, scatters it by rows,
 * all ranks transpose it with transpose(), and rank 0 gathers and prints
 * the input and the result. N must be a positive multiple of the number
 * of processes.
 */
int main(int argc, char *argv[])
{
   int N = 0;
   int NL;
   int *a = NULL;
   int *a_local;
   int *b = NULL;
   int *b_local;
   int i;

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &nproc);
   MPI_Comm_rank(MPI_COMM_WORLD, &iproc);

   /*
    * Only rank 0 validates the command line and reports problems; the
    * verdict is broadcast so that all ranks agree and can shut down
    * together. N == 0 signals "invalid".
    */
   if (iproc == 0)
   {
      if (argc != 2 || !parse_matrix_order(argv[1], &N))
      {
         fprintf(stderr, "Usage: transpose <N, for a N x N matrix>\n");
         N = 0;
      }
      else if (N % nproc)
      {
         fprintf(stderr, "N must be divisible by nproc\n");
         N = 0;
      }
   }
   MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
   if (N == 0)
   {
      MPI_Finalize();
      return EXIT_FAILURE;
   }
   NL = N / nproc;

   /*
    * The full matrices are needed on the root only: MPI_Scatter reads its
    * send buffer and MPI_Gather writes its receive buffer only there.
    */
   if (iproc == 0)
   {
      a = alloc_ints_or_abort("a", (size_t) N * (size_t) N);
      b = alloc_ints_or_abort("b", (size_t) N * (size_t) N);
      for (i = 0 ; i < N * N ; i++)
      {
         a[i] = i;
      }
   }
   a_local = alloc_ints_or_abort("a_local", (size_t) N * (size_t) NL);
   b_local = alloc_ints_or_abort("b_local", (size_t) N * (size_t) NL);

   MPI_Scatter(a, N*NL, MPI_INT, a_local, N*NL, MPI_INT, 0, MPI_COMM_WORLD);
   transpose(N, a_local, b_local);
   MPI_Gather (b_local, N*NL, MPI_INT, b, N*NL, MPI_INT, 0, MPI_COMM_WORLD);
   if (iproc == 0)
   {
      print_matrix("Input",  N,  a);
      print_matrix("Output", N, b);
   }

   /* free(NULL) is a no-op, so this is safe on the non-root ranks too. */
   free(a);
   free(a_local);
   free(b);
   free(b_local);

   MPI_Finalize();

   return EXIT_SUCCESS;
}
