oomph-lib: preconditioner_array.cc Source File

Go to the documentation of this file.
 //LIC// ====================================================================
 //LIC// This file forms part of oomph-lib, the object-oriented, 
 //LIC// multi-physics finite-element library, available 
 //LIC// at http://www.oomph-lib.org.
 //LIC// 
 //LIC//    Version 1.0; svn revision $LastChangedRevision$
 //LIC//
 //LIC// $LastChangedDate$
 //LIC// 
 //LIC// Copyright (C) 2006-2016 Matthias Heil and Andrew Hazel
 //LIC// 
 //LIC// This library is free software; you can redistribute it and/or
 //LIC// modify it under the terms of the GNU Lesser General Public
 //LIC// License as published by the Free Software Foundation; either
 //LIC// version 2.1 of the License, or (at your option) any later version.
 //LIC// 
 //LIC// This library is distributed in the hope that it will be useful,
 //LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
 //LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 //LIC// Lesser General Public License for more details.
 //LIC// 
 //LIC// You should have received a copy of the GNU Lesser General Public
 //LIC// License along with this library; if not, write to the Free Software
 //LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 //LIC// 02110-1301  USA.
 //LIC// 
 //LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
 //LIC// 
 //LIC//====================================================================
 
 // Config header generated by autoconfig
 #ifdef HAVE_CONFIG_H
 #include <oomph-lib-config.h>
 #endif
 
 // Preconditioner array is only useful if we have mpi, otherwise a dummy
 // implmentation is used and this file doesn't need to implement anything
 // (see the header file).
 #ifdef OOMPH_HAS_MPI
 
 //oomph-lib includes
 #include "preconditioner_array.h"
 
 namespace oomph
 {
 
 
  //============================================================================
  /// Setup the preconditioners. Sets up each preconditioner in the
  /// array for the corresponding matrix in the vector matrix_pt.
  /// The number of preconditioners in the array is taken to be the length of
  /// prec_pt.
  //============================================================================
  void PreconditionerArray::setup_preconditioners
  (Vector<CRDoubleMatrix*> matrix_pt,
   Vector<Preconditioner*> prec_pt,
   const OomphCommunicator* comm_pt)
  {
   // clean memory
   this->clean_up_memory();
 
   // get the number of preconditioners in the array
   Nprec = prec_pt.size();
 
 #ifdef PARANOID
   // check that the preconditioners have been set
   if (Nprec < 2)
    {
     std::ostringstream error_message;
     error_message << "The PreconditionerArray requires at least 2 "
                   << "preconditioners";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
   // first check that the vector matrix_pt is the correct length
   if (matrix_pt.size() != Nprec)
    {
     std::ostringstream error_message;
     error_message << "The same number of preconditioners and matrices must "
                   << "be passed to the setup_preconditioners(...).";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
 
   // Resize the storage of the PARANOID check distributions
   // Already cleared by clean_up_memory call at top of function
   Distribution_pt.resize(Nprec);
 #endif
 
   // for each matrix...  PARANOID and store copy of global communicator
   for (unsigned i = 0; i < Nprec; i++)
    {
 
 #ifdef PARANOID
     // paranoid check that each matrix is a CRDoubleMatrix and that
     // it is built
     if (matrix_pt[i] == 0)
      {
       std::ostringstream error_message;
       error_message << "matrix_pt[" << i << "] = NULL.";
       throw OomphLibError(error_message.str(),
                           OOMPH_CURRENT_FUNCTION,
                           OOMPH_EXCEPTION_LOCATION);
      }
 
     // check the matrix has been built
     if (!matrix_pt[i]->built())
      {
       std::ostringstream error_message;
       error_message << "Matrix " << i << " has not been built.";
       throw OomphLibError(error_message.str(),
                           OOMPH_CURRENT_FUNCTION,
                           OOMPH_EXCEPTION_LOCATION);
 
      }
 #endif
 
     // check that all the matrices have the same communicator
     // and store a copy of the communicator
     if (i == 0)
      {
       Global_communicator_pt =
        new OomphCommunicator
        (matrix_pt[i]->distribution_pt()->communicator_pt());
      }
 
 #ifdef PARANOID
     else
      {
       if (*Global_communicator_pt !=
           *matrix_pt[i]->distribution_pt()->communicator_pt())
        {
         std::ostringstream error_message;
         error_message << "All matrices must have the same communicator.";
         throw OomphLibError(error_message.str(),
                             OOMPH_CURRENT_FUNCTION,
                             OOMPH_EXCEPTION_LOCATION);
        }
      }
 
     // store a copy of the Distribution of each preconditioner for future
     // PARANOID checks
     Distribution_pt[i] = new LinearAlgebraDistribution
      (matrix_pt[i]->distribution_pt());
 #endif
    }
 
   // number of processors
   unsigned nproc = Global_communicator_pt->nproc();
 
   // next compute the distribution of the preconditioner over the processors
   // such that each preconditioner has an (as to near to) equal number of
   // processors
   First_proc_for_prec.resize(Nprec);
   Nproc_for_prec.resize(Nprec);
 
   // compute first row
   for (unsigned p=0;p<Nprec;p++)
    {
     First_proc_for_prec[p] = unsigned(double(p*nproc)/
                                       double(Nprec));
    }
 
   // compute local nrow
   for (unsigned p=0; p<Nprec-1; p++)
    {
     Nproc_for_prec[p] = First_proc_for_prec[p+1] - First_proc_for_prec[p];
    }
   Nproc_for_prec[Nprec-1] = nproc -  First_proc_for_prec[Nprec-1];
 
  #ifdef PARANOID
   // paranoid check that every preconditioner has more than one processor
   for (unsigned p=0;p<Nprec;p++)
    {
     if (Nproc_for_prec[p] == 0)
      {
       std::ostringstream error_message;
       error_message << "We only have " << nproc << " processor[s]!\n"
                     << "This is not enough to perform the " << Nprec 
                     << " block solves in parallel! Sorry! \n"
                     << "Please run this with more processors or disable the\n"
                     << "request for two-level paralellism.\n"; 
       throw OomphLibError(error_message.str(),
                           OOMPH_CURRENT_FUNCTION,
                           OOMPH_EXCEPTION_LOCATION);
      }
    }
 #endif
 
   // compute the color of this processor
   Color = 0;
   unsigned my_rank = Global_communicator_pt->my_rank();
   while (!(First_proc_for_prec[Color] <= my_rank &&
            First_proc_for_prec[Color] + Nproc_for_prec[Color] > my_rank))
    {
     Color++;
    }
 
   // create the local preconditioner
   Local_communicator_pt = Global_communicator_pt->split(Color,my_rank);
 
   // pointer for the local matrix on this processor
   CRDoubleMatrix* local_matrix_pt = 0;
 
   // resize storage for details of the data to be sent and received
   First_row_for_proc.resize(Nprec);
   Nrow_local_for_proc.resize(Nprec);
   First_row_from_proc.resize(Nprec);
   Nrow_local_from_proc.resize(Nprec);
 
   // Vector of MPI_Requests - used for distributed matrices
   Vector<MPI_Request> req;
 
   // Counter for the number of requests used
   unsigned c = 0;
 
   // storage for the target distribution
   Vector< Vector<unsigned> > target_first_row(Nprec);
   Vector< Vector<unsigned> > target_nrow_local(Nprec);
 
   // create storage for the nnz to be sent and received for each
   // preconditioner
   Vector< Vector<unsigned> > nnz_send(Nprec);
   Vector< Vector<unsigned> > nnz_recv(Nprec);
 
 
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
 
 
 
   // METHOD 0
   if (Method == 0)
    {
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       if (matrix_pt[i]->distributed())
        {
 
         // first compute the distribution of this preconditioner on its subset
         // of processors
 
         // number of rows for this preconditioner
         unsigned nrow = matrix_pt[i]->nrow();
 
         // setup First_row_for_local_prec and Nrow_local_for_local_prec
         target_first_row[i].resize(nproc);
         target_nrow_local[i].resize(nproc);
         unsigned nproc_local = Nproc_for_prec[i];
         for (unsigned p = 0; p < nproc_local; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_first_row[i][pp] =  unsigned(double(p*nrow)/
                                               double(nproc_local));
          }
         for (unsigned p = 0; p < nproc_local-1; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_nrow_local[i][pp] = target_first_row[i][pp+1]
            - target_first_row[i][pp];
          }
         unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
         target_nrow_local[i][last_local_proc] = nrow -
          target_first_row[i][last_local_proc];
 
         // get the details of the current distribution
         Vector<unsigned> current_first_row(nproc);
         Vector<unsigned> current_nrow_local(nproc);
         for (unsigned p = 0; p < nproc; p++)
          {
           current_first_row[p] = matrix_pt[i]->first_row(p);
           current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
          }
 
         // resize storage for details of the data to be sent and received
         First_row_for_proc[i].resize(nproc,0);
         Nrow_local_for_proc[i].resize(nproc,0);
         First_row_from_proc[i].resize(nproc,0);
         Nrow_local_from_proc[i].resize(nproc,0);
 
         // for every processor compute first_row and nrow_local that will
         // will sent and received by this processor
         for (unsigned p = 0; p < nproc; p++)
          {
           // start with data to be sent
           if ((target_first_row[i][p] < (current_first_row[my_rank] +
                                          current_nrow_local[my_rank])) &&
               (current_first_row[my_rank] < (target_first_row[i][p] +
                                              target_nrow_local[i][p])))
            {
             First_row_for_proc[i][p] =
              std::max(current_first_row[my_rank],
                       target_first_row[i][p]);
             Nrow_local_for_proc[i][p] =
              std::min((current_first_row[my_rank] +
                        current_nrow_local[my_rank]),
                       (target_first_row[i][p] +
                        target_nrow_local[i][p])) - First_row_for_proc[i][p];
            }
 
           // and data to be received
           if ((target_first_row[i][my_rank] < (current_first_row[p] +
                                                current_nrow_local[p]))
               && (current_first_row[p] < (target_first_row[i][my_rank] +
                                           target_nrow_local[i][my_rank])))
            {
             First_row_from_proc[i][p] =
              std::max(current_first_row[p],
                       target_first_row[i][my_rank]);
             Nrow_local_from_proc[i][p] =
              std::min((current_first_row[p] +
                        current_nrow_local[p]),
                       (target_first_row[i][my_rank] +
                        target_nrow_local[i][my_rank]))-
              First_row_from_proc[i][p];
            }
          }
 
         // resize nnz_send
         nnz_send[i].resize(nproc);
 
         // compute the number of nnzs to be sent
         // and the number of send and receive requests to be made (nreq)
         for (unsigned p = 0; p < nproc; p++)
          {
           if (Nrow_local_for_proc[i][p] != 0)
            {
             int* row_start = matrix_pt[i]->row_start();
             unsigned k = First_row_for_proc[i][p]-current_first_row[my_rank];
             nnz_send[i][p] = row_start[k + Nrow_local_for_proc[i][p]] -
              row_start[k];
            }
          }
 
         // send nnz to be sent to each processor
         for (unsigned p = 0; p < nproc; p++)
          {
 
           // dont mpi send to send
           if (p != my_rank)
            {
 
             // non block send
             if (Nrow_local_for_proc[i][p] != 0)
              {
 
               // send to other processors
               int tag = this->compute_tag(nproc,my_rank,p,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Isend(&nnz_send[i][p],1,MPI_UNSIGNED,p,tag,
                         Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
            }
          }
 
         // resize nnz_recv
         nnz_recv[i].resize(nproc);
 
         // receive nnz from other processors
         for (unsigned pp = 0; pp < nproc; pp++)
          {
 
           // next processor to receive from
           unsigned p = (nproc + my_rank - pp)%nproc;
 
           // dont mpi send to send
           if (p != my_rank)
            {
 
             // blocking recv
             if (Nrow_local_from_proc[i][p] != 0)
              {
               int tag = this->compute_tag(nproc,p,my_rank,0);
               MPI_Status stat;
               unsigned nnz_temp;
               MPI_Recv(&nnz_temp,1,MPI_UNSIGNED,p,tag,
                        Global_communicator_pt->mpi_comm(),&stat);
               nnz_recv[i][p] = nnz_temp;
              }
            }
 
           // receive from self
           else
            {
             nnz_recv[i][p] = nnz_send[i][p];
            }
          }
 
         // get pointers to the underlying data in the current matrix
         double* values_send = matrix_pt[i]->value();
         int* row_start_send = matrix_pt[i]->row_start();
         int* column_index_send = matrix_pt[i]->column_index();
 
         // send and receive the contents of the vector
         for (unsigned p = 0; p < nproc; p++)
          {
 
           // use mpi methods to send to and receive from all but my rank
           if (p != my_rank)
            {
 
             // send
             if (nnz_send[i][p] != 0)
              {
 
               // compute the offset for row_start
               int offset_n =
                First_row_for_proc[i][p]-current_first_row[my_rank];
 
               // compute the offset for the values and column_index
               int offset_nnz = row_start_send[offset_n];
 
               // values
               int tag = this->compute_tag(nproc,my_rank,p,1);
               MPI_Request tr1;
               req.push_back(tr1);
               MPI_Isend(values_send + offset_nnz,int(nnz_send[i][p]),
                         MPI_DOUBLE,
                         p,tag,Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
 
               // column_index
               tag = this->compute_tag(nproc,my_rank,p,2);
               MPI_Request tr2;
               req.push_back(tr2);
               MPI_Isend(column_index_send + offset_nnz,int(nnz_send[i][p]),
                         MPI_INT,p,tag,Global_communicator_pt->mpi_comm(),
                         &req[c]);
               c++;
 
               // row_start
               tag = this->compute_tag(nproc,my_rank,p,3);
               MPI_Request tr3;
               req.push_back(tr3);
               MPI_Isend(row_start_send + offset_n,
                         int(Nrow_local_for_proc[i][p]),MPI_INT,p,tag,
                         Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
            }
          }
        }
      }
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       if (!matrix_pt[i]->distributed())
        {
         oomph_info << "matrix not distributed" << std::endl;
         // if this matrix is to be preconditioned my this processor
         if (i == Color)
          {
 
           // create the local distribution for this matrix
           LinearAlgebraDistribution* temp_dist_pt =
            new LinearAlgebraDistribution(Local_communicator_pt,
                                          matrix_pt[i]->nrow(),
                                          false);
 
           // create the corresponding matrix
           local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
           delete temp_dist_pt; // (dist has now been copied)
 
           // get pointers to the underlying data
           double* values_pt = matrix_pt[i]->value();
           int* column_index_pt = matrix_pt[i]->column_index();
           int* row_start_pt = matrix_pt[i]->row_start();
 
           // build the matrix without a copy of the data
           local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
                                                      matrix_pt[i]->nnz(),
                                                      values_pt,
                                                      column_index_pt,
                                                      row_start_pt);
          }
        }
 
       // else we assemble a copy of the matrix distributed over a subset of
       // processors
       else
        {
 
         // number of rows for this preconditioner
 
         // if we are assembling the matrix on this processor
         if (i == Color)
          {
 
 
           // create the local distribution for this matrix
           LinearAlgebraDistribution* temp_dist_pt =
            new LinearAlgebraDistribution
            (Local_communicator_pt,target_first_row[i][my_rank],
             target_nrow_local[i][my_rank]);
 
           // create the corresponding matrix
           local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
           delete temp_dist_pt; // (dist has now been copied)
 
           // get the number of nnzs to be received from each processor
 
           // total number of nnz to be reveived
           unsigned nnz_total = 0;
           for (unsigned p = 0; p < nproc; p++)
            {
             nnz_total += nnz_recv[i][p];
            }
 
           // compute nnz block start
           Vector<unsigned> nnz_start_proc;
           Vector<unsigned> nnz_start_index;
           unsigned row_ptr = target_first_row[i][my_rank];
           int p = 0;
           unsigned nnz_ptr = 0;
           for (p = 0; p < int(nproc); p++)
            {
 
             if (First_row_from_proc[i][p] == row_ptr &&
                 Nrow_local_from_proc[i][p] != 0 &&
                 nnz_ptr != nnz_total)
              {
               nnz_start_proc.push_back(p);
               nnz_start_index.push_back(nnz_ptr);
               nnz_ptr += nnz_recv[i][p];
               row_ptr += Nrow_local_from_proc[i][p];
               p = -1;
              }
            }
 
           // storage for received data
           double* values_recv = new double[nnz_total];
           int* column_index_recv = new int[nnz_total];
           int* row_start_recv = new int[target_nrow_local[i][my_rank]+1];
 
           // send and receive the contents of the vector
           for (unsigned pp = 0; pp < nproc; pp++)
            {
 
             // next processor to receive from
             unsigned p = (nproc + my_rank - pp)%nproc;
 
             // use mpi methods to send to and receive from all but my rank
             if (p != my_rank)
              {
 
               // just receive
               if (nnz_recv[i][p] != 0)
                {
 
                 // compute the offset for row_start
                 int offset_n =
                  First_row_from_proc[i][p]-target_first_row[i][my_rank];
 
                 // compute the offset for the values and column_index
                 unsigned k = 0;
                 while (nnz_start_proc[k] != p)
                  {
                   k++;
                  }
                 int offset_nnz = nnz_start_index[k];
 
                 // values
                 int tag = this->compute_tag(nproc,p,my_rank,1);
                 MPI_Status stat1;
                 MPI_Recv(values_recv + offset_nnz,int(nnz_recv[i][p]),
                          MPI_DOUBLE,p,tag,Global_communicator_pt->mpi_comm(),
                          &stat1);
 
                 // column_index
                 tag = this->compute_tag(nproc,p,my_rank,2);
                 MPI_Status stat2;
                 MPI_Recv(column_index_recv + offset_nnz,int(nnz_recv[i][p]),
                          MPI_INT,p,tag,Global_communicator_pt->mpi_comm(),
                          &stat2);
 
                 // row_start
                 tag = this->compute_tag(nproc,p,my_rank,3);
                 MPI_Status stat3;
                 MPI_Recv(row_start_recv + offset_n,
                          int(Nrow_local_from_proc[i][p]),MPI_INT,p,tag,
                          Global_communicator_pt->mpi_comm(),&stat3);
                }
              }
             // otehrwise just send to self (or copy)
             else
              {
               if (nnz_recv[i][p] != 0)
                {
 
                 // get pointers to the underlying data in the current matrix
                 double* values_send = matrix_pt[i]->value();
                 int* row_start_send = matrix_pt[i]->row_start();
                 int* column_index_send = matrix_pt[i]->column_index();
 
                 // offset for row_start send to self
                 unsigned offset_n_send =
                  First_row_for_proc[i][my_rank]-matrix_pt[i]->first_row(p);
                 // offset for values and column+_index send to self
                 unsigned offset_nnz_send = row_start_send[offset_n_send];
 
                 // offset for row_start receive from self
                 unsigned offset_n_recv =
                  First_row_from_proc[i][my_rank]-target_first_row[i][my_rank];
 
                 // offset for values and columm_index receive from self
                 unsigned k = 0;
                 while (nnz_start_proc[k] != p)
                  {
                   k++;
                  }
                 unsigned offset_nnz_recv = nnz_start_index[k];
 
                 // and send
 
                 // values and column_index
                 unsigned n_nnz = nnz_send[i][my_rank];
                 for (unsigned j = 0; j < n_nnz; j++)
                  {
                   values_recv[offset_nnz_recv + j] =
                    values_send[offset_nnz_send + j];
                   column_index_recv[offset_nnz_recv + j] =
                    column_index_send[offset_nnz_send + j];
                  }
 
                 // row start
                 unsigned n_n = Nrow_local_from_proc[i][my_rank];
                 for (unsigned j = 0; j < n_n; j++)
                  {
                   row_start_recv[offset_n_recv + j]  =
                    row_start_send[offset_n_send + j];
                  }
                }
              }
            }
 
 
           // number of processors contributing to the local vector on this
           // processor
 
           // update the row start
           unsigned nproc_contrib = nnz_start_index.size();
           for (unsigned j = 0; j < nproc_contrib; j++)
            {
             unsigned first = First_row_from_proc[i][nnz_start_proc[j]] -
              target_first_row[i][my_rank];
             unsigned last = first + Nrow_local_from_proc[i][nnz_start_proc[j]];
             unsigned nnz_inc = nnz_start_index[j]-row_start_recv[first];
             for (unsigned k = first; k < last; k++)
              {
               row_start_recv[k]+=nnz_inc;
              }
            }
           row_start_recv[target_nrow_local[i][my_rank]] = int(nnz_total);
 
           // build the matrix without a copy of the data
           local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
                                                        nnz_total,
                                                        values_recv,
                                                        column_index_recv,
                                                        row_start_recv);
          }
        }
      }
 
     // wait for all sends to complete
     if (c!=0)
      {
       Vector<MPI_Status> stat(c);
       MPI_Waitall(c,&req[0],&stat[0]);
      }
    }
 
 
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
 
 
   // METHOD 1
   else if (Method == 1)
    {
 
     // temporary storgage for nnz recv
     unsigned* nnz_recv_temp = new unsigned[nproc*Nprec];
     for (unsigned j = 0; j < nproc*Nprec; j++)
      {
       nnz_recv_temp[j] = 0;
      }
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       if (!matrix_pt[i]->distributed())
        {
 
         // if this matrix is to be preconditioned my this processor
         if (i == Color)
          {
 
           // create the local distribution for this matrix
           LinearAlgebraDistribution* temp_dist_pt =
            new LinearAlgebraDistribution(Local_communicator_pt,
                                          matrix_pt[i]->nrow(),
                                          false);
 
           // create the corresponding matrix
           local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
           delete temp_dist_pt; // (dist has now been copied)
 
           // get pointers to the underlying data
           double* values_pt = matrix_pt[i]->value();
           int* column_index_pt = matrix_pt[i]->column_index();
           int* row_start_pt = matrix_pt[i]->row_start();
 
           // build the matrix without a copy of the data
           local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
                                                        matrix_pt[i]->nnz(),
                                                        values_pt,
                                                        column_index_pt,
                                                        row_start_pt);
          }
        }
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       else
        {
 
         // first compute the distribution of this preconditioner on its subset
         // of processors
 
         // number of rows for this preconditioner
         unsigned nrow = matrix_pt[i]->nrow();
 
         // setup First_row_for_local_prec and Nrow_local_for_local_prec
         target_first_row[i].resize(nproc);
         target_nrow_local[i].resize(nproc);
         unsigned nproc_local = Nproc_for_prec[i];
         for (unsigned p = 0; p < nproc_local; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_first_row[i][pp] =  unsigned(double(p*nrow)/
                                               double(nproc_local));
          }
         for (unsigned p = 0; p < nproc_local-1; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_nrow_local[i][pp] = target_first_row[i][pp+1]
            - target_first_row[i][pp];
          }
         unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
         target_nrow_local[i][last_local_proc] = nrow -
          target_first_row[i][last_local_proc];
 
         // get the details of the current distribution
         Vector<unsigned> current_first_row(nproc);
         Vector<unsigned> current_nrow_local(nproc);
         for (unsigned p = 0; p < nproc; p++)
          {
           current_first_row[p] = matrix_pt[i]->first_row(p);
           current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
          }
 
         // resize storage for details of the data to be sent and received
         First_row_for_proc[i].resize(nproc,0);
         Nrow_local_for_proc[i].resize(nproc,0);
         First_row_from_proc[i].resize(nproc,0);
         Nrow_local_from_proc[i].resize(nproc,0);
 
         // for every processor compute first_row and nrow_local that will
         // will sent and received by this processor
         for (unsigned p = 0; p < nproc; p++)
          {
           // start with data to be sent
           if ((target_first_row[i][p] < (current_first_row[my_rank] +
                                          current_nrow_local[my_rank])) &&
               (current_first_row[my_rank] < (target_first_row[i][p] +
                                              target_nrow_local[i][p])))
            {
             First_row_for_proc[i][p] =
              std::max(current_first_row[my_rank],
                       target_first_row[i][p]);
             Nrow_local_for_proc[i][p] =
              std::min((current_first_row[my_rank] +
                        current_nrow_local[my_rank]),
                       (target_first_row[i][p] +
                        target_nrow_local[i][p])) - First_row_for_proc[i][p];
            }
 
           // and data to be received
           if ((target_first_row[i][my_rank] < (current_first_row[p] +
                                                current_nrow_local[p]))
               && (current_first_row[p] < (target_first_row[i][my_rank] +
                                           target_nrow_local[i][my_rank])))
            {
             First_row_from_proc[i][p] =
              std::max(current_first_row[p],
                       target_first_row[i][my_rank]);
             Nrow_local_from_proc[i][p] =
              std::min((current_first_row[p] +
                        current_nrow_local[p]),
                       (target_first_row[i][my_rank] +
                        target_nrow_local[i][my_rank]))-
              First_row_from_proc[i][p];
            }
          }
 
         // resize nnz_send
         nnz_send[i].resize(nproc);
 
         // compute the number of nnzs to be sent
         // and the number of send and receive requests to be made (nreq)
         for (unsigned p = 0; p < nproc; p++)
          {
           if (Nrow_local_for_proc[i][p] != 0)
            {
             int* row_start = matrix_pt[i]->row_start();
             unsigned k = First_row_for_proc[i][p]-current_first_row[my_rank];
             nnz_send[i][p] = row_start[k + Nrow_local_for_proc[i][p]] -
              row_start[k];
            }
          }
 
         // resize nnz_recv
         nnz_recv[i].resize(nproc);
 
         // send nnz to be sent to each processor
         for (unsigned p = 0; p < nproc; p++)
          {
 
           // send and recv
 
           // dont mpi send to self
           if (p != my_rank)
            {
 
             // non block send
             if (Nrow_local_for_proc[i][p] != 0)
              {
 
               // send to other processors
               int tag = this->compute_tag(nproc,my_rank,p,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Isend(&nnz_send[i][p],1,MPI_UNSIGNED,p,tag,
                         Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
 
             // non blocking recv
             if (Nrow_local_from_proc[i][p] != 0)
              {
               int tag = this->compute_tag(nproc,p,my_rank,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Irecv(nnz_recv_temp + (i*nproc) + p,1,MPI_UNSIGNED,p,tag,
                        Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
            }
           // receive from self
           else
            {
             if (Nrow_local_for_proc[i][p] != 0)
              {
               nnz_recv_temp[(i*nproc)+p] = nnz_send[i][p];
              }
            }
          }
        }
      }
     if (c!=0)
      {
       Vector<MPI_Status> stat(c);
       MPI_Waitall(c,&req[0],&stat[0]);
       req.clear();
       stat.clear();
      }
     c=0;
     for (unsigned i = 0; i < Nprec; i++)
      {
       for (unsigned p = 0; p < nproc; p++)
        {
         nnz_recv[i][p] = nnz_recv_temp[(i*nproc)+p];
        }
      }
     delete nnz_recv_temp;
 
     // get the number of nnzs to be received from each processor
 
     // total number of nnz to be reveived
     unsigned nnz_total = 0;
     for (unsigned p = 0; p < nproc; p++)
      {
       nnz_total += nnz_recv[Color][p];
      }
 
     // compute nnz block start
     Vector<unsigned> nnz_start_proc;
     Vector<unsigned> nnz_start_index;
     unsigned row_ptr = target_first_row[Color][my_rank];
     int p = 0;
     unsigned nnz_ptr = 0;
     for (p = 0; p < int(nproc); p++)
      {
       if (First_row_from_proc[Color][p] == row_ptr &&
           Nrow_local_from_proc[Color][p] != 0 &&
           nnz_ptr != nnz_total)
        {
         nnz_start_proc.push_back(p);
         nnz_start_index.push_back(nnz_ptr);
         nnz_ptr += nnz_recv[Color][p];
         row_ptr += Nrow_local_from_proc[Color][p];
         p = -1;
        }
      }
 
     // storage for derived datatypes
     Vector<MPI_Datatype> datatypes;
 
     // storage for received data
     double* values_recv = new double[nnz_total];
     int* column_index_recv = new int[nnz_total];
     int* row_start_recv = new int[target_nrow_local[Color][my_rank]+1];
 
     ///////////////////////////////////////////////////////////////////////////
     // SEND
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_send = 0;
     Vector<MPI_Request> send_req;
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[i]->value();
       int* row_start_send = matrix_pt[i]->row_start();
       int* column_index_send = matrix_pt[i]->column_index();
 
       // send and receive the contents of the vector
       for (unsigned p = 0; p < nproc; p++)
        {
 
         // use mpi methods to send to and receive from all but my rank
         if (p != my_rank)
          {
 
           // send
           if (nnz_send[i][p] != 0)
            {
 
              // create 3 MPI contiguous datatypes
             // + values
             // + column_index
             // + row_start
 
             // values
             MPI_Datatype datatype_values;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_DOUBLE,
                                 &datatype_values);
             MPI_Type_commit(&datatype_values);
             datatypes.push_back(datatype_values);
 
             // column index
             MPI_Datatype datatype_column_index;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_INT,
                                 &datatype_column_index);
             MPI_Type_commit(&datatype_column_index);
             datatypes.push_back(datatype_column_index);
 
             // row start
             MPI_Datatype datatype_row_start;
             MPI_Type_contiguous(int(Nrow_local_for_proc[i][p]),MPI_INT,
                                 &datatype_row_start);
             MPI_Type_commit(&datatype_row_start);
             datatypes.push_back(datatype_row_start);
 
             // assemble the typelist
             MPI_Datatype typelist[3];
             typelist[0] = datatype_values;
             typelist[1] = datatype_column_index;
             typelist[2] = datatype_row_start;
 
             // compute the offset for row_start
             int offset_n =
              First_row_for_proc[i][p]-matrix_pt[i]->first_row(my_rank);
 
             // compute the offset for the values and column_index
             int offset_nnz = row_start_send[offset_n];
 
             // next compute the displacements
             MPI_Aint displacements[3];
             MPI_Get_address(values_send + offset_nnz,&displacements[0]);
             MPI_Get_address(column_index_send + offset_nnz,&displacements[1]);
             MPI_Get_address(row_start_send + offset_n,&displacements[2]);
             for (int j = 2; j >= 0; j--)
              {
               displacements[j] -= displacements[0];
              }
 
             // set the block lengths
             int block_length[3];
             block_length[0] = block_length[1] = block_length[2] = 1;
 
             // now build the final datatype
             MPI_Datatype send_type;
             MPI_Type_create_struct(3,block_length,displacements,typelist,
                             &send_type);
             MPI_Type_commit(&send_type);
             datatypes.push_back(send_type);
 
             // send
             int tag = this->compute_tag(nproc,my_rank,p,1);
             MPI_Request tr1;
             send_req.push_back(tr1);
             MPI_Isend(values_send + offset_nnz,1,send_type,
                       p,tag,Global_communicator_pt->mpi_comm(),
                       &send_req[c_send]);
             c_send++;
            }
          }
        }
      }
 
     ///////////////////////////////////////////////////////////////////////////
     // RECV
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_recv = 0;
     Vector<MPI_Request> recv_req;
 
     // receive the contents of the vector
     for (unsigned p = 0; p < nproc; p++)
      {
 
       // use mpi methods to send to and receive from all but my rank
       if (p != my_rank)
        {
 
         // just receive
         if (nnz_recv[Color][p] != 0)
          {
 
           // create 3 MPI contiguous datatypes
           // + values
           // + column_index
           // + row_start
 
           // values
           MPI_Datatype datatype_values;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_DOUBLE,
                               &datatype_values);
           MPI_Type_commit(&datatype_values);
           datatypes.push_back(datatype_values);
 
           // column index
           MPI_Datatype datatype_column_index;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_INT,
                               &datatype_column_index);
           MPI_Type_commit(&datatype_column_index);
           datatypes.push_back(datatype_column_index);
 
           // row start
           MPI_Datatype datatype_row_start;
           MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),MPI_INT,
                               &datatype_row_start);
           MPI_Type_commit(&datatype_row_start);
           datatypes.push_back(datatype_row_start);
 
           // assemble the typelist
           MPI_Datatype typelist[3];
           typelist[0] = datatype_values;
           typelist[1] = datatype_column_index;
           typelist[2] = datatype_row_start;
 
           // compute the offset for row_start
           int offset_n =
            First_row_from_proc[Color][p]-target_first_row[Color][my_rank];
 
           // compute the offset for the values and column_index
           unsigned k = 0;
           while (nnz_start_proc[k] != p)
            {
             k++;
            }
           int offset_nnz = nnz_start_index[k];
 
           // next compute the displacements
           MPI_Aint displacements[3];
           MPI_Get_address(values_recv + offset_nnz,&displacements[0]);
           MPI_Get_address(column_index_recv + offset_nnz,&displacements[1]);
           MPI_Get_address(row_start_recv + offset_n,&displacements[2]);
           for (int j = 2; j >= 0; j--)
            {
             displacements[j] -= displacements[0];
            }
 
           // set the block lengths
           int block_length[3];
           block_length[0] = block_length[1] = block_length[2] = 1;
 
           // now build the final datatype
           MPI_Datatype recv_type;
           MPI_Type_create_struct(3,block_length,displacements,typelist,
                           &recv_type);
           MPI_Type_commit(&recv_type);
           datatypes.push_back(recv_type);
 
           // recv
           int tag = this->compute_tag(nproc,p,my_rank,1);
           MPI_Request tr1;
           recv_req.push_back(tr1);
           MPI_Irecv(values_recv + offset_nnz,1,recv_type,
                     p,tag,Global_communicator_pt->mpi_comm(),
                     &recv_req[c_recv]);
           c_recv++;
          }
        }
      }
 
     // otherwise send to self (copy)
     if (nnz_recv[Color][my_rank] != 0)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[Color]->value();
       int* row_start_send = matrix_pt[Color]->row_start();
       int* column_index_send = matrix_pt[Color]->column_index();
 
       // offset for row_start send to self
       unsigned offset_n_send =
        First_row_for_proc[Color][my_rank]-matrix_pt[Color]->first_row(my_rank);
 
       // offset for values and column+_index send to self
       unsigned offset_nnz_send = row_start_send[offset_n_send];
 
       // offset for row_start receive from self
       unsigned offset_n_recv =
        First_row_from_proc[Color][my_rank]-target_first_row[Color][my_rank];
 
       // offset for values and columm_index receive from self
       unsigned k = 0;
       while (nnz_start_proc[k] != my_rank)
        {
         k++;
        }
       unsigned offset_nnz_recv = nnz_start_index[k];
 
       // and send
 
       // values and column_index
       unsigned n_nnz = nnz_send[Color][my_rank];
       for (unsigned j = 0; j < n_nnz; j++)
        {
         values_recv[offset_nnz_recv + j] =
          values_send[offset_nnz_send + j];
         column_index_recv[offset_nnz_recv + j] =
          column_index_send[offset_nnz_send + j];
        }
 
       // row start
       unsigned n_n = Nrow_local_from_proc[Color][my_rank];
       for (unsigned j = 0; j < n_n; j++)
        {
         row_start_recv[offset_n_recv + j]  =
          row_start_send[offset_n_send + j];
        }
      }
 
     // create the local distribution for this matrix
     LinearAlgebraDistribution* temp_dist_pt =
      new LinearAlgebraDistribution
      (Local_communicator_pt,target_first_row[Color][my_rank],
       target_nrow_local[Color][my_rank]);
 
     // create the corresponding matrix
     local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
     delete temp_dist_pt; // (dist has now been copied)
 
     ///////////////////////////////////////////////////////////////////////////
     // and WAIT...
     ///////////////////////////////////////////////////////////////////////////
     if (c_recv!=0)
      {
       Vector<MPI_Status> recv_stat(c_recv);
       MPI_Waitall(c_recv,&recv_req[0],&recv_stat[0]);
       recv_req.clear();
       recv_stat.clear();
      }
 
     // build the matrix
 
     // update the row start
     unsigned nproc_contrib = nnz_start_index.size();
     for (unsigned j = 0; j < nproc_contrib; j++)
      {
       unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
        target_first_row[Color][my_rank];
       unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
       unsigned nnz_inc = nnz_start_index[j]-row_start_recv[first];
       for (unsigned k = first; k < last; k++)
        {
         row_start_recv[k]+=nnz_inc;
        }
      }
     row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
 
     // build the matrix without a copy of the data
     local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
                                                  nnz_total,
                                                  values_recv,
                                                  column_index_recv,
                                                  row_start_recv);
 
     // and finally wait for the sends
     if (c_recv!=0)
      {
       Vector<MPI_Status> send_stat(c_recv);
       MPI_Waitall(c_send,&send_req[0],&send_stat[0]);
       send_req.clear();
       send_stat.clear();
      }
 
     // and clear the datatype
     unsigned ndatatypes = datatypes.size();
     for (unsigned i = 0; i < ndatatypes; i++)
      {
       MPI_Type_free(&datatypes[i]);
      }
    }
 
 
 
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
 
 
   // METHOD 2
   else if (Method == 2)
    {
 
     // temporary storgage for nnz recv
     unsigned* nnz_recv_temp = new unsigned[nproc*Nprec];
     for (unsigned j = 0; j < nproc*Nprec; j++)
      {
       nnz_recv_temp[j] = 0;
      }
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       if (!matrix_pt[i]->distributed())
        {
 
         // if this matrix is to be preconditioned my this processor
         if (i == Color)
          {
 
           // create the local distribution for this matrix
           LinearAlgebraDistribution* temp_dist_pt =
            new LinearAlgebraDistribution(Local_communicator_pt,
                                          matrix_pt[i]->nrow(),
                                          false);
 
           // create the corresponding matrix
           local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
           delete temp_dist_pt; // (dist has now been copied)
 
           // get pointers to the underlying data
           double* values_pt = matrix_pt[i]->value();
           int* column_index_pt = matrix_pt[i]->column_index();
           int* row_start_pt = matrix_pt[i]->row_start();
 
           // build the matrix without a copy of the data
           local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
                                                        matrix_pt[i]->nnz(),
                                                        values_pt,
                                                        column_index_pt,
                                                        row_start_pt);
          }
        }
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       else
        {
 
         // first compute the distribution of this preconditioner on its subset
         // of processors
 
         // number of rows for this preconditioner
         unsigned nrow = matrix_pt[i]->nrow();
 
         // setup First_row_for_local_prec and Nrow_local_for_local_prec
         target_first_row[i].resize(nproc);
         target_nrow_local[i].resize(nproc);
         unsigned nproc_local = Nproc_for_prec[i];
         for (unsigned p = 0; p < nproc_local; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_first_row[i][pp] =  unsigned(double(p*nrow)/
                                               double(nproc_local));
          }
         for (unsigned p = 0; p < nproc_local-1; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_nrow_local[i][pp] = target_first_row[i][pp+1]
            - target_first_row[i][pp];
          }
         unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
         target_nrow_local[i][last_local_proc] = nrow -
          target_first_row[i][last_local_proc];
 
         // get the details of the current distribution
         Vector<unsigned> current_first_row(nproc);
         Vector<unsigned> current_nrow_local(nproc);
         for (unsigned p = 0; p < nproc; p++)
          {
           current_first_row[p] = matrix_pt[i]->first_row(p);
           current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
          }
 
         // resize storage for details of the data to be sent and received
         First_row_for_proc[i].resize(nproc,0);
         Nrow_local_for_proc[i].resize(nproc,0);
         First_row_from_proc[i].resize(nproc,0);
         Nrow_local_from_proc[i].resize(nproc,0);
 
         // for every processor compute first_row and nrow_local that will
         // will sent and received by this processor
         for (unsigned p = 0; p < nproc; p++)
          {
           // start with data to be sent
           if ((target_first_row[i][p] < (current_first_row[my_rank] +
                                          current_nrow_local[my_rank])) &&
               (current_first_row[my_rank] < (target_first_row[i][p] +
                                              target_nrow_local[i][p])))
            {
             First_row_for_proc[i][p] =
              std::max(current_first_row[my_rank],
                       target_first_row[i][p]);
             Nrow_local_for_proc[i][p] =
              std::min((current_first_row[my_rank] +
                        current_nrow_local[my_rank]),
                       (target_first_row[i][p] +
                        target_nrow_local[i][p])) - First_row_for_proc[i][p];
            }
 
           // and data to be received
           if ((target_first_row[i][my_rank] < (current_first_row[p] +
                                                current_nrow_local[p]))
               && (current_first_row[p] < (target_first_row[i][my_rank] +
                                           target_nrow_local[i][my_rank])))
            {
             First_row_from_proc[i][p] =
              std::max(current_first_row[p],
                       target_first_row[i][my_rank]);
             Nrow_local_from_proc[i][p] =
              std::min((current_first_row[p] +
                        current_nrow_local[p]),
                       (target_first_row[i][my_rank] +
                        target_nrow_local[i][my_rank]))-
              First_row_from_proc[i][p];
            }
          }
 
         // resize nnz_send
         nnz_send[i].resize(nproc);
 
         // compute the number of nnzs to be sent
         // and the number of send and receive requests to be made (nreq)
         for (unsigned p = 0; p < nproc; p++)
          {
           if (Nrow_local_for_proc[i][p] != 0)
            {
             int* row_start = matrix_pt[i]->row_start();
             unsigned k = First_row_for_proc[i][p]-current_first_row[my_rank];
             nnz_send[i][p] = row_start[k + Nrow_local_for_proc[i][p]] -
              row_start[k];
            }
          }
 
         // resize nnz_recv
         nnz_recv[i].resize(nproc);
 
         // send nnz to be sent to each processor
         for (unsigned p = 0; p < nproc; p++)
          {
 
           // send and recv
 
           // dont mpi send to self
           if (p != my_rank)
            {
 
             // non block send
             if (Nrow_local_for_proc[i][p] != 0)
              {
 
               // send to other processors
               int tag = this->compute_tag(nproc,my_rank,p,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Isend(&nnz_send[i][p],1,MPI_UNSIGNED,p,tag,
                         Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
 
             // non blocking recv
             if (Nrow_local_from_proc[i][p] != 0)
              {
               int tag = this->compute_tag(nproc,p,my_rank,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Irecv(nnz_recv_temp + (i*nproc) + p,1,MPI_UNSIGNED,p,tag,
                        Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
            }
           // receive from self
           else
            {
             if (Nrow_local_for_proc[i][p] != 0)
              {
               nnz_recv_temp[(i*nproc)+p] = nnz_send[i][p];
              }
            }
          }
        }
      }
     if (c!=0)
      {
       Vector<MPI_Status> stat(c);
       MPI_Waitall(c,&req[0],&stat[0]);
       req.clear();
       stat.clear();
       c=0;
      }
     for (unsigned i = 0; i < Nprec; i++)
      {
       for (unsigned p = 0; p < nproc; p++)
        {
         nnz_recv[i][p] = nnz_recv_temp[(i*nproc)+p];
        }
      }
     delete nnz_recv_temp;
 
     // get the number of nnzs to be received from each processor
 
     // total number of nnz to be reveived
     unsigned nnz_total = 0;
     for (unsigned p = 0; p < nproc; p++)
      {
       nnz_total += nnz_recv[Color][p];
      }
 
     // compute nnz block start
     Vector<unsigned> nnz_start_proc;
     Vector<unsigned> nnz_start_index;
     unsigned row_ptr = target_first_row[Color][my_rank];
     int p = 0;
     unsigned nnz_ptr = 0;
     for (p = 0; p < int(nproc); p++)
      {
       if (First_row_from_proc[Color][p] == row_ptr &&
           Nrow_local_from_proc[Color][p] != 0 &&
           nnz_ptr != nnz_total)
        {
         nnz_start_proc.push_back(p);
         nnz_start_index.push_back(nnz_ptr);
         nnz_ptr += nnz_recv[Color][p];
         row_ptr += Nrow_local_from_proc[Color][p];
         p = -1;
        }
      }
 
     // storage for derived datatypes
     Vector<MPI_Datatype> datatypes;
 
     // storage for received data
     double* values_recv = new double[nnz_total];
     int* column_index_recv = new int[nnz_total];
     int* row_start_recv = new int[target_nrow_local[Color][my_rank]+1];
 
     ///////////////////////////////////////////////////////////////////////////
     // RECV
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_recv = 0;
     Vector<MPI_Request> recv_req;
 
     // receive the contents of the vector
     for (unsigned p = 0; p < nproc; p++)
      {
 
       // use mpi methods to send to and receive from all but my rank
       if (p != my_rank)
        {
 
         // just receive
         if (nnz_recv[Color][p] != 0)
          {
 
           // create 3 MPI contiguous datatypes
           // + values
           // + column_index
           // + row_start
 
           // values
           MPI_Datatype datatype_values;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_DOUBLE,
                               &datatype_values);
           MPI_Type_commit(&datatype_values);
           datatypes.push_back(datatype_values);
 
           // column index
           MPI_Datatype datatype_column_index;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_INT,
                               &datatype_column_index);
           MPI_Type_commit(&datatype_column_index);
           datatypes.push_back(datatype_column_index);
 
           // row start
           MPI_Datatype datatype_row_start;
           MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),MPI_INT,
                               &datatype_row_start);
           MPI_Type_commit(&datatype_row_start);
           datatypes.push_back(datatype_row_start);
 
           // assemble the typelist
           MPI_Datatype typelist[3];
           typelist[0] = datatype_values;
           typelist[1] = datatype_column_index;
           typelist[2] = datatype_row_start;
 
           // compute the offset for row_start
           int offset_n =
            First_row_from_proc[Color][p]-target_first_row[Color][my_rank];
 
           // compute the offset for the values and column_index
           unsigned k = 0;
           while (nnz_start_proc[k] != p)
            {
             k++;
            }
           int offset_nnz = nnz_start_index[k];
 
           // next compute the displacements
           MPI_Aint displacements[3];
           MPI_Get_address(values_recv + offset_nnz,&displacements[0]);
           MPI_Get_address(column_index_recv + offset_nnz,&displacements[1]);
           MPI_Get_address(row_start_recv + offset_n,&displacements[2]);
           for (int j = 2; j >= 0; j--)
            {
             displacements[j] -= displacements[0];
            }
 
           // set the block lengths
           int block_length[3];
           block_length[0] = block_length[1] = block_length[2] = 1;
 
           // now build the final datatype
           MPI_Datatype recv_type;
           MPI_Type_create_struct(3,block_length,displacements,typelist,
                           &recv_type);
           MPI_Type_commit(&recv_type);
           datatypes.push_back(recv_type);
 
           // recv
           int tag = this->compute_tag(nproc,p,my_rank,1);
           MPI_Request tr1;
           recv_req.push_back(tr1);
           MPI_Irecv(values_recv + offset_nnz,1,recv_type,
                     p,tag,Global_communicator_pt->mpi_comm(),
                     &recv_req[c_recv]);
           c_recv++;
          }
        }
      }
 
     ///////////////////////////////////////////////////////////////////////////
     // SEND
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_send = 0;
     Vector<MPI_Request> send_req;
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[i]->value();
       int* row_start_send = matrix_pt[i]->row_start();
       int* column_index_send = matrix_pt[i]->column_index();
 
       // send and receive the contents of the vector
       for (unsigned p = 0; p < nproc; p++)
        {
 
         // use mpi methods to send to and receive from all but my rank
         if (p != my_rank)
          {
 
           // send
           if (nnz_send[i][p] != 0)
            {
 
              // create 3 MPI contiguous datatypes
             // + values
             // + column_index
             // + row_start
 
             // values
             MPI_Datatype datatype_values;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_DOUBLE,
                                 &datatype_values);
             MPI_Type_commit(&datatype_values);
             datatypes.push_back(datatype_values);
 
             // column index
             MPI_Datatype datatype_column_index;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_INT,
                                 &datatype_column_index);
             MPI_Type_commit(&datatype_column_index);
             datatypes.push_back(datatype_column_index);
 
             // row start
             MPI_Datatype datatype_row_start;
             MPI_Type_contiguous(int(Nrow_local_for_proc[i][p]),MPI_INT,
                                 &datatype_row_start);
             MPI_Type_commit(&datatype_row_start);
             datatypes.push_back(datatype_row_start);
 
             // assemble the typelist
             MPI_Datatype typelist[3];
             typelist[0] = datatype_values;
             typelist[1] = datatype_column_index;
             typelist[2] = datatype_row_start;
 
             // compute the offset for row_start
             int offset_n =
              First_row_for_proc[i][p]-matrix_pt[i]->first_row(my_rank);
 
             // compute the offset for the values and column_index
             int offset_nnz = row_start_send[offset_n];
 
             // next compute the displacements
             MPI_Aint displacements[3];
             MPI_Get_address(values_send + offset_nnz,&displacements[0]);
             MPI_Get_address(column_index_send + offset_nnz,&displacements[1]);
             MPI_Get_address(row_start_send + offset_n,&displacements[2]);
             for (int j = 2; j >= 0; j--)
              {
               displacements[j] -= displacements[0];
              }
 
             // set the block lengths
             int block_length[3];
             block_length[0] = block_length[1] = block_length[2] = 1;
 
             // now build the final datatype
             MPI_Datatype send_type;
             MPI_Type_create_struct(3,block_length,displacements,typelist,
                             &send_type);
             MPI_Type_commit(&send_type);
             datatypes.push_back(send_type);
 
             // send
             int tag = this->compute_tag(nproc,my_rank,p,1);
             MPI_Request tr1;
             send_req.push_back(tr1);
             MPI_Isend(values_send + offset_nnz,1,send_type,
                       p,tag,Global_communicator_pt->mpi_comm(),
                       &send_req[c_send]);
             c_send++;
            }
          }
        }
      }
 
     // otherwise send to self (copy)
     if (nnz_recv[Color][my_rank] != 0)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[Color]->value();
       int* row_start_send = matrix_pt[Color]->row_start();
       int* column_index_send = matrix_pt[Color]->column_index();
 
       // offset for row_start send to self
       unsigned offset_n_send =
        First_row_for_proc[Color][my_rank]-matrix_pt[Color]->first_row(my_rank);
 
       // offset for values and column+_index send to self
       unsigned offset_nnz_send = row_start_send[offset_n_send];
 
       // offset for row_start receive from self
       unsigned offset_n_recv =
        First_row_from_proc[Color][my_rank]-target_first_row[Color][my_rank];
 
       // offset for values and columm_index receive from self
       unsigned k = 0;
       while (nnz_start_proc[k] != my_rank)
        {
         k++;
        }
       unsigned offset_nnz_recv = nnz_start_index[k];
 
       // and send
 
       // values and column_index
       unsigned n_nnz = nnz_send[Color][my_rank];
       for (unsigned j = 0; j < n_nnz; j++)
        {
         values_recv[offset_nnz_recv + j] =
          values_send[offset_nnz_send + j];
         column_index_recv[offset_nnz_recv + j] =
          column_index_send[offset_nnz_send + j];
        }
 
       // row start
       unsigned n_n = Nrow_local_from_proc[Color][my_rank];
       for (unsigned j = 0; j < n_n; j++)
        {
         row_start_recv[offset_n_recv + j]  =
          row_start_send[offset_n_send + j];
        }
      }
 
     // create the local distribution for this matrix
     LinearAlgebraDistribution* temp_dist_pt =
      new LinearAlgebraDistribution
      (Local_communicator_pt,target_first_row[Color][my_rank],
       target_nrow_local[Color][my_rank]);
 
     // create the corresponding matrix
     local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
     delete temp_dist_pt; // (dist has now been copied)
 
     ///////////////////////////////////////////////////////////////////////////
     // and WAIT...
     ///////////////////////////////////////////////////////////////////////////
     if (c_recv!=0)
      {
       Vector<MPI_Status> recv_stat(c_recv);
       MPI_Waitall(c_recv,&recv_req[0],&recv_stat[0]);
       recv_req.clear();
       recv_stat.clear();
      }
 
     // build the matrix
 
     // update the row start
     unsigned nproc_contrib = nnz_start_index.size();
     for (unsigned j = 0; j < nproc_contrib; j++)
      {
       unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
        target_first_row[Color][my_rank];
       unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
       unsigned nnz_inc = nnz_start_index[j]-row_start_recv[first];
       for (unsigned k = first; k < last; k++)
        {
         row_start_recv[k]+=nnz_inc;
        }
      }
     row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
 
     // build the matrix without a copy of the data
     local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
                                                  nnz_total,
                                                  values_recv,
                                                  column_index_recv,
                                                  row_start_recv);
 
     // and finally wait for the sends
     if (c_send!=0)
      {
       Vector<MPI_Status> send_stat(c_send);
       MPI_Waitall(c_send,&send_req[0],&send_stat[0]);
       send_req.clear();
       send_stat.clear();
      }
 
     // and clear the datatype
     unsigned ndatatypes = datatypes.size();
     for (unsigned i = 0; i < ndatatypes; i++)
      {
       MPI_Type_free(&datatypes[i]);
      }
    }
 
 
 
 
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
   /////////////////////////////////////////////////////////////////////////////
 
 
   // METHOD 3
   else if (Method == 3)
    {
 
     // temporary storgage for nnz recv
     unsigned* nnz_recv_temp = new unsigned[nproc*Nprec];
     for (unsigned j = 0; j < nproc*Nprec; j++)
      {
       nnz_recv_temp[j] = 0;
      }
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       if (!matrix_pt[i]->distributed())
        {
 
         // if this matrix is to be preconditioned my this processor
         if (i == Color)
          {
 
           // create the local distribution for this matrix
           LinearAlgebraDistribution* temp_dist_pt =
            new LinearAlgebraDistribution(Local_communicator_pt,
                                          matrix_pt[i]->nrow(),
                                          false);
 
           // create the corresponding matrix
           local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
           delete temp_dist_pt; // (dist has now been copied)
 
           // get pointers to the underlying data
           double* values_pt = matrix_pt[i]->value();
           int* column_index_pt = matrix_pt[i]->column_index();
           int* row_start_pt = matrix_pt[i]->row_start();
 
           // build the matrix without a copy of the data
           local_matrix_pt->build_without_copy(matrix_pt[i]->ncol(),
                                                        matrix_pt[i]->nnz(),
                                                        values_pt,
                                                        column_index_pt,
                                                        row_start_pt);
          }
        }
 
       // if the matrix is global (!distributed) then just construct a copy
       // on the subset of processors
       else
        {
 
         // first compute the distribution of this preconditioner on its subset
         // of processors
 
         // number of rows for this preconditioner
         unsigned nrow = matrix_pt[i]->nrow();
 
         // setup First_row_for_local_prec and Nrow_local_for_local_prec
         target_first_row[i].resize(nproc);
         target_nrow_local[i].resize(nproc);
         unsigned nproc_local = Nproc_for_prec[i];
         for (unsigned p = 0; p < nproc_local; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_first_row[i][pp] =  unsigned(double(p*nrow)/
                                               double(nproc_local));
          }
         for (unsigned p = 0; p < nproc_local-1; p++)
          {
           int pp = First_proc_for_prec[i] + p;
           target_nrow_local[i][pp] = target_first_row[i][pp+1]
            - target_first_row[i][pp];
          }
         unsigned last_local_proc = First_proc_for_prec[i] + nproc_local - 1;
         target_nrow_local[i][last_local_proc] = nrow -
          target_first_row[i][last_local_proc];
 
         // get the details of the current distribution
         Vector<unsigned> current_first_row(nproc);
         Vector<unsigned> current_nrow_local(nproc);
         for (unsigned p = 0; p < nproc; p++)
          {
           current_first_row[p] = matrix_pt[i]->first_row(p);
           current_nrow_local[p] = matrix_pt[i]->nrow_local(p);
          }
 
         // resize storage for details of the data to be sent and received
         First_row_for_proc[i].resize(nproc,0);
         Nrow_local_for_proc[i].resize(nproc,0);
         First_row_from_proc[i].resize(nproc,0);
         Nrow_local_from_proc[i].resize(nproc,0);
 
         // for every processor compute first_row and nrow_local that will
         // will sent and received by this processor
         for (unsigned p = 0; p < nproc; p++)
          {
           // start with data to be sent
           if ((target_first_row[i][p] < (current_first_row[my_rank] +
                                          current_nrow_local[my_rank])) &&
               (current_first_row[my_rank] < (target_first_row[i][p] +
                                              target_nrow_local[i][p])))
            {
             First_row_for_proc[i][p] =
              std::max(current_first_row[my_rank],
                       target_first_row[i][p]);
             Nrow_local_for_proc[i][p] =
              std::min((current_first_row[my_rank] +
                        current_nrow_local[my_rank]),
                       (target_first_row[i][p] +
                        target_nrow_local[i][p])) - First_row_for_proc[i][p];
            }
 
           // and data to be received
           if ((target_first_row[i][my_rank] < (current_first_row[p] +
                                                current_nrow_local[p]))
               && (current_first_row[p] < (target_first_row[i][my_rank] +
                                           target_nrow_local[i][my_rank])))
            {
             First_row_from_proc[i][p] =
              std::max(current_first_row[p],
                       target_first_row[i][my_rank]);
             Nrow_local_from_proc[i][p] =
              std::min((current_first_row[p] +
                        current_nrow_local[p]),
                       (target_first_row[i][my_rank] +
                        target_nrow_local[i][my_rank]))-
              First_row_from_proc[i][p];
            }
          }
 
         // resize nnz_send
         nnz_send[i].resize(nproc);
 
         // compute the number of nnzs to be sent
         // and the number of send and receive requests to be made (nreq)
         for (unsigned p = 0; p < nproc; p++)
          {
           if (Nrow_local_for_proc[i][p] != 0)
            {
             int* row_start = matrix_pt[i]->row_start();
             unsigned k = First_row_for_proc[i][p]-current_first_row[my_rank];
             nnz_send[i][p] = row_start[k + Nrow_local_for_proc[i][p]] -
              row_start[k];
            }
          }
 
         // resize nnz_recv
         nnz_recv[i].resize(nproc);
 
         // send nnz to be sent to each processor
         for (unsigned p = 0; p < nproc; p++)
          {
 
           // send and recv
 
           // dont mpi send to self
           if (p != my_rank)
            {
 
             // non block send
             if (Nrow_local_for_proc[i][p] != 0)
              {
 
               // send to other processors
               int tag = this->compute_tag(nproc,my_rank,p,0);
               MPI_Request tr;
               req.push_back(tr);
               MPI_Isend(&nnz_send[i][p],1,MPI_UNSIGNED,p,tag,
                         Global_communicator_pt->mpi_comm(),&req[c]);
               c++;
              }
            }
           // receive from self
           else
            {
             if (Nrow_local_for_proc[i][p] != 0)
              {
               nnz_recv_temp[(i*nproc)+p] = nnz_send[i][p];
              }
            }
          }
        }
      }
 
     for (unsigned i = 0; i < Nprec; i++)
      {
         // resize nnz_recv
         nnz_recv[i].resize(nproc);
 
         // receive nnz from other processors
         for (unsigned pp = 0; pp < nproc; pp++)
          {
 
           // next processor to receive from
           unsigned p = (nproc + my_rank - pp)%nproc;
 
           // dont mpi send to send
           if (p != my_rank)
            {
 
             // blocking recv
             if (Nrow_local_from_proc[i][p] != 0)
              {
               int tag = this->compute_tag(nproc,p,my_rank,0);
               MPI_Status stat;
               unsigned nnz_temp;
               MPI_Recv(&nnz_temp,1,MPI_UNSIGNED,p,tag,
                        Global_communicator_pt->mpi_comm(),&stat);
               nnz_recv[i][p] = nnz_temp;
              }
            }
 
           // receive from self
           else
            {
             nnz_recv[i][p] = nnz_send[i][p];
            }
          }
      }
 
     // get the number of nnzs to be received from each processor
 
     // total number of nnz to be reveived
     unsigned nnz_total = 0;
     for (unsigned p = 0; p < nproc; p++)
      {
       nnz_total += nnz_recv[Color][p];
      }
 
     // compute nnz block start
     Vector<unsigned> nnz_start_proc;
     Vector<unsigned> nnz_start_index;
     unsigned row_ptr = target_first_row[Color][my_rank];
     int p = 0;
     unsigned nnz_ptr = 0;
     for (p = 0; p < int(nproc); p++)
      {
       if (First_row_from_proc[Color][p] == row_ptr &&
           Nrow_local_from_proc[Color][p] != 0 &&
           nnz_ptr != nnz_total)
        {
         nnz_start_proc.push_back(p);
         nnz_start_index.push_back(nnz_ptr);
         nnz_ptr += nnz_recv[Color][p];
         row_ptr += Nrow_local_from_proc[Color][p];
         p = -1;
        }
      }
 
     // storage for derived datatypes
     Vector<MPI_Datatype> datatypes;
 
     // storage for received data
     double* values_recv = new double[nnz_total];
     int* column_index_recv = new int[nnz_total];
     int* row_start_recv = new int[target_nrow_local[Color][my_rank]+1];
 
     ///////////////////////////////////////////////////////////////////////////
     // RECV
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_recv = 0;
     Vector<MPI_Request> recv_req;
 
     // receive the contents of the vector
     for (unsigned p = 0; p < nproc; p++)
      {
 
       // use mpi methods to send to and receive from all but my rank
       if (p != my_rank)
        {
 
         // just receive
         if (nnz_recv[Color][p] != 0)
          {
 
           // create 3 MPI contiguous datatypes
           // + values
           // + column_index
           // + row_start
 
           // values
           MPI_Datatype datatype_values;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_DOUBLE,
                               &datatype_values);
           MPI_Type_commit(&datatype_values);
           datatypes.push_back(datatype_values);
 
           // column index
           MPI_Datatype datatype_column_index;
           MPI_Type_contiguous(int(nnz_recv[Color][p]),MPI_INT,
                               &datatype_column_index);
           MPI_Type_commit(&datatype_column_index);
           datatypes.push_back(datatype_column_index);
 
           // row start
           MPI_Datatype datatype_row_start;
           MPI_Type_contiguous(int(Nrow_local_from_proc[Color][p]),MPI_INT,
                               &datatype_row_start);
           MPI_Type_commit(&datatype_row_start);
           datatypes.push_back(datatype_row_start);
 
           // assemble the typelist
           MPI_Datatype typelist[3];
           typelist[0] = datatype_values;
           typelist[1] = datatype_column_index;
           typelist[2] = datatype_row_start;
 
           // compute the offset for row_start
           int offset_n =
            First_row_from_proc[Color][p]-target_first_row[Color][my_rank];
 
           // compute the offset for the values and column_index
           unsigned k = 0;
           while (nnz_start_proc[k] != p)
            {
             k++;
            }
           int offset_nnz = nnz_start_index[k];
 
           // next compute the displacements
           MPI_Aint displacements[3];
           MPI_Get_address(values_recv + offset_nnz,&displacements[0]);
           MPI_Get_address(column_index_recv + offset_nnz,&displacements[1]);
           MPI_Get_address(row_start_recv + offset_n,&displacements[2]);
           for (int j = 2; j >= 0; j--)
            {
             displacements[j] -= displacements[0];
            }
 
           // set the block lengths
           int block_length[3];
           block_length[0] = block_length[1] = block_length[2] = 1;
 
           // now build the final datatype
           MPI_Datatype recv_type;
           MPI_Type_create_struct(3,block_length,displacements,typelist,
                           &recv_type);
           MPI_Type_commit(&recv_type);
           datatypes.push_back(recv_type);
 
           // recv
           int tag = this->compute_tag(nproc,p,my_rank,1);
           MPI_Request tr1;
           recv_req.push_back(tr1);
           MPI_Irecv(values_recv + offset_nnz,1,recv_type,
                     p,tag,Global_communicator_pt->mpi_comm(),
                     &recv_req[c_recv]);
           c_recv++;
          }
        }
      }
 
     ///////////////////////////////////////////////////////////////////////////
     // SEND
     ///////////////////////////////////////////////////////////////////////////
     unsigned c_send = 0;
     Vector<MPI_Request> send_req;
 
     // for every matrix we assemble the duplicate of the matrix on fewer
     // processors and setup the preconditioner
     for (unsigned i = 0; i < Nprec; i++)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[i]->value();
       int* row_start_send = matrix_pt[i]->row_start();
       int* column_index_send = matrix_pt[i]->column_index();
 
       // send and receive the contents of the vector
       for (unsigned p = 0; p < nproc; p++)
        {
 
         // use mpi methods to send to and receive from all but my rank
         if (p != my_rank)
          {
 
           // send
           if (nnz_send[i][p] != 0)
            {
 
              // create 3 MPI contiguous datatypes
             // + values
             // + column_index
             // + row_start
 
             // values
             MPI_Datatype datatype_values;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_DOUBLE,
                                 &datatype_values);
             MPI_Type_commit(&datatype_values);
             datatypes.push_back(datatype_values);
 
             // column index
             MPI_Datatype datatype_column_index;
             MPI_Type_contiguous(int(nnz_send[i][p]),MPI_INT,
                                 &datatype_column_index);
             MPI_Type_commit(&datatype_column_index);
             datatypes.push_back(datatype_column_index);
 
             // row start
             MPI_Datatype datatype_row_start;
             MPI_Type_contiguous(int(Nrow_local_for_proc[i][p]),MPI_INT,
                                 &datatype_row_start);
             MPI_Type_commit(&datatype_row_start);
             datatypes.push_back(datatype_row_start);
 
             // assemble the typelist
             MPI_Datatype typelist[3];
             typelist[0] = datatype_values;
             typelist[1] = datatype_column_index;
             typelist[2] = datatype_row_start;
 
             // compute the offset for row_start
             int offset_n =
              First_row_for_proc[i][p]-matrix_pt[i]->first_row(my_rank);
 
             // compute the offset for the values and column_index
             int offset_nnz = row_start_send[offset_n];
 
             // next compute the displacements
             MPI_Aint displacements[3];
             MPI_Get_address(values_send + offset_nnz,&displacements[0]);
             MPI_Get_address(column_index_send + offset_nnz,&displacements[1]);
             MPI_Get_address(row_start_send + offset_n,&displacements[2]);
             for (int j = 2; j >= 0; j--)
              {
               displacements[j] -= displacements[0];
              }
 
             // set the block lengths
             int block_length[3];
             block_length[0] = block_length[1] = block_length[2] = 1;
 
             // now build the final datatype
             MPI_Datatype send_type;
             MPI_Type_create_struct(3,block_length,displacements,typelist,
                             &send_type);
             MPI_Type_commit(&send_type);
             datatypes.push_back(send_type);
 
             // send
             int tag = this->compute_tag(nproc,my_rank,p,1);
             MPI_Request tr1;
             send_req.push_back(tr1);
             MPI_Isend(values_send + offset_nnz,1,send_type,
                       p,tag,Global_communicator_pt->mpi_comm(),
                       &send_req[c_send]);
             c_send++;
            }
          }
        }
      }
 
     // otherwise send to self (copy)
     if (nnz_recv[Color][my_rank] != 0)
      {
 
       // get pointers to the underlying data in the current matrix
       double* values_send = matrix_pt[Color]->value();
       int* row_start_send = matrix_pt[Color]->row_start();
       int* column_index_send = matrix_pt[Color]->column_index();
 
       // offset for row_start send to self
       unsigned offset_n_send =
        First_row_for_proc[Color][my_rank]-matrix_pt[Color]->first_row(my_rank);
 
       // offset for values and column+_index send to self
       unsigned offset_nnz_send = row_start_send[offset_n_send];
 
       // offset for row_start receive from self
       unsigned offset_n_recv =
        First_row_from_proc[Color][my_rank]-target_first_row[Color][my_rank];
 
       // offset for values and columm_index receive from self
       unsigned k = 0;
       while (nnz_start_proc[k] != my_rank)
        {
         k++;
        }
       unsigned offset_nnz_recv = nnz_start_index[k];
 
       // and send
 
       // values and column_index
       unsigned n_nnz = nnz_send[Color][my_rank];
       for (unsigned j = 0; j < n_nnz; j++)
        {
         values_recv[offset_nnz_recv + j] =
          values_send[offset_nnz_send + j];
         column_index_recv[offset_nnz_recv + j] =
          column_index_send[offset_nnz_send + j];
        }
 
       // row start
       unsigned n_n = Nrow_local_from_proc[Color][my_rank];
       for (unsigned j = 0; j < n_n; j++)
        {
         row_start_recv[offset_n_recv + j]  =
          row_start_send[offset_n_send + j];
        }
      }
 
     // create the local distribution for this matrix
     LinearAlgebraDistribution* temp_dist_pt =
      new LinearAlgebraDistribution
      (Local_communicator_pt,target_first_row[Color][my_rank],
       target_nrow_local[Color][my_rank]);
 
     // create the corresponding matrix
     local_matrix_pt = new CRDoubleMatrix(temp_dist_pt);
     delete temp_dist_pt; // (dist has now been copied)
 
     ///////////////////////////////////////////////////////////////////////////
     // and WAIT...
     ///////////////////////////////////////////////////////////////////////////
     if (c_recv!=0)
      {
       Vector<MPI_Status> recv_stat(c_recv);
       MPI_Waitall(c_recv,&recv_req[0],&recv_stat[0]);
       recv_req.clear();
       recv_stat.clear();
      }
 
     // build the matrix
 
     // update the row start
     unsigned nproc_contrib = nnz_start_index.size();
     for (unsigned j = 0; j < nproc_contrib; j++)
      {
       unsigned first = First_row_from_proc[Color][nnz_start_proc[j]] -
        target_first_row[Color][my_rank];
       unsigned last = first + Nrow_local_from_proc[Color][nnz_start_proc[j]];
       unsigned nnz_inc = nnz_start_index[j]-row_start_recv[first];
       for (unsigned k = first; k < last; k++)
        {
         row_start_recv[k]+=nnz_inc;
        }
      }
     row_start_recv[target_nrow_local[Color][my_rank]] = int(nnz_total);
 
     // build the matrix without a copy of the data
     local_matrix_pt->build_without_copy(matrix_pt[Color]->ncol(),
                                                  nnz_total,
                                                  values_recv,
                                                  column_index_recv,
                                                  row_start_recv);
 
     // and finally wait for the sends
     if (c_recv!=0)
      {
       Vector<MPI_Status> send_stat(c_recv);
       MPI_Waitall(c_send,&send_req[0],&send_stat[0]);
       send_req.clear();
       send_stat.clear();
      }
 
     // and clear the datatype
     unsigned ndatatypes = datatypes.size();
     for (unsigned i = 0; i < ndatatypes; i++)
      {
       MPI_Type_free(&datatypes[i]);
      }
    }
 
   // now setup the preconditioner
   Preconditioner_pt = prec_pt[Color];
   Preconditioner_pt->setup(local_matrix_pt);
 
   // clean up memory
   if (matrix_pt[0]->distributed())
    {
     delete local_matrix_pt;
    }
 
   // delete the preconditioners not used on this processor
   for (unsigned i = 0; i < Nprec; i++)
    {
     if (i != Color)
      {
       delete prec_pt[i];
      }
    }
  } //end of setup_preconditioners()
 
 //============================================================================
 /// \short Applies each preconditioner to the corresponding vector in
 /// r and z
 //=============================================================================
  void PreconditionerArray::solve_preconditioners(const Vector<DoubleVector> &r,
                                                  Vector<DoubleVector> &z)
  {
 #ifdef PARANOID
   // check that a preconditioner has been setup
   if (Preconditioner_pt == 0)
    {
     std::ostringstream error_message;
     error_message << "The preconditioners have not been setup.";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
 
   // check that r is the correct length
   if (r.size() != Nprec)
    {
     std::ostringstream error_message;
     error_message << "This PreconditionerArray has " << Nprec
                   << " preconditioners but r only contains "
                   << r.size() << " preconditioners.";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
 
   // check that z is the correct length
   if (z.size() != Nprec)
    {
     std::ostringstream error_message;
     error_message << "This PreconditionerArray has " << Nprec
                   << " preconditioners but z only contains "
                   << z.size() << " preconditioners.";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
   // check that the vector has the same distribution as the
   // preconditioner
   for (unsigned i = 0; i < Nprec; i++)
    {
     if (*r[i].distribution_pt() != *Distribution_pt[i])
      {
       std::ostringstream error_message;
       error_message << "The distribution of r[" << i << "] does not have the"
                     << " the same distribution as the matrix_pt[" << i
                     << "] that was passed to setup_preconditioners(...)";
       throw OomphLibError(error_message.str(),
                           OOMPH_CURRENT_FUNCTION,
                           OOMPH_EXCEPTION_LOCATION);
      }
    }
 #endif
 
   // the local r vector
   DoubleVector local_r(Preconditioner_pt->distribution_pt(),0.0);
 
   // number of processors
   unsigned nproc = Global_communicator_pt->nproc();
 
   // cache my global rank
   unsigned my_rank = Global_communicator_pt->my_rank();
 
   // send and receive requests
   Vector<MPI_Request> send_reqs;
   Vector<MPI_Request> recv_reqs;
 
   // cache first_row
   unsigned first_row = Preconditioner_pt->first_row();
 
   // local residual values for this processor
   double* local_r_values = local_r.values_pt();
 
   // for every vector we assemble the duplicate of the vector on the
   // appropirate subset of processors
 
   // first we post the non-blocking sends and recvs
   for (unsigned i = 0; i < Nprec; i++)
    {
 
     if (r[i].distributed())
      {
 
       // current first_row and nrow_local
       unsigned current_first_row = r[i].first_row();
 
       // send and receive the contents of the vector
       for (unsigned p = 0; p < nproc; p++)
        {
 
         // use mpi methods to send to and receive from all but my rank
         if (p != my_rank)
          {
 
           // send
           if (Nrow_local_for_proc[i][p] != 0)
            {
 
             // compute the offset for the values
             int offset_n =
              First_row_for_proc[i][p]-current_first_row;
 
             // send the values
             int tag = this->compute_tag(nproc,my_rank,p,0);
             MPI_Request tr;
             MPI_Isend(const_cast<double*>(r[i].values_pt())+offset_n,
                       int(Nrow_local_for_proc[i][p]),MPI_DOUBLE,p,tag,
                       Global_communicator_pt->mpi_comm(),&tr);
             send_reqs.push_back(tr);
            }
 
           // recv
           if (Nrow_local_from_proc[i][p] != 0)
            {
 
             // compute the offset for row_start
             int offset_n =
              First_row_from_proc[i][p]-first_row;
 
             // row_start
             int tag = this->compute_tag(nproc,p,my_rank,0);
             MPI_Request tr;
             MPI_Irecv(local_r_values + offset_n,
                       int(Nrow_local_from_proc[i][p]),MPI_DOUBLE,p,tag,
                       Global_communicator_pt->mpi_comm(),&tr);
             recv_reqs.push_back(tr);
            }
          }
        }
      }
    }
 
 
   // and now we send to self
   if (!r[Color].distributed())
    {
     // just copy to the new vector
     const double* r_pt = r[Color].values_pt();
     unsigned nrow_local = local_r.nrow_local();
     for (unsigned i = 0; i < nrow_local; i++)
      {
       local_r_values[i] = r_pt[i];
      }
    }
   else
    {
     // the incoming residual associated with the processor
     const double* r_pt = r[Color].values_pt();
 
     // current first_row and nrow_local
     unsigned current_first_row = r[Color].first_row();
 
     // cache first_row
     unsigned first_row = Preconditioner_pt->first_row();
 
     //
     if (Nrow_local_from_proc[Color][my_rank] != 0)
      {
       // offset for values send to self
       unsigned offset_n_send =
        First_row_for_proc[Color][my_rank]-current_first_row;
 
       // offset for values receive from self
       unsigned offset_n_recv =
        First_row_from_proc[Color][my_rank]-first_row;
 
       // send/receive
       unsigned n_n = Nrow_local_from_proc[Color][my_rank];
       for (unsigned j = 0; j < n_n; j++)
        {
         local_r_values[offset_n_recv + j]  = r_pt[offset_n_send + j];
        }
      }
    }
 
   // wait for the receives to complete
   unsigned n_recv = recv_reqs.size();
   if (n_recv)
    {
     MPI_Waitall(n_recv,&recv_reqs[0],MPI_STATUS_IGNORE);
    }
   recv_reqs.clear();
 
   // next solve
   // apply the local preconditioner
   DoubleVector local_z;
   Preconditioner_pt->preconditioner_solve(local_r,local_z);
   local_r.clear();
 
   // the local z values
   double* local_z_values = local_z.values_pt();
 
   // setup the vectors
   for (unsigned i = 0; i < Nprec; i++)
    {
 
     // if z[i] is not setup then set it up
     if (!z[i].built())
      {
       z[i].build(r[i].distribution_pt(),0.0);
      }
    }
 
   // first we post the non-blocking sends and recvs
   for (unsigned i = 0; i < Nprec; i++)
    {
     if (r[i].distributed())
      {
 
       // current first_row and nrow_local
       unsigned current_first_row = r[i].first_row();
 
       // send and receive the contents of the vector
       for (unsigned p = 0; p < nproc; p++)
        {
 
         // use mpi methods to send to and receive from all but my rank
         if (p != my_rank)
          {
 
           // send
           if (Nrow_local_for_proc[i][p] != 0)
            {
 
             // compute the offset for the values
             int offset_n =
              First_row_for_proc[i][p]-current_first_row;
 
             // send the values
             int tag = this->compute_tag(nproc,my_rank,p,0);
             MPI_Request tr;
             MPI_Irecv(z[i].values_pt() + offset_n,
                       int(Nrow_local_for_proc[i][p]),MPI_DOUBLE,p,tag,
                       Global_communicator_pt->mpi_comm(),&tr);
             recv_reqs.push_back(tr);
            }
 
           // recv
           if (Nrow_local_from_proc[i][p] != 0)
            {
 
             // compute the offset for row_start
             int offset_n =
              First_row_from_proc[i][p]-first_row;
 
             // vector
             int tag = this->compute_tag(nproc,p,my_rank,0);
             MPI_Request tr;
             MPI_Isend(local_z_values + offset_n,
                       int(Nrow_local_from_proc[i][p]),MPI_DOUBLE,p,tag,
                       Global_communicator_pt->mpi_comm(),&tr);
             send_reqs.push_back(tr);
            }
          }
        }
      }
     // otherwise we need to share the results
     else
      {
       // number of processors associated with this preconditioner
       unsigned nproc_local = Local_communicator_pt->nproc();
 
       // my "proc number" for this preconditioner
       unsigned my_local_rank = Local_communicator_pt->my_rank();
 
       // sends to self completed later
       if (i != Color)
        {
         // post send requests
         for (unsigned j = my_local_rank; j < Nproc_for_prec[i];
              j += nproc_local)
          {
           int p = j + First_proc_for_prec[i];
           MPI_Request tr;
           MPI_Isend(local_z_values,z[Color].nrow(),MPI_DOUBLE,p,0,
                     Global_communicator_pt->mpi_comm(),&tr);
           send_reqs.push_back(tr);
          }
 
         // compute the processor number to recv from
         int p = my_local_rank;
         while ((p - int(Nproc_for_prec[i])) >= 0)
          {
           p-= Nproc_for_prec[i];
          }
         p += First_proc_for_prec[i];
 
         // and recv
         MPI_Request tr;
         MPI_Irecv(z[i].values_pt(),z[i].nrow(),MPI_DOUBLE,p,0,
                   Global_communicator_pt->mpi_comm(),&tr);
         recv_reqs.push_back(tr);
        }
      }
    }
 
   // and now we send to self
   if (!r[Color].distributed())
    {
     // just copy to the new vector
     double* z_pt = z[Color].values_pt();
     unsigned nrow_local = local_z.nrow_local();
     for (unsigned i = 0; i < nrow_local; i++)
      {
       z_pt[i] = local_z_values[i];
      }
    }
   else
    {
     //
     double* z_pt = z[Color].values_pt();
 
     // current first_row and nrow_local
     unsigned current_first_row = r[Color].first_row();
 
     // cache first_row
     unsigned first_row = Preconditioner_pt->first_row();
 
     //
     if (Nrow_local_from_proc[Color][my_rank] != 0)
      {
       // offset for values send to self
       unsigned offset_n_send =
        First_row_for_proc[Color][my_rank]-current_first_row;
 
       // offset for values receive from self
       unsigned offset_n_recv =
        First_row_from_proc[Color][my_rank]-first_row;
 
       // send/receive
       unsigned n_n = Nrow_local_from_proc[Color][my_rank];
       for (unsigned j = 0; j < n_n; j++)
        {
         z_pt[offset_n_send + j]  =
          local_z_values[offset_n_recv + j];
        }
      }
    }
 
 
   // wait for the receives to complete
   n_recv = recv_reqs.size();
   if (n_recv)
    {
     MPI_Waitall(n_recv,&recv_reqs[0],MPI_STATUS_IGNORE);
    }
   recv_reqs.clear();
 
   // wait for the sends to complete
   unsigned n_send = send_reqs.size();
   if (n_send)
    {
     MPI_Waitall(n_send,&send_reqs[0],MPI_STATUS_IGNORE);
    }
   send_reqs.clear();
  }
 }
 
 // End of "if we have mpi"
 #endif