matrices.cc
1 //LIC// ====================================================================
2 //LIC// This file forms part of oomph-lib, the object-oriented,
3 //LIC// multi-physics finite-element library, available
4 //LIC// at http://www.oomph-lib.org.
5 //LIC//
6 //LIC// Version 1.0; svn revision $LastChangedRevision$
7 //LIC//
8 //LIC// $LastChangedDate$
9 //LIC//
10 //LIC// Copyright (C) 2006-2016 Matthias Heil and Andrew Hazel
11 //LIC//
12 //LIC// This library is free software; you can redistribute it and/or
13 //LIC// modify it under the terms of the GNU Lesser General Public
14 //LIC// License as published by the Free Software Foundation; either
15 //LIC// version 2.1 of the License, or (at your option) any later version.
16 //LIC//
17 //LIC// This library is distributed in the hope that it will be useful,
18 //LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
19 //LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 //LIC// Lesser General Public License for more details.
21 //LIC//
22 //LIC// You should have received a copy of the GNU Lesser General Public
23 //LIC// License along with this library; if not, write to the Free Software
24 //LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
25 //LIC// 02110-1301 USA.
26 //LIC//
27 //LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
28 //LIC//
29 //LIC//====================================================================
30 //Non-inline member functions for the matrix classes
31 
32 #ifdef OOMPH_HAS_MPI
33 #include "mpi.h"
34 #endif
35 
36 #include <cstring>
37 
38 #include <set>
39 #include <map>
40 
41 //#include <valgrind/callgrind.h>
42 
43 //oomph-lib headers
44 #include "matrices.h"
45 #include "linear_solver.h"
46 
47 
48 namespace oomph
49 {
50 
51 //============================================================================
52 /// Complete LU solve (overwrites RHS with solution). This is the
53 /// generic version which should not need to be over-written.
54 //============================================================================
55 void DoubleMatrixBase::solve(DoubleVector &rhs)
56 {
57 #ifdef PARANOID
58  if(Linear_solver_pt==0)
59  {
60  throw OomphLibError("Linear_solver_pt not set in matrix",
61  OOMPH_CURRENT_FUNCTION,
62  OOMPH_EXCEPTION_LOCATION);
63  }
64 #endif
65 
66  // Copy rhs vector into local storage so it doesn't get overwritten
67  // if the linear solver decides to initialise the solution vector, say,
68  // which it's quite entitled to do!
69  DoubleVector actual_rhs(rhs);
70 
71  //Use the linear algebra interface to the linear solver
72  Linear_solver_pt->solve(this,actual_rhs,rhs);
73 }
74 
75 //============================================================================
76 /// Complete LU solve (Nothing gets overwritten!). This generic
77 /// version should never need to be overwritten
78 //============================================================================
79 void DoubleMatrixBase::solve(const DoubleVector &rhs,
80  DoubleVector &soln)
81 {
82 #ifdef PARANOID
83  if(Linear_solver_pt==0)
84  {
85  throw OomphLibError("Linear_solver_pt not set in matrix",
86  OOMPH_CURRENT_FUNCTION,
87  OOMPH_EXCEPTION_LOCATION);
88  }
89 #endif
90  //Use the linear algebra interface to the linear solver
91  Linear_solver_pt->solve(this,rhs,soln);
92 }
93 
94 //============================================================================
95 /// Complete LU solve (overwrites RHS with solution). This is the
96 /// generic version which should not need to be over-written.
97 //============================================================================
98 void DoubleMatrixBase::solve(Vector<double> &rhs)
99 {
100 #ifdef PARANOID
101  if(Linear_solver_pt==0)
102  {
103  throw OomphLibError("Linear_solver_pt not set in matrix",
104  OOMPH_CURRENT_FUNCTION,
105  OOMPH_EXCEPTION_LOCATION);
106  }
107 #endif
108 
109  // Copy rhs vector into local storage so it doesn't get overwritten
110  // if the linear solver decides to initialise the solution vector, say,
111  // which it's quite entitled to do!
112  Vector<double> actual_rhs(rhs);
113 
114  //Use the linear algebra interface to the linear solver
115  Linear_solver_pt->solve(this,actual_rhs,rhs);
116 }
117 
118 //============================================================================
119 /// Complete LU solve (Nothing gets overwritten!). This generic
120 /// version should never need to be overwritten
121 //============================================================================
122 void DoubleMatrixBase::solve(const Vector<double> &rhs,
123  Vector<double> &soln)
124 {
125 #ifdef PARANOID
126  if(Linear_solver_pt==0)
127  {
128  throw OomphLibError("Linear_solver_pt not set in matrix",
129  OOMPH_CURRENT_FUNCTION,
130  OOMPH_EXCEPTION_LOCATION);
131  }
132 #endif
133  //Use the linear algebra interface to the linear solver
134  Linear_solver_pt->solve(this,rhs,soln);
135 }
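// An illustrative usage sketch (editorial example with made-up values, not
// part of the library source): the generic solve() interface above simply
// forwards to the matrix's linear solver, so for a DenseDoubleMatrix the
// default DenseLU solver does the work.
//
//   DenseDoubleMatrix a(2,2,0.0);
//   a(0,0)=2.0; a(0,1)=1.0;
//   a(1,0)=1.0; a(1,1)=3.0;
//   Vector<double> rhs(2);
//   rhs[0]=3.0; rhs[1]=5.0;
//   a.solve(rhs);   // rhs is overwritten with the solution (0.8, 1.4)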
136 
137 
138 ////////////////////////////////////////////////////////////////////////
139 ////////////////////////////////////////////////////////////////////////
140 ////////////////////////////////////////////////////////////////////////
141 
142 //===============================================================
143 /// Constructor, set the default linear solver to be the DenseLU
144 /// solver
145 //===============================================================
146 DenseDoubleMatrix::DenseDoubleMatrix() : DenseMatrix<double>()
147 {
148  Linear_solver_pt = Default_linear_solver_pt = new DenseLU;
149 }
150 
151 //==============================================================
152 /// Constructor to build a square n by n matrix.
153 /// Set the default linear solver to be DenseLU
154 //==============================================================
155 DenseDoubleMatrix::DenseDoubleMatrix(const unsigned long &n) :
156  DenseMatrix<double>(n)
157 {
158  Linear_solver_pt = Default_linear_solver_pt = new DenseLU;
159 }
160 
161 
162 //=================================================================
163 /// Constructor to build a matrix with n rows and m columns.
164 /// Set the default linear solver to be DenseLU
165 //=================================================================
166  DenseDoubleMatrix::DenseDoubleMatrix(const unsigned long &n,
167  const unsigned long &m) :
168  DenseMatrix<double>(n,m)
169 {
170  Linear_solver_pt = Default_linear_solver_pt = new DenseLU;
171 }
172 
173 //=====================================================================
174 /// Constructor to build a matrix with n rows and m columns,
175 /// with initial value initial_val
176 /// Set the default linear solver to be DenseLU
177 //=====================================================================
178 DenseDoubleMatrix::DenseDoubleMatrix(const unsigned long &n,
179  const unsigned long &m,
180  const double &initial_val) :
181  DenseMatrix<double>(n,m,initial_val)
182 {
183  Linear_solver_pt = Default_linear_solver_pt = new DenseLU;
184 }
185 
186 //=======================================================================
187 /// Destructor: delete the default linear solver
188 //======================================================================
189 DenseDoubleMatrix::~DenseDoubleMatrix()
190 {
191  //Delete the default linear solver
192  delete Default_linear_solver_pt;
193 }
194 
195 //============================================================================
196 /// LU decompose a matrix, by using the default linear solver
197 /// (DenseLU)
198 //============================================================================
199 void DenseDoubleMatrix::ludecompose()
200 {
201  //Use the default (DenseLU) solver to ludecompose the matrix
202  static_cast<DenseLU*>(Default_linear_solver_pt)->factorise(this);
203 }
204 
205 
206 //============================================================================
207 /// Back substitute an LU decomposed matrix.
208 //============================================================================
209 void DenseDoubleMatrix::lubksub(DoubleVector &rhs)
210 {
211  //Use the default (DenseLU) solver to perform the backsubstitution
212  static_cast<DenseLU*>(Default_linear_solver_pt)->backsub(rhs,rhs);
213 }
214 
215 //============================================================================
216 /// Back substitute an LU decomposed matrix.
217 //============================================================================
218 void DenseDoubleMatrix::lubksub(Vector<double> &rhs)
219 {
220  //Use the default (DenseLU) solver to perform the backsubstitution
221  static_cast<DenseLU*>(Default_linear_solver_pt)->backsub(rhs,rhs);
222 }
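// An illustrative sketch (editorial example, made-up values): ludecompose()
// and lubksub() let one LU factorisation be reused for several right-hand
// sides instead of calling solve() repeatedly.
//
//   DenseDoubleMatrix a(2,2,0.0);
//   a(0,0)=4.0; a(1,1)=2.0;
//   a.ludecompose();             // factorise once
//   Vector<double> rhs1(2,1.0);
//   Vector<double> rhs2(2,2.0);
//   a.lubksub(rhs1);             // rhs1 becomes (0.25, 0.5)
//   a.lubksub(rhs2);             // rhs2 becomes (0.5, 1.0)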
223 
224 
225 //============================================================================
226 /// Determine eigenvalues and eigenvectors, using
227 /// Jacobi rotations. Only for symmetric matrices. Nothing gets overwritten!
228 /// - \c eigen_vect(i,j) = j-th component of i-th eigenvector.
229 /// - \c eigen_val[i] is the i-th eigenvalue; same ordering as in eigenvectors
230 //============================================================================
231 void DenseDoubleMatrix::eigenvalues_by_jacobi(Vector<double> &eigen_vals,
232  DenseMatrix<double> &eigen_vect)
233  const
234 {
235 #ifdef PARANOID
236  // Check Matrix is square
237  if (N!=M)
238  {
239  throw OomphLibError(
240  "This matrix is not square, the matrix MUST be square!",
241  OOMPH_CURRENT_FUNCTION,
242  OOMPH_EXCEPTION_LOCATION);
243  }
244 #endif
245  // Make a copy of the matrix & check that it's symmetric
246 
247  // Check that the sizes of eigen_vals and eigen_vect are correct. If not
248  // correct them.
249  if (eigen_vals.size()!=N)
250  {
251  eigen_vals.resize(N);
252  }
253  if (eigen_vect.ncol()!=N || eigen_vect.nrow()!=N)
254  {
255  eigen_vect.resize(N);
256  }
257 
258  DenseDoubleMatrix working_matrix(N);
259  for (unsigned long i=0;i<N;i++)
260  {
261  for (unsigned long j=0;j<M;j++)
262  {
263 #ifdef PARANOID
264  if (Matrixdata[M*i+j]!=Matrixdata[M*j+i])
265  {
266  throw OomphLibError(
267  "Matrix needs to be symmetric for eigenvalues_by_jacobi()",
268 OOMPH_CURRENT_FUNCTION,
269  OOMPH_EXCEPTION_LOCATION);
270  }
271 #endif
272  working_matrix(i,j)=(*this)(i,j);
273  }
274  }
275 
276  DenseDoubleMatrix aux_eigen_vect(N);
277 
278  throw OomphLibError(
279  "Sorry JacobiEigenSolver::jacobi() removed because of licensing problems.",
280  OOMPH_CURRENT_FUNCTION,
281  OOMPH_EXCEPTION_LOCATION);
282 
283  // // Call eigensolver
284  // unsigned long nrot;
285  // JacobiEigenSolver::jacobi(working_matrix, eigen_vals, aux_eigen_vect,
286  // nrot);
287 
288  // Copy across (and transpose)
289  for (unsigned long i=0;i<N;i++)
290  {
291  for (unsigned long j=0;j<M;j++)
292  {
293  eigen_vect(i,j)=aux_eigen_vect(j,i);
294  }
295  }
296 }
297 
298 
299 //============================================================================
300 /// Multiply the matrix by the vector x: soln=Ax
301 //============================================================================
302 void DenseDoubleMatrix::multiply
303 (const DoubleVector &x, DoubleVector &soln) const
304 {
305 #ifdef PARANOID
306  // Check to see if x.size() = ncol().
307  if (x.nrow()!=this->ncol())
308  {
309  std::ostringstream error_message_stream;
310  error_message_stream
311  << "The x vector is not the right size. It is " << x.nrow()
312  << ", it should be " << this->ncol() << std::endl;
313  throw OomphLibError(error_message_stream.str(),
314 OOMPH_CURRENT_FUNCTION,
315  OOMPH_EXCEPTION_LOCATION);
316  }
317  // check that x is not distributed
318  if (x.distributed())
319  {
320  std::ostringstream error_message_stream;
321  error_message_stream
322  << "The x vector cannot be distributed for DenseDoubleMatrix "
323  << "matrix-vector multiply" << std::endl;
324  throw OomphLibError(error_message_stream.str(),
325 OOMPH_CURRENT_FUNCTION,
326  OOMPH_EXCEPTION_LOCATION);
327  }
328  // if soln is setup...
329  if (soln.built())
330  {
331  // check that soln is not distributed
332  if (soln.distributed())
333  {
334  std::ostringstream error_message_stream;
335  error_message_stream
336  << "The soln vector cannot be distributed for DenseDoubleMatrix "
337  << "matrix-vector multiply" << std::endl;
338  throw OomphLibError(error_message_stream.str(),
339 OOMPH_CURRENT_FUNCTION,
340  OOMPH_EXCEPTION_LOCATION);
341  }
342  if (soln.nrow() != this->nrow())
343  {
344  std::ostringstream error_message_stream;
345  error_message_stream
346  << "The soln vector is setup and therefore must have the same "
347  << "number of rows as the matrix";
348  throw OomphLibError(error_message_stream.str(),
349 OOMPH_CURRENT_FUNCTION,
350  OOMPH_EXCEPTION_LOCATION);
351  }
352  if (*x.distribution_pt()->communicator_pt() !=
353  *soln.distribution_pt()->communicator_pt())
354  {
355  std::ostringstream error_message_stream;
356  error_message_stream
357  << "The soln vector and the x vector must have the same communicator"
358  << std::endl;
359  throw OomphLibError(error_message_stream.str(),
360 OOMPH_CURRENT_FUNCTION,
361  OOMPH_EXCEPTION_LOCATION);
362  }
363  }
364 #endif
365 
366  // if soln is not setup then setup the distribution
367  if (!soln.built())
368  {
369  LinearAlgebraDistribution dist(x.distribution_pt()->communicator_pt(),
370  this->nrow(),false);
371  soln.build(&dist,0.0);
372  }
373 
374  // Initialise the solution
375  soln.initialise(0.0);
376 
377  // Multiply the matrix A, by the vector x
378  const double* x_pt = x.values_pt();
379  double* soln_pt = soln.values_pt();
380  for (unsigned long i=0;i<N;i++)
381  {
382  for (unsigned long j=0;j<M;j++)
383  {
384  soln_pt[i] += Matrixdata[M*i+j]*x_pt[j];
385  }
386  }
387 }
388 
389 
390 //=================================================================
391 /// Multiply the transposed matrix by the vector x: soln=A^T x
392 //=================================================================
393 void DenseDoubleMatrix::multiply_transpose(const DoubleVector &x,
394  DoubleVector &soln) const
395 {
396 #ifdef PARANOID
397  // Check to see if x.size() = ncol().
398  if (x.nrow()!=this->nrow())
399  {
400  std::ostringstream error_message_stream;
401  error_message_stream
402  << "The x vector is not the right size. It is " << x.nrow()
403  << ", it should be " << this->nrow() << std::endl;
404  throw OomphLibError(error_message_stream.str(),
405 OOMPH_CURRENT_FUNCTION,
406  OOMPH_EXCEPTION_LOCATION);
407  }
408  // check that x is not distributed
409  if (x.distributed())
410  {
411  std::ostringstream error_message_stream;
412  error_message_stream
413  << "The x vector cannot be distributed for DenseDoubleMatrix "
414  << "matrix-vector multiply" << std::endl;
415  throw OomphLibError(error_message_stream.str(),
416 OOMPH_CURRENT_FUNCTION,
417  OOMPH_EXCEPTION_LOCATION);
418  }
419  // if soln is setup...
420  if (soln.built())
421  {
422  // check that soln is not distributed
423  if (soln.distributed())
424  {
425  std::ostringstream error_message_stream;
426  error_message_stream
427  << "The soln vector cannot be distributed for DenseDoubleMatrix "
428  << "matrix-vector multiply" << std::endl;
429  throw OomphLibError(error_message_stream.str(),
430 OOMPH_CURRENT_FUNCTION,
431  OOMPH_EXCEPTION_LOCATION);
432  }
433  if (soln.nrow() != this->ncol())
434  {
435  std::ostringstream error_message_stream;
436  error_message_stream
437  << "The soln vector is setup and therefore must have the same "
438  << "number of columns as the matrix";
439  throw OomphLibError(error_message_stream.str(),
440 OOMPH_CURRENT_FUNCTION,
441  OOMPH_EXCEPTION_LOCATION);
442  }
443  if (*soln.distribution_pt()->communicator_pt() !=
444  *x.distribution_pt()->communicator_pt())
445  {
446  std::ostringstream error_message_stream;
447  error_message_stream
448  << "The soln vector and the x vector must have the same communicator"
449  << std::endl;
450  throw OomphLibError(error_message_stream.str(),
451 OOMPH_CURRENT_FUNCTION,
452  OOMPH_EXCEPTION_LOCATION);
453  }
454  }
455 #endif
456 
457  // if soln is not setup then setup the distribution
458  if (!soln.built())
459  {
460  LinearAlgebraDistribution* dist_pt =
461  new LinearAlgebraDistribution(x.distribution_pt()->communicator_pt(),
462  this->ncol(),false);
463  soln.build(dist_pt,0.0);
464  delete dist_pt;
465  }
466 
467  // Initialise the solution
468  soln.initialise(0.0);
469 
470  // Matrix vector product
471  double* soln_pt = soln.values_pt();
472  const double* x_pt = x.values_pt();
473  for (unsigned long i=0;i<N;i++)
474  {
475  for (unsigned long j=0;j<M;j++)
476  {
477  soln_pt[j] += Matrixdata[M*i+j]*x_pt[i];
478  }
479  }
480 }
481 
482 
483 
484 //=================================================================
485 /// For every row, find the maximum absolute value of the
486 /// entries in this row. Set all values that are less than alpha times
487 /// this maximum to zero and return the resulting matrix in
488 /// reduced_matrix. Note: Diagonal entries are retained regardless
489 /// of their size.
490 //=================================================================
491 void DenseDoubleMatrix::matrix_reduction(const double &alpha,
492  DenseDoubleMatrix &reduced_matrix)
493 {
494 
495  reduced_matrix.resize(N,M,0.0);
496  // maximum value in a row
497  double max_row;
498 
499  // Loop over rows
500  for(unsigned i=0;i<N;i++)
501  {
502  // Initialise max value in row
503  max_row=0.0;
504 
505  //Loop over entries in columns
506  for(unsigned long j=0;j<M;j++)
507  {
508  // Find max. value in row
509  if(std::fabs( Matrixdata[M*i+j])>max_row)
510  {
511  max_row=std::fabs( Matrixdata[M*i+j]);
512  }
513  }
514 
515  // Decide if we need to retain the entries in the row
516  for(unsigned long j=0;j<M;j++)
517  {
518  // If we're on the diagonal or the value is sufficiently large: retain
519  // i.e. copy across.
520  if(i==j || std::fabs(Matrixdata[M*i+j])>alpha*max_row )
521  {
522  reduced_matrix(i,j) =Matrixdata[M*i+j];
523  }
524  }
525  }
526 
527 }
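// An illustrative sketch (editorial example, made-up values): dropping small
// off-diagonal entries with matrix_reduction(). With alpha=0.5 every entry
// smaller than half of its row maximum is discarded; diagonal entries are
// always retained.
//
//   DenseDoubleMatrix a(2,2,0.0);
//   a(0,0)=10.0; a(0,1)=0.1;
//   a(1,0)=0.2;  a(1,1)=5.0;
//   DenseDoubleMatrix reduced;
//   a.matrix_reduction(0.5,reduced);   // reduced = diag(10,5)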
528 
529 
530 //=============================================================================
531 /// Function to multiply this matrix by the DenseDoubleMatrix matrix_in.
532 //=============================================================================
533 void DenseDoubleMatrix::multiply(const DenseDoubleMatrix &matrix_in,
534  DenseDoubleMatrix& result)
535 {
536 #ifdef PARANOID
537  // check matrix dimensions are compatible
538  if ( this->ncol() != matrix_in.nrow() )
539  {
540  std::ostringstream error_message;
541  error_message
542  << "Matrix dimensions incompatible for matrix-matrix multiplication\n"
543  << "ncol() for first matrix:" << this->ncol()
544  << "nrow() for second matrix: " << matrix_in.nrow();
545 
546  throw OomphLibError(error_message.str(),
547 OOMPH_CURRENT_FUNCTION,
548  OOMPH_EXCEPTION_LOCATION);
549  }
550 #endif
551 
552  // NB N is number of rows!
553  unsigned long n_row = this->nrow();
554  unsigned long m_col = matrix_in.ncol();
555 
556  // resize and initialize result
557  result.resize(n_row, m_col, 0.0);
558 
559  //clock_t clock1 = clock();
560 
561  // do calculation
562  unsigned long n_col=this->ncol();
563  for (unsigned long k=0; k<n_col; k++)
564  {
565  for (unsigned long i=0; i<n_row; i++)
566  {
567  for (unsigned long j=0; j<m_col; j++)
568  {
569  result(i,j) += Matrixdata[n_col*i+k] * matrix_in(k,j);
570  }
571  }
572  }
573 }
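// An illustrative sketch (editorial example, made-up values): forming the
// dense matrix-matrix product C = A*B via the multiply() member defined
// above.
//
//   DenseDoubleMatrix a(2,2,0.0), b(2,2,0.0), c;
//   a(0,0)=1.0; a(0,1)=2.0; a(1,1)=1.0;
//   b(0,0)=3.0; b(1,1)=4.0;
//   a.multiply(b,c);   // c = [[3,8],[0,4]]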
574 
575 
576 
577 ///////////////////////////////////////////////////////////////////////////////
578 ///////////////////////////////////////////////////////////////////////////////
579 ///////////////////////////////////////////////////////////////////////////////
580 
581 
582 //=======================================================================
583 /// \short Default constructor, set the default linear solver and
584 /// matrix-matrix multiplication method.
585 //========================================================================
586  CCDoubleMatrix::CCDoubleMatrix() : CCMatrix<double>()
587  {
588   Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
589   Matrix_matrix_multiply_method = 2;
590  }
591 
592 //========================================================================
593  /// \short Constructor: Pass vector of values, vector of row indices,
594  /// vector of column starts and number of rows (can be suppressed
595  /// for square matrices). Number of nonzero entries is read
596  /// off from value, so make sure the vector has been shrunk
597  /// to its correct length.
598 //=======================================================================
599  CCDoubleMatrix::CCDoubleMatrix(const Vector<double>& value,
600  const Vector<int>& row_index,
601  const Vector<int>& column_start,
602  const unsigned long &n,
603  const unsigned long &m) :
604  CCMatrix<double>(value,row_index,column_start,n,m)
605  {
606   Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
607   Matrix_matrix_multiply_method = 2;
608  }
609 
610  /// Destructor: delete the default linear solver
611  CCDoubleMatrix::~CCDoubleMatrix() {delete Default_linear_solver_pt;}
612 
613 
614 //===================================================================
615 /// Perform LU decomposition. Return the sign of the determinant
616 //===================================================================
617 void CCDoubleMatrix::ludecompose()
618 {
619  static_cast<SuperLUSolver*>(Default_linear_solver_pt)->factorise(this);
620 }
621 
622 //===================================================================
623 /// Do the backsubstitution
624 //===================================================================
625 void CCDoubleMatrix::lubksub(DoubleVector &rhs)
626 {
627  static_cast<SuperLUSolver*>(Default_linear_solver_pt)->backsub(rhs,rhs);
628 }
629 
630 //===================================================================
631 /// Multiply the matrix by the vector x
632 //===================================================================
633 void CCDoubleMatrix::multiply(const DoubleVector &x, DoubleVector &soln) const
634 {
635 #ifdef PARANOID
636  // Check to see if x.size() = ncol().
637  if (x.nrow()!=this->ncol())
638  {
639  std::ostringstream error_message_stream;
640  error_message_stream
641  << "The x vector is not the right size. It is " << x.nrow()
642  << ", it should be " << this->ncol() << std::endl;
643  throw OomphLibError(error_message_stream.str(),
644 OOMPH_CURRENT_FUNCTION,
645  OOMPH_EXCEPTION_LOCATION);
646  }
647  // check that x is not distributed
648  if (x.distributed())
649  {
650  std::ostringstream error_message_stream;
651  error_message_stream
652  << "The x vector cannot be distributed for CCDoubleMatrix "
653  << "matrix-vector multiply" << std::endl;
654  throw OomphLibError(error_message_stream.str(),
655 OOMPH_CURRENT_FUNCTION,
656  OOMPH_EXCEPTION_LOCATION);
657  }
658  // if soln is setup...
659  if (soln.built())
660  {
661  // check that soln is not distributed
662  if (soln.distributed())
663  {
664  std::ostringstream error_message_stream;
665  error_message_stream
666  << "The soln vector cannot be distributed for CCDoubleMatrix "
667  << "matrix-vector multiply" << std::endl;
668  throw OomphLibError(error_message_stream.str(),
669 OOMPH_CURRENT_FUNCTION,
670  OOMPH_EXCEPTION_LOCATION);
671  }
672  if (soln.nrow() != this->nrow())
673  {
674  std::ostringstream error_message_stream;
675  error_message_stream
676  << "The soln vector is setup and therefore must have the same "
677  << "number of rows as the matrix";
678  throw OomphLibError(error_message_stream.str(),
679 OOMPH_CURRENT_FUNCTION,
680  OOMPH_EXCEPTION_LOCATION);
681  }
682  if (*soln.distribution_pt()->communicator_pt() !=
683  *x.distribution_pt()->communicator_pt())
684  {
685  std::ostringstream error_message_stream;
686  error_message_stream
687  << "The soln vector and the x vector must have the same communicator"
688  << std::endl;
689  throw OomphLibError(error_message_stream.str(),
690 OOMPH_CURRENT_FUNCTION,
691  OOMPH_EXCEPTION_LOCATION);
692  }
693  }
694 #endif
695 
696  // if soln is not setup then setup the distribution
697  if (!soln.built())
698  {
699  LinearAlgebraDistribution* dist_pt =
700  new LinearAlgebraDistribution(x.distribution_pt()->communicator_pt(),
701  this->nrow(),false);
702  soln.build(dist_pt,0.0);
703  delete dist_pt;
704  }
705 
706  // zero
707  soln.initialise(0.0);
708 
709  // multiply
710  double* soln_pt = soln.values_pt();
711  const double* x_pt = x.values_pt();
712  for (unsigned long j=0;j<N;j++)
713  {
714  for (long k=Column_start[j];k<Column_start[j+1];k++)
715  {
716  unsigned long i = Row_index[k];
717  double a_ij = Value[k];
718  soln_pt[i] += a_ij*x_pt[j];
719  }
720  }
721 }
722 
723 
724 
725 
726 //=================================================================
727 /// Multiply the transposed matrix by the vector x: soln=A^T x
728 //=================================================================
729 void CCDoubleMatrix::multiply_transpose(const DoubleVector &x,
730  DoubleVector &soln) const
731 {
732 #ifdef PARANOID
733  // Check to see if x.size() = ncol().
734  if (x.nrow()!=this->nrow())
735  {
736  std::ostringstream error_message_stream;
737  error_message_stream
738  << "The x vector is not the right size. It is " << x.nrow()
739  << ", it should be " << this->nrow() << std::endl;
740  throw OomphLibError(error_message_stream.str(),
741 OOMPH_CURRENT_FUNCTION,
742  OOMPH_EXCEPTION_LOCATION);
743  }
744  // check that x is not distributed
745  if (x.distributed())
746  {
747  std::ostringstream error_message_stream;
748  error_message_stream
749  << "The x vector cannot be distributed for CCDoubleMatrix "
750  << "matrix-vector multiply" << std::endl;
751  throw OomphLibError(error_message_stream.str(),
752 OOMPH_CURRENT_FUNCTION,
753  OOMPH_EXCEPTION_LOCATION);
754  }
755  // if soln is setup...
756  if (soln.built())
757  {
758  // check that soln is not distributed
759  if (soln.distributed())
760  {
761  std::ostringstream error_message_stream;
762  error_message_stream
763  << "The soln vector cannot be distributed for CCDoubleMatrix "
764  << "matrix-vector multiply" << std::endl;
765  throw OomphLibError(error_message_stream.str(),
766 OOMPH_CURRENT_FUNCTION,
767  OOMPH_EXCEPTION_LOCATION);
768  }
769  if (soln.nrow() != this->ncol())
770  {
771  std::ostringstream error_message_stream;
772  error_message_stream
773  << "The soln vector is setup and therefore must have the same "
774  << "number of columns as the matrix";
775  throw OomphLibError(error_message_stream.str(),
776 OOMPH_CURRENT_FUNCTION,
777  OOMPH_EXCEPTION_LOCATION);
778  }
779  if (*soln.distribution_pt()->communicator_pt() !=
780  *x.distribution_pt()->communicator_pt())
781  {
782  std::ostringstream error_message_stream;
783  error_message_stream
784  << "The soln vector and the x vector must have the same communicator"
785  << std::endl;
786  throw OomphLibError(error_message_stream.str(),
787 OOMPH_CURRENT_FUNCTION,
788  OOMPH_EXCEPTION_LOCATION);
789  }
790  }
791 #endif
792 
793  // if soln is not setup then setup the distribution
794  if (!soln.built())
795  {
796  LinearAlgebraDistribution* dist_pt =
797  new LinearAlgebraDistribution(x.distribution_pt()->communicator_pt(),
798  this->ncol(),false);
799  soln.build(dist_pt,0.0);
800  delete dist_pt;
801  }
802 
803  // zero
804  soln.initialise(0.0);
805 
806  // Matrix vector product
807  double* soln_pt = soln.values_pt();
808  const double* x_pt = x.values_pt();
809  for (unsigned long i=0;i<N;i++)
810  {
811 
812  for (long k=Column_start[i];k<Column_start[i+1];k++)
813  {
814  unsigned long j=Row_index[k];
815  double a_ij=Value[k];
816  soln_pt[j]+=a_ij*x_pt[i];
817  }
818  }
819 }
820 
821 
822 
823 
824 //===========================================================================
825 /// Function to multiply this matrix by the CCDoubleMatrix matrix_in
826 /// The multiplication method used can be selected using the flag
827 /// Matrix_matrix_multiply_method. By default Method 2 is used.
828 /// Method 1: First runs through this matrix and matrix_in to find the storage
829 /// requirements for result - arrays of the correct size are
830 /// then allocated before performing the calculation.
831 /// Minimises memory requirements but more costly.
832 /// Method 2: Grows storage for values and column indices of result 'on the
833 /// fly' using an array of maps. Faster but more memory
834 /// intensive.
835 /// Method 3: Grows storage for values and column indices of result 'on the
836 /// fly' using a vector of vectors. Not particularly impressive
837 /// on the platforms we tried...
838 //=============================================================================
839 void CCDoubleMatrix::multiply(const CCDoubleMatrix& matrix_in,
840  CCDoubleMatrix& result)
841 {
842 #ifdef PARANOID
843  // check matrix dimensions are compatible
844  if ( this->ncol() != matrix_in.nrow() )
845  {
846  std::ostringstream error_message;
847  error_message
848  << "Matrix dimensions incompatible for matrix-matrix multiplication\n"
849  << "ncol() for first matrix:" << this->ncol()
850  << "nrow() for second matrix: " << matrix_in.nrow();
851 
852  throw OomphLibError(error_message.str(),
853 OOMPH_CURRENT_FUNCTION,
854  OOMPH_EXCEPTION_LOCATION);
855  }
856 #endif
857 
858  // NB N is number of rows!
859  unsigned long N = this->nrow();
860  unsigned long M = matrix_in.ncol();
861  unsigned long Nnz = 0;
862 
863  // pointers to arrays which store result
864  int* Column_start;
865  double* Value;
866  int* Row_index;
867 
868  // get pointers to matrix_in
869  const int* matrix_in_col_start = matrix_in.column_start();
870  const int* matrix_in_row_index = matrix_in.row_index();
871  const double* matrix_in_value = matrix_in.value();
872 
873  // get pointers to this matrix
874  const double* this_value = this->value();
875  const int* this_col_start = this->column_start();
876  const int* this_row_index = this->row_index();
877 
878  // set method
879  unsigned method = Matrix_matrix_multiply_method;
880 
881  // clock_t clock1 = clock();
882 
883  // METHOD 1
884  // --------
885  if (method==1)
886  {
887  // allocate storage for column starts
888  Column_start = new int[M+1];
889  Column_start[0]=0;
890 
891  // a set to store number of non-zero rows in each column of result
892  std::set<unsigned> rows;
893 
894  // run through columns of this matrix and matrix_in to find number of
895  // non-zero entries in each column of result
896  for (unsigned long this_col = 0; this_col<M; this_col++)
897  {
898  // run through non-zeros in this_col of this matrix
899  for (int this_ptr = this_col_start[this_col];
900  this_ptr < this_col_start[this_col+1];
901  this_ptr++)
902  {
903  // find row index for non-zero
904  unsigned matrix_in_col = this_row_index[this_ptr];
905 
906  // run through corresponding column in matrix_in
907  for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
908  matrix_in_ptr < matrix_in_col_start[matrix_in_col+1];
909  matrix_in_ptr++)
910  {
911  // find row index for non-zero in matrix_in and store in rows
912  rows.insert(matrix_in_row_index[matrix_in_ptr]);
913  }
914  }
915  // update Column_start
916  Column_start[this_col+1] = Column_start[this_col] + rows.size();
917 
918  // wipe values in rows
919  rows.clear();
920  }
921 
922  // set Nnz
923  Nnz = Column_start[M];
924 
925  // allocate arrays for result
926  Value = new double[Nnz];
927  Row_index = new int[Nnz];
928 
929  // set all values of Row_index to -1
930  for (unsigned long i=0;i<Nnz;i++)
931  Row_index[i] = -1;
932 
933  // Calculate values for result - first run through columns of this matrix
934  for (unsigned long this_col = 0; this_col<M; this_col++)
935  {
936  // run through non-zeros in this_column
937  for (int this_ptr = this_col_start[this_col];
938  this_ptr < this_col_start[this_col+1];
939  this_ptr++)
940  {
941  // find value of non-zero
942  double this_val = this_value[this_ptr];
943 
944  // find row associated with non-zero
945  unsigned matrix_in_col = this_row_index[this_ptr];
946 
947  // run through corresponding column in matrix_in
948  for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
949  matrix_in_ptr < matrix_in_col_start[matrix_in_col+1];
950  matrix_in_ptr++)
951  {
952  // find row index for non-zero in matrix_in
953  int row = matrix_in_row_index[matrix_in_ptr];
954 
955  // find position in result to insert value
956  for(int ptr = Column_start[this_col];
957  ptr <= Column_start[this_col+1];
958  ptr++)
959  {
960  if (ptr == Column_start[this_col+1])
961  {
962  // error - have passed end of column without finding
963  // correct row index
964  std::ostringstream error_message;
965  error_message << "Error inserting value in result";
966 
967  throw OomphLibError(error_message.str(),
968  OOMPH_CURRENT_FUNCTION,
969  OOMPH_EXCEPTION_LOCATION);
970  }
971  else if (Row_index[ptr] == -1 )
972  {
973  // first entry for this row index
974  Row_index[ptr] = row;
975  Value[ptr] = this_val * matrix_in_value[matrix_in_ptr];
976  break;
977  }
978  else if ( Row_index[ptr] == row )
979  {
980  // row index already exists - add value
981  Value[ptr] += this_val * matrix_in_value[matrix_in_ptr];
982  break;
983  }
984  }
985  }
986  }
987  }
988  }
989 
990  // METHOD 2
991  // --------
992  else if (method==2)
993  {
994  // generate array of maps to store values for result
995  std::map<int,double>* result_maps = new std::map<int,double>[M];
996 
997  // run through columns of this matrix
998  for (unsigned long this_col = 0; this_col<M; this_col++)
999  {
1000  // run through non-zeros in this_col
1001  for (int this_ptr = this_col_start[this_col];
1002  this_ptr < this_col_start[this_col+1];
1003  this_ptr++)
1004  {
1005  // find value of non-zero
1006  double this_val = this_value[this_ptr];
1007 
1008  // find row index associated with non-zero
1009  unsigned matrix_in_col = this_row_index[this_ptr];
1010 
1011  // run through corresponding column in matrix_in
1012  for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
1013  matrix_in_ptr < matrix_in_col_start[matrix_in_col+1];
1014  matrix_in_ptr++)
1015  {
1016  // find row index for non-zero in matrix_in
1017  int row = matrix_in_row_index[matrix_in_ptr];
1018 
1019  // insert value
1020  result_maps[this_col][row] +=
1021  this_val * matrix_in_value[matrix_in_ptr];
1022  }
1023  }
1024  }
1025 
1026  // allocate Column_start
1027  Column_start = new int[M+1];
1028 
1029  // copy across column starts
1030  Column_start[0] = 0;
1031  for (unsigned long col=0; col<M; col++)
1032  {
1033  int size = result_maps[col].size();
1034  Column_start[col+1] = Column_start[col] + size;
1035  }
1036 
1037  // set Nnz
1038  Nnz = Column_start[M];
1039 
1040  // allocate other arrays
1041  Value = new double[Nnz];
1042  Row_index = new int[Nnz];
1043 
1044  // copy values and row indices
1045  for (unsigned long col=0; col<M; col++)
1046  {
1047  unsigned ptr = Column_start[col];
1048  for (std::map<int,double>::iterator i = result_maps[col].begin();
1049  i != result_maps[col].end();
1050  i ++)
1051  {
1052  Row_index[ptr]= i->first;
1053  Value[ptr] = i->second;
1054  ptr++;
1055  }
1056  }
1057 
1058  // tidy up memory
1059  delete[] result_maps;
1060  }
1061 
1062  // METHOD 3
1063  // --------
1064  else if (method==3)
1065  {
1066  // vectors of vectors to store results
1067  std::vector< std::vector<int> > result_rows(N);
1068  std::vector< std::vector<double> > result_vals(N);
1069 
1070  // run through the columns of this matrix
1071  for (unsigned long this_col = 0; this_col<M; this_col++)
1072  {
1073  // run through non-zeros in this_col
1074  for (int this_ptr = this_col_start[this_col];
1075  this_ptr < this_col_start[this_col+1];
1076  this_ptr++)
1077  {
1078  // find value of non-zero
1079  double this_val = this_value[this_ptr];
1080 
1081  // find row index associated with non-zero
1082  unsigned matrix_in_col = this_row_index[this_ptr];
1083 
1084  // run through corresponding column in matrix_in
1085  for (int matrix_in_ptr = matrix_in_col_start[matrix_in_col];
1086  matrix_in_ptr < matrix_in_col_start[matrix_in_col+1];
1087  matrix_in_ptr++)
1088  {
1089  // find row index for non-zero in matrix_in
1090  int row = matrix_in_row_index[matrix_in_ptr];
1091 
1092  // insert value
1093  int size = result_rows[this_col].size();
1094  for (int i = 0; i<=size; i++)
1095  {
1096  if (i==size)
1097  {
1098  // first entry for this row index
1099  result_rows[this_col].push_back(row);
1100  result_vals[this_col].push_back(
1101  this_val*matrix_in_value[matrix_in_ptr]);
1102  }
1103  else if (row==result_rows[this_col][i])
1104  {
1105  // row index already exists
1106  result_vals[this_col][i] +=
1107  this_val * matrix_in_value[matrix_in_ptr];
1108  break;
1109  }
1110  }
1111  }
1112  }
1113  }
1114 
1115  // allocate Column_start
1116  Column_start = new int[M+1];
1117 
1118  // copy across column starts
1119  Column_start[0] = 0;
1120  for (unsigned long col=0; col<M; col++)
1121  {
1122  int size = result_rows[col].size();
1123  Column_start[col+1] = Column_start[col] + size;
1124  }
1125 
1126  // set Nnz
1127  Nnz = Column_start[M];
1128 
1129  // allocate other arrays
1130  Value = new double[Nnz];
1131  Row_index = new int[Nnz];
1132 
1133  // copy across values and row indices
1134  for (unsigned long col=0; col<N; col++)
1135  {
1136  unsigned ptr = Column_start[col];
1137  unsigned n_rows=result_rows[col].size();
1138  for (unsigned i = 0; i < n_rows ; i++)
1139  {
1140  Row_index[ptr] = result_rows[col][i];
1141  Value[ptr] = result_vals[col][i];
1142  ptr++;
1143  }
1144  }
1145  }
1146 
1147  // INCORRECT VALUE FOR METHOD
1148  else
1149  {
1150  std::ostringstream error_message;
1151  error_message << "Incorrect method set in matrix-matrix multiply: "
1152  << "method=" << method << " not allowed";
1153 
1154  throw OomphLibError(error_message.str(),
1155 OOMPH_CURRENT_FUNCTION,
1156  OOMPH_EXCEPTION_LOCATION);
1157  }
1158 
1159  result.build_without_copy(Value, Row_index, Column_start, Nnz, N, M);
1160 }
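// An illustrative sketch (editorial example, made-up values): building two
// small compressed-column matrices and multiplying them. The matrix is
// A = [[1,2],[0,3]], stored column by column; the flag
// Matrix_matrix_multiply_method (default 2) selects which of the three
// algorithms above is used.
//
//   Vector<double> val(3);  val[0]=1.0; val[1]=2.0; val[2]=3.0;
//   Vector<int> r_index(3); r_index[0]=0; r_index[1]=0; r_index[2]=1;
//   Vector<int> c_start(3); c_start[0]=0; c_start[1]=1; c_start[2]=3;
//   CCDoubleMatrix a(val,r_index,c_start,2,2);
//   CCDoubleMatrix b(val,r_index,c_start,2,2);
//   CCDoubleMatrix c;
//   a.multiply(b,c);   // c = A*A = [[1,8],[0,9]]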
1161 
1162 
1163 
1164 //=================================================================
1165 /// For every row, find the maximum absolute value of the
1166 /// entries in this row. Set all values that are less than alpha times
1167 /// this maximum to zero and return the resulting matrix in
1168 /// reduced_matrix. Note: Diagonal entries are retained regardless
1169 /// of their size.
1170 //=================================================================
1171 void CCDoubleMatrix::matrix_reduction(const double &alpha,
1172  CCDoubleMatrix &reduced_matrix)
1173 {
1174  // number of columns in matrix
1175  long n_coln=ncol();
1176 
1177  Vector<double>max_row(nrow(),0.0);
1178 
1179  // Here's the packed format for the new matrix
1180  Vector<int> B_row_start(1);
1181  Vector<int> B_column_index;
1182  Vector<double> B_value;
1183 
1184 
1185  // k is counter for the number of entries in the reduced matrix
1186  unsigned k=0;
1187 
1188  // Initialise row start
1189  B_row_start[0]=0;
1190 
1191  // Loop over columns
1192  for(long i=0;i<n_coln;i++)
1193  {
1194 
1195  //Loop over entries in columns
1196  for(long j=Column_start[i];j<Column_start[i+1];j++)
1197  {
1198 
1199  // Find max. value in row
1200  if(std::fabs(Value[j])>max_row[Row_index[j]])
1201  {
1202  max_row[Row_index[j]]=std::fabs(Value[j]);
1203  }
1204  }
1205 
1206  // Decide if we need to retain the entries in the row
1207  for(long j=Column_start[i];j<Column_start[i+1];j++)
1208  {
1209  // If we're on the diagonal or the value is sufficiently large: retain
1210  // i.e. copy across.
1211  if(i==Row_index[j] || std::fabs(Value[j])>alpha*max_row[Row_index[j]] )
1212  {
1213  B_value.push_back(Value[j]);
1214  B_column_index.push_back(Row_index[j]);
1215  k++;
1216  }
1217  }
1218  // This writes the row start for the next row -- equal
1219  // to the number of entries written so far (C++ zero-based indexing!)
1220  B_row_start.push_back(k);
1221  }
1222 
1223 
1224  // Build the matrix from the compressed format
1225  dynamic_cast<CCDoubleMatrix&>(reduced_matrix).
1226  build(B_value,B_column_index,B_row_start,nrow(),ncol());
1227  }
1228 
1229 
1230 
1231 ///////////////////////////////////////////////////////////////////////////////
1232 ///////////////////////////////////////////////////////////////////////////////
1233 ///////////////////////////////////////////////////////////////////////////////
1234 
1235 //=============================================================================
1236 /// Default constructor
1237 //=============================================================================
1238  CRDoubleMatrix::CRDoubleMatrix()
1239  {
1240  // set the default solver
1241  Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
1242 
1243  // matrix not built
1244  Built = false;
1245 
1246  // set the serial matrix-matrix multiply method
1247 #ifdef OOMPH_HAS_TRILINOS
1248 // Serial_matrix_matrix_multiply_method = 4;
1249  Serial_matrix_matrix_multiply_method = 2;
1250 #else
1251  Serial_matrix_matrix_multiply_method = 2;
1252 #endif
1253  }
1254 
1255 //=============================================================================
1256 /// Copy constructor
1257 //=============================================================================
1258 CRDoubleMatrix::CRDoubleMatrix(const CRDoubleMatrix& other_matrix)
1259 {
1260  // copy the distribution
1261  this->build_distribution(other_matrix.distribution_pt());
1262 
1263  // copy coefficients
1264  const double* values_pt = other_matrix.value();
1265  const int* column_indices = other_matrix.column_index();
1266  const int* row_start = other_matrix.row_start();
1267 
1268  // This is the local nnz.
1269  const unsigned nnz = other_matrix.nnz();
1270 
1271  // Using number of local rows since the underlying CRMatrix is local to
1272  // each processor.
1273  const unsigned nrow_local = other_matrix.nrow_local();
1274 
1275  // Storage for the (yet to be copied) data.
1276  double* my_values_pt = new double[nnz];
1277  int* my_column_indices = new int[nnz];
1278  int* my_row_start = new int[nrow_local+1];
1279 
1280  // Copying over the data.
1281  std::copy(values_pt,values_pt+nnz,my_values_pt);
1282  std::copy(column_indices,column_indices+nnz,my_column_indices);
1283  std::copy(row_start,row_start+nrow_local+1,my_row_start);
1284 
1285 
1286  // Build without copy since we have made a deep copy of the data structure.
1287  this->build_without_copy(other_matrix.ncol(),nnz,my_values_pt,
1288  my_column_indices,my_row_start);
1289 
1290  // set the default solver
1291  Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
1292 
1293  // matrix is built
1294  Built = true;
1295 
1296  // set the serial matrix-matrix multiply method
1297 #ifdef OOMPH_HAS_TRILINOS
1298 // Serial_matrix_matrix_multiply_method = 4;
1299  Serial_matrix_matrix_multiply_method = 2;
1300 #else
1301  Serial_matrix_matrix_multiply_method = 2;
1302 #endif
1303 }
1304 
1305 
1306 //=============================================================================
1307 /// Constructor: just stores the distribution but does not build the
1308 /// matrix
1309 //=============================================================================
1310  CRDoubleMatrix::CRDoubleMatrix(const LinearAlgebraDistribution*
1311  distribution_pt)
1312  {
1313  this->build_distribution(distribution_pt);
1314 
1315  // set the default solver
1316  Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
1317 
1318  // matrix not built
1319  Built = false;
1320 
1321 // set the serial matrix-matrix multiply method
1322 #ifdef OOMPH_HAS_TRILINOS
1323 // Serial_matrix_matrix_multiply_method = 4;
1324  Serial_matrix_matrix_multiply_method = 2;
1325 #else
1326  Serial_matrix_matrix_multiply_method = 2;
1327 #endif
1328 
1329  }
1330 
1331 //=============================================================================
1332 /// \short Constructor: Takes the distribution and the number of columns, as
1333 /// well as the vector of values, vector of column indices, vector of row
1334 /// starts.
1335 //=============================================================================
1336 CRDoubleMatrix::CRDoubleMatrix(const LinearAlgebraDistribution* dist_pt,
1337  const unsigned& ncol,
1338  const Vector<double>& value,
1339  const Vector<int>& column_index,
1340  const Vector<int>& row_start)
1341 {
1342  // build the compressed row matrix
1343  CR_matrix.build(value,column_index,row_start,dist_pt->nrow_local(),ncol);
1344 
1345  // store the Distribution
1346  this->build_distribution(dist_pt);
1347 
1348  // set the linear solver
1349  Linear_solver_pt = Default_linear_solver_pt = new SuperLUSolver;
1350 
1351  // set the serial matrix-matrix multiply method
1352 #ifdef OOMPH_HAS_TRILINOS
1353 // Serial_matrix_matrix_multiply_method = 4;
1354  Serial_matrix_matrix_multiply_method = 2;
1355 #else
1356  Serial_matrix_matrix_multiply_method = 2;
1357 #endif
1358 
1359  // matrix has been built
1360  Built = true;
1361 }
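// An illustrative sketch (editorial example; the helper name and values are
// hypothetical): assembling a small CRDoubleMatrix from compressed-row data.
// A communicator pointer is passed in so that a non-distributed
// LinearAlgebraDistribution over the two rows can be constructed. The matrix
// is A = [[1,2],[0,3]]:
//
//   void example_build_cr_matrix(const OomphCommunicator* comm_pt)
//   {
//    LinearAlgebraDistribution dist(comm_pt,2,false);
//    Vector<double> val(3);  val[0]=1.0; val[1]=2.0; val[2]=3.0;
//    Vector<int> c_index(3); c_index[0]=0; c_index[1]=1; c_index[2]=1;
//    Vector<int> r_start(3); r_start[0]=0; r_start[1]=2; r_start[2]=3;
//    CRDoubleMatrix a(&dist,2,val,c_index,r_start);
//   }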
1362 
1363 
1364 //=============================================================================
1365 /// Destructor
1366 //=============================================================================
1367 CRDoubleMatrix::~CRDoubleMatrix()
1368 {
1369  this->clear();
1370  delete Default_linear_solver_pt;
1371  Default_linear_solver_pt = 0;
1372 }
1373 
1374 //=============================================================================
1375 /// Rebuild the matrix - assembles an empty matrix with a defined distribution
1376 //=============================================================================
1377 void CRDoubleMatrix::build(const LinearAlgebraDistribution* distribution_pt)
1378 {
1379  this->clear();
1380  this->build_distribution(distribution_pt);
1381 }
1382 
1383 //=============================================================================
1384 /// \short Runs through the column index vector and checks if the entries
1385 /// are arranged arbitrarily or if they follow the regular lexicographical
1386 /// ordering of matrices. If a boolean argument is provided with the value
1387 /// TRUE then information on the first entry which is not in the correct
1388 /// position will also be given
1389 //=============================================================================
1390  bool CRDoubleMatrix::entries_are_sorted(const bool& doc_unordered_entries) const
1391  {
1392 #ifdef OOMPH_HAS_MPI
1393  // We only need to produce a warning if the matrix is distributed
1394  if (this->distributed())
1395  {
1396  // Create an ostringstream object to store the warning message
1397  std::ostringstream warning_message;
1398 
1399  // Create the warning message
1400  warning_message << "This method currently works for serial but "
1401  << "has not been implemented for use with MPI!\n";
1402 
1403  // Issue the warning
1404  OomphLibWarning(warning_message.str(),
1405  OOMPH_CURRENT_FUNCTION,
1406  OOMPH_EXCEPTION_LOCATION);
1407  }
1408 #endif
1409 
1410  // Get the number of rows in this matrix
1411  unsigned n_rows=this->nrow();
1412 
1413  // Acquire access to the value, row_start and column_index arrays from
1414  // the CR matrix. Since we do not change anything in row_start_pt we
1415  // give it the const prefix
1416  const int* column_index_pt=this->column_index();
1417  const int* row_start_pt=this->row_start();
1418 
1419  // Loop over the rows of matrix
1420  for (unsigned i=0;i<n_rows;i++)
1421  {
1422  // Calculate the number of nonzeros in the i-th row
1423  unsigned nnz_row_i=*(row_start_pt+i+1)-*(row_start_pt+i);
1424 
1425  // Get the index of the first entry in row i
1426  unsigned i_row_start=*(row_start_pt+i);
1427 
1428  // Loop over the entries of the i-th row
1429  for (unsigned j=0;j<nnz_row_i-1;j++)
1430  {
1431  // Check if the column index of the following entry is greater than the
1432  // current entry
1433  if ((*(column_index_pt+i_row_start+j+1))<
1434  (*(column_index_pt+i_row_start+j)))
1435  {
1436  // If the input argument was set to TRUE, we output information on
1437  // the first entry which is not in the correct position
1438  if (doc_unordered_entries)
1439  {
1440  // Tell the user
1441  oomph_info << "Matrix has not been correctly sorted!"
1442  << "\nOn row: " << i
1443  << "\nEntry: " << j
1444  << "\nEntry 1 = " << *(column_index_pt+i_row_start+j)
1445  << "\nEntry 2 = " << *(column_index_pt+i_row_start+j+1)
1446  << std::endl;
1447  }
1448 
1449  // It hasn't worked
1450  return false;
1451  } // if ((*(column_index_pt+i_row_start+j+1)) ...
1452  } // for (unsigned j=0;j<nnz_row_i-1;j++)
1453  } // for (unsigned i=0;i<n_rows;i++)
1454 
1455  // If it gets here without a warning then the entries in each row of
1456  // the matrix are ordered by increasing column index
1457  return true;
1458  } // End of entries_are_sorted()
1459 
1460 //=============================================================================
1461 /// \short This helper function sorts the entries in the column index vector
1462 /// and the value vector. During the construction of the matrix the entries
1463 /// were most likely assigned in an arbitrary order. As a result, it cannot
1464 /// be assumed that the entries in the column index vector corresponding to
1465 /// each row of the matrix have been arranged in increasing order. During
1466 /// the setup an additional vector will be set up; Index_of_diagonal_entries.
1467 /// The i-th entry of this vector contains the index of the last entry
1468 /// below or on the diagonal. If there are no entries below or on the
1469 /// diagonal then the corresponding entry is -1. If, however, there are
1470 /// no entries in the row then the entry is irrelevant and is kept
1471 /// as the initialised value; 0.
1472 //=============================================================================
1473  void CRDoubleMatrix::sort_entries()
1474  {
1475 #ifdef OOMPH_HAS_MPI
1476  // We only need to produce a warning if the matrix is distributed
1477  if (this->distributed())
1478  {
1479  // Create an ostringstream object to store the warning message
1480  std::ostringstream warning_message;
1481 
1482  // Create the warning message
1483  warning_message << "This method currently works for serial but "
1484  << "has not been tested with MPI!\n";
1485 
1486  // Issue the warning
1487  OomphLibWarning(warning_message.str(),
1488  OOMPH_CURRENT_FUNCTION,
1489  OOMPH_EXCEPTION_LOCATION);
1490  }
1491 #endif
1492 
1493  // Get the number of rows in the matrix
1494  unsigned n_rows=this->nrow();
1495 
1496  // Acquire access to the value, row_start and column_index arrays from
1497  // the CR matrix. Since we do not change anything in row_start_pt we
1498  // give it the const prefix
1499  double* value_pt=this->value();
1500  int* column_index_pt=this->column_index();
1501  const int* row_start_pt=this->row_start();
1502 
1503  // Resize the Index_of_diagonal_entries vector
1504  Index_of_diagonal_entries.resize(n_rows,0);
1505 
1506  // Vector of pairs to store the column_index of each value in the i-th row
1507  // and its corresponding matrix entry
1508  Vector<std::pair<int,double> > column_index_and_value_row_i;
1509 
1510  // Loop over the rows of the matrix
1511  for (unsigned i=0;i<n_rows;i++)
1512  {
1513  // Find the number of nonzeros in the i-th row
1514  unsigned nnz_row_i=*(row_start_pt+i+1)-*(row_start_pt+i);
1515 
1516  // Variable to store the start of the i-th row
1517  unsigned i_row_start=*(row_start_pt+i);
1518 
1519  // If there are no nonzeros in this row then the i-th entry of the vector
1520  // Index_of_diagonal_entries is irrelevant so we can simply let it be 0
1521  if (nnz_row_i==0)
1522  {
1523  // Set the i-th entry
1524  Index_of_diagonal_entries[i]=0;
1525  }
1526  // If there are nonzeros in the i-th row
1527  else
1528  {
1529  // If there is more than one entry in the row resize the vector
1530  // column_index_and_value_row_i
1531  column_index_and_value_row_i.resize(nnz_row_i);
1532 
1533  // Loop over the entries in the row
1534  for (unsigned j=0;j<nnz_row_i;j++)
1535  {
1536  // Assign the appropriate entries to column_index_and_value_row_i
1537  column_index_and_value_row_i[j]=
1538  std::make_pair(*(column_index_pt+i_row_start+j),
1539  *(value_pt+i_row_start+j));
1540  }
1541 
1542  // Sort the vector of pairs using the struct CRDoubleMatrixComparisonHelper
1543  std::sort(column_index_and_value_row_i.begin(),
1544  column_index_and_value_row_i.end(),
1545  Comparison_struct);
1546 
1547  //-----------------------------------------------------------------------
1548  // Now that the entries of the i-th row have been sorted we can read them
1549  // back into value_pt and column_index_pt:
1550  //-----------------------------------------------------------------------
1551 
1552  // Create a boolean variable to indicate whether or not the i-th entry of
1553  // Index_of_diagonal_entries has been set
1554  bool is_ith_entry_set=false;
1555 
1556  // Loop over the entries in the vector column_index_and_value_row_i and
1557  // assign its entries to value_pt and column_index_pt
1558  for (unsigned j=0;j<nnz_row_i;j++)
1559  {
1560  // Set the column index of the j-th nonzero value in the i-th row of
1561  // the matrix
1562  *(column_index_pt+i_row_start+j)=column_index_and_value_row_i[j].first;
1563 
1564  // Set the value of the j-th nonzero value in the i-th row of
1565  // the matrix
1566  *(value_pt+i_row_start+j)=column_index_and_value_row_i[j].second;
1567 
1568  // This if statement is used to set the i-th entry of the vector
1569  // Index_of_diagonal_entries if it has not yet been set
1570  if (!is_ith_entry_set)
1571  {
1572  // If the column index of the first entry in row i is greater than the
1573  // row number then the first entry must lie above the diagonal
1574  if (unsigned(*(column_index_pt+i_row_start))>i)
1575  {
1576  // If the column index of the first entry in the row is greater than
1577  // the row number, i, then the i-th entry of Index_of_diagonal_entries
1578  // needs to be set to -1 to indicate there are no entries below or on
1579  // the diagonal
1580  Index_of_diagonal_entries[i]=-1;
1581 
1582  // Indicate that the i-th entry of Index_of_diagonal_entries has
1583  // been set
1584  is_ith_entry_set=true;
1585  }
1586  // If there are entries below or on the diagonal
1587  else
1588  {
1589  // If there is only one entry in the row then we know that this will
1590  // be the last entry below or on the diagonal because we have
1591  // eliminated the possibility that if there is only one entry,
1592  // that it lies above the diagonal
1593  if (nnz_row_i==1)
1594  {
1595  // Set the index of the current entry to be the value of i-th entry
1596  // of Index_of_diagonal_entries
1597  Index_of_diagonal_entries[i]=i_row_start+j;
1598 
1599  // Indicate that the i-th entry of Index_of_diagonal_entries has
1600  // been set
1601  is_ith_entry_set=true;
1602  }
1603  // It remains to now check the case that there is more than one
1604  // entry in the row. If there is more than one entry in the row
1605  // and there are entries below or on the diagonal then we have
1606  // three cases:
1607  // (1) The current entry lies on the diagonal;
1608  // (2) The current entry lies above the diagonal;
1609  // (3) The current entry lies below the diagonal;
1610  // The first case can easily be checked as done below. If the second
1611  // case occurs then we have just passed the last entry. We know this
1612  // because at least one entry lies on or below the diagonal. If the
1613  // second case it true then we need to assign the previous entry to
1614  // the vector Index_of_diagonal_entries. Finally, we are left with
1615  // case (3), which can be split into two cases:
1616  // (3.1) The current entry lies below the diagonal but it
1617  // is not the last entry below or on the diagonal;
1618  // (3.2) The current entry lies below the diagonal and is
1619  // the last entry below or on the diagonal.
1620  // If case (3.1) holds then we can simply wait until we get to the
1621  // next entry in the row and examine that. If the next entry lies
1622  // on the diagonal then it will be swept up by case (1). If the
1623  // next entry lies above the diagonal then case (2) will sweep it up
1624  // and if neither is the case then we wait until the next entry and
1625  // so on. If, instead, case (3.2) holds then our last check simply
1626  // needs to check if the current entry is the last entry in the row
1627  // because if the last entry lies on the diagonal, case (1) will
1628  // sweep it up. If it lies above the diagonal, case (2) will take
1629  // care of it. Therefore, the only remaining case is that it lies
1630  // strictly below the diagonal and since it is the last entry in
1631  // the row it means the index of this entry needs to be assigned
1632  // to Index_of_diagonal_entries
1633 
1634  // Case (1) : The current entry lies on the diagonal
1635  else if (unsigned(*(column_index_pt+i_row_start+j))==i)
1636  {
1637  // Set the index of the current entry to be the value of i-th entry
1638  // of Index_of_diagonal_entries
1639  Index_of_diagonal_entries[i]=i_row_start+j;
1640 
1641  // Indicate that the i-th entry of Index_of_diagonal_entries has
1642  // been set
1643  is_ith_entry_set=true;
1644  }
1645  // Case (2) : The current entry lies above the diagonal
1646  else if (unsigned(*(column_index_pt+i_row_start+j))>i)
1647  {
1648  // Set the index of the current entry to be the value of i-th entry
1649  // of Index_of_diagonal_entries
1650  Index_of_diagonal_entries[i]=i_row_start+j-1;
1651 
1652  // Indicate that the i-th entry of Index_of_diagonal_entries has
1653  // been set
1654  is_ith_entry_set=true;
1655  }
1656  // Case (3.2) : The current entry is the last entry in the row
1657  else if (j==nnz_row_i-1)
1658  {
1659  // Set the index of the current entry to be the value of i-th entry
1660  // of Index_of_diagonal_entries
1661  Index_of_diagonal_entries[i]=i_row_start+j;
1662 
1663  // Indicate that the i-th entry of Index_of_diagonal_entries has
1664  // been set
1665  is_ith_entry_set=true;
1666  } // if (nnz_row_i==1) else if
1667  } // if (*(column_index_pt+i_row_start)>i)
1668  } // if (!is_ith_entry_set)
1669  } // for (unsigned j=0;j<nnz_row_i;j++)
1670  } // if (nnz_row_i==0) else
1671  } // for (unsigned i=0;i<n_rows;i++)
1672  } // End of sort_entries()
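// An illustrative sketch (editorial example; the helper name is
// hypothetical): entries_are_sorted() and sort_entries() are typically used
// together, e.g. before algorithms that require the column indices within
// each row to increase monotonically.
//
//   void ensure_sorted(CRDoubleMatrix& matrix)
//   {
//    if (!matrix.entries_are_sorted())
//     {
//      matrix.sort_entries();
//     }
//   }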
1673 
1674 //=============================================================================
1675 /// Clean method
1676 //=============================================================================
1677 void CRDoubleMatrix::clear()
1678 {
1679  this->clear_distribution();
1680  CR_matrix.clean_up_memory();
1681  Built = false;
1682 
1683  if(Linear_solver_pt != 0) // Only clean up if it exists
1684   Linear_solver_pt->clean_up_memory();
1685 }
1686 
1687 //=============================================================================
1688 /// \short build method: Takes the distribution and the number of columns, as
1689 /// well as the vector of values, vector of column indices, vector of row
1690 /// starts.
1691 //=============================================================================
1692 void CRDoubleMatrix::build(const LinearAlgebraDistribution* distribution_pt,
1693  const unsigned& ncol,
1694  const Vector<double>& value,
1695  const Vector<int>& column_index,
1696  const Vector<int>& row_start)
1697 {
1698  // clear
1699  this->clear();
1700 
1701  // store the Distribution
1702  this->build_distribution(distribution_pt);
1703 
1704  // set the linear solver
1706 
1707  // now build the matrix
1708  this->build(ncol,value,column_index,row_start);
1709 }
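 
// Example usage (illustrative sketch only, not part of the original source;
// "comm_pt" is assumed to be a pointer to a valid OomphCommunicator):
//
//  // Compressed-row data for the 2x2 matrix [[1,2],[0,3]]
//  Vector<double> value(3); value[0]=1.0; value[1]=2.0; value[2]=3.0;
//  Vector<int> column_index(3);
//  column_index[0]=0; column_index[1]=1; column_index[2]=1;
//  Vector<int> row_start(3); row_start[0]=0; row_start[1]=2; row_start[2]=3;
//
//  // Non-distributed distribution with two global rows
//  LinearAlgebraDistribution dist(comm_pt,2,false);
//
//  // Build the matrix (2 columns)
//  CRDoubleMatrix matrix;
//  matrix.build(&dist,2,value,column_index,row_start);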
1710 
1711 //=============================================================================
1712 /// \short method to rebuild the matrix, but not the distribution
1713 //=============================================================================
1714 void CRDoubleMatrix::build(const unsigned& ncol,
1715  const Vector<double>& value,
1716  const Vector<int>& column_index,
1717  const Vector<int>& row_start)
1718 {
1719  // call the underlying build method
1720  CR_matrix.clean_up_memory();
1721  CR_matrix.build(value,column_index,row_start,
1722  this->nrow_local(),ncol);
1723 
 1724  // matrix has been built
1725  Built = true;
1726 }
1727 
1728 //=============================================================================
1729 /// \short method to rebuild the matrix, but not the distribution
1730 //=============================================================================
 1731 void CRDoubleMatrix::build_without_copy(const unsigned& ncol,
 1732  const unsigned& nnz,
1733  double* value,
1734  int* column_index,
1735  int* row_start)
1736 {
1737  // call the underlying build method
1738  CR_matrix.clean_up_memory();
1739  CR_matrix.build_without_copy(value,column_index,row_start,nnz,
1740  this->nrow_local(),ncol);
1741 
 1742  // matrix has been built
1743  Built = true;
1744 }
1745 
1746 //=============================================================================
1747 /// Do LU decomposition
1748 //=============================================================================
 1749 void CRDoubleMatrix::ludecompose()
 1750 {
1751 #ifdef PARANOID
 1752  // check that this matrix is built
1753  if (!Built)
1754  {
1755  std::ostringstream error_message_stream;
1756  error_message_stream
1757  << "This matrix has not been built.";
1758  throw OomphLibError(error_message_stream.str(),
1759 OOMPH_CURRENT_FUNCTION,
1760  OOMPH_EXCEPTION_LOCATION);
1761  }
1762 #endif
1763 
 1764  // factorise using SuperLU (or SuperLU_DIST if oomph-lib was built with MPI)
1765  static_cast<SuperLUSolver*>(Default_linear_solver_pt)->factorise(this);
1766 
1767 }
1768 
1769 //=============================================================================
1770 /// Do back-substitution
1771 //=============================================================================
 1772 void CRDoubleMatrix::lubksub(DoubleVector &rhs)
 1773 {
1774 #ifdef PARANOID
1775  // check that the rhs vector is setup
1776  if (!rhs.built())
1777  {
1778  std::ostringstream error_message_stream;
1779  error_message_stream
1780  << "The vector rhs has not been setup";
1781  throw OomphLibError(error_message_stream.str(),
1782 OOMPH_CURRENT_FUNCTION,
1783  OOMPH_EXCEPTION_LOCATION);
1784  }
1785  // check that the rhs vector has the same distribution as this matrix
1786  if (!(*this->distribution_pt() == *rhs.distribution_pt()))
1787  {
1788  std::ostringstream error_message_stream;
1789  error_message_stream
1790  << "The vector rhs must have the same distribution as the matrix";
1791  throw OomphLibError(error_message_stream.str(),
1792 OOMPH_CURRENT_FUNCTION,
1793  OOMPH_EXCEPTION_LOCATION);
1794  }
1795 #endif
1796 
1797  // backsub
1798  DoubleVector rhs_copy(rhs);
1799  static_cast<SuperLUSolver*>(Default_linear_solver_pt)->backsub(rhs_copy,rhs);
1800 }
1801 
1802 //=============================================================================
1803 /// Multiply the matrix by the vector x
1804 //=============================================================================
 1805 void CRDoubleMatrix::multiply(const DoubleVector &x, DoubleVector &soln) const
 1806 {
1807 #ifdef PARANOID
1808  // check that this matrix is built
1809  if (!Built)
1810  {
1811  std::ostringstream error_message_stream;
1812  error_message_stream
1813  << "This matrix has not been built";
1814  throw OomphLibError(error_message_stream.str(),
1815 OOMPH_CURRENT_FUNCTION,
1816  OOMPH_EXCEPTION_LOCATION);
1817  }
1818  // check that the distribution of x is setup
1819  if (!x.built())
1820  {
1821  std::ostringstream error_message_stream;
1822  error_message_stream
1823  << "The distribution of the vector x must be setup";
1824  throw OomphLibError(error_message_stream.str(),
1825 OOMPH_CURRENT_FUNCTION,
1826  OOMPH_EXCEPTION_LOCATION);
1827  }
1828  // Check to see if x.size() = ncol().
1829  if (this->ncol() != x.distribution_pt()->nrow())
1830  {
1831  std::ostringstream error_message_stream;
1832  error_message_stream
1833  << "The number of rows in the x vector and the number of columns in the "
1834  << "matrix must be the same";
1835  throw OomphLibError(error_message_stream.str(),
1836 OOMPH_CURRENT_FUNCTION,
1837  OOMPH_EXCEPTION_LOCATION);
1838  }
1839  // if the soln is distributed
1840  if (soln.built())
1841  {
1842  if (!(*soln.distribution_pt() == *this->distribution_pt()))
1843  {
1844  std::ostringstream error_message_stream;
1845  error_message_stream
1846  << "The soln vector is setup and therefore must have the same "
1847  << "distribution as the matrix";
1848  throw OomphLibError(error_message_stream.str(),
1849 OOMPH_CURRENT_FUNCTION,
1850  OOMPH_EXCEPTION_LOCATION);
1851  }
1852  }
1853 #endif
1854 
1855  // if soln is not setup then setup the distribution
1856  if (!soln.built())
1857  {
1858  // Resize and initialize the solution vector
1859  soln.build(this->distribution_pt(),0.0);
1860  }
1861 
1862  // Initialise
1863  soln.initialise(0.0);
1864 
1865  // if distributed and on more than one processor use trilinos
1866  // otherwise use the oomph-lib methods
1867  if (this->distributed() &&
1868  this->distribution_pt()->communicator_pt()->nproc() > 1)
1869  {
1870 #ifdef OOMPH_HAS_TRILINOS
1871  // This will only work if we have trilinos on board
1872  TrilinosEpetraHelpers::multiply(this,x,soln);
1873 #else
1874  std::ostringstream error_message_stream;
1875  error_message_stream
1876  << "Matrix-vector product on multiple processors with distributed "
1877  << "CRDoubleMatrix requires Trilinos.";
1878  throw OomphLibError(error_message_stream.str(),
1879 OOMPH_CURRENT_FUNCTION,
1880  OOMPH_EXCEPTION_LOCATION);
1881 #endif
1882  }
1883  else
1884  {
1885  unsigned n = this->nrow();
1886  const int* row_start = CR_matrix.row_start();
1887  const int* column_index = CR_matrix.column_index();
1888  const double* value = CR_matrix.value();
1889  double* soln_pt = soln.values_pt();
1890  const double* x_pt = x.values_pt();
1891  for (unsigned long i=0;i<n;i++)
1892  {
1893  soln_pt[i] = 0.0;
1894  for (long k=row_start[i];k<row_start[i+1];k++)
1895  {
1896  unsigned long j=column_index[k];
1897  double a_ij=value[k];
1898  soln_pt[i]+=a_ij*x_pt[j];
1899  }
1900  }
1901  }
1902 }
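 
// Example usage (illustrative sketch, following on from the 2x2 matrix
// [[1,2],[0,3]] built in the example above): compute soln = A x for
// x = [1,1]^T.
//
//  DoubleVector x(matrix.distribution_pt(),1.0);
//  DoubleVector soln;
//  matrix.multiply(x,soln);   // soln = [3,3]^T for A = [[1,2],[0,3]]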
1903 
1904 //=================================================================
1905 /// Multiply the transposed matrix by the vector x: soln=A^T x
1906 //=================================================================
 1907 void CRDoubleMatrix::multiply_transpose(const DoubleVector &x,
 1908  DoubleVector &soln) const
1909 {
1910 #ifdef PARANOID
1911  // check that this matrix is built
1912  if (!Built)
1913  {
1914  std::ostringstream error_message_stream;
1915  error_message_stream
1916  << "This matrix has not been built";
1917  throw OomphLibError(error_message_stream.str(),
1918 OOMPH_CURRENT_FUNCTION,
1919  OOMPH_EXCEPTION_LOCATION);
1920  }
 1921  // check that x has the same distribution as this matrix
1922  if (!(*this->distribution_pt() == *x.distribution_pt()))
1923  {
1924  std::ostringstream error_message_stream;
1925  error_message_stream
1926  << "The x vector and this matrix must have the same distribution.";
1927  throw OomphLibError(error_message_stream.str(),
1928 OOMPH_CURRENT_FUNCTION,
1929  OOMPH_EXCEPTION_LOCATION);
1930  }
 1931  // if soln is setup then it must have as many rows as this matrix has columns
1932  if (soln.built())
1933  {
1934  if (soln.distribution_pt()->nrow() != this->ncol())
1935  {
1936  std::ostringstream error_message_stream;
1937  error_message_stream
1938  << "The soln vector is setup and therefore must have the same "
1939  << "number of rows as the vector x";
1940  throw OomphLibError(error_message_stream.str(),
1941 OOMPH_CURRENT_FUNCTION,
1942  OOMPH_EXCEPTION_LOCATION);
1943  }
1944  }
1945 #endif
1946 
1947  // if soln is not setup then setup the distribution
1948  if (!soln.built())
1949  {
1950  LinearAlgebraDistribution* dist_pt =
 1951  new LinearAlgebraDistribution(this->distribution_pt()->communicator_pt(),
 1952  this->ncol(),this->distributed());
1953  soln.build(dist_pt,0.0);
1954  delete dist_pt;
1955  }
1956 
1957  // Initialise
1958  soln.initialise(0.0);
1959 
1960  if (this->distributed() &&
1961  this->distribution_pt()->communicator_pt()->nproc() > 1)
1962  {
1963 #ifdef OOMPH_HAS_TRILINOS
1964  // This will only work if we have trilinos on board
1965  TrilinosEpetraHelpers::multiply(this,x,soln);
1966 #else
1967  std::ostringstream error_message_stream;
1968  error_message_stream
1969  << "Matrix-vector product on multiple processors with distributed "
1970  << "CRDoubleMatrix requires Trilinos.";
1971  throw OomphLibError(error_message_stream.str(),
1972 OOMPH_CURRENT_FUNCTION,
1973  OOMPH_EXCEPTION_LOCATION);
1974 #endif
1975  }
1976  else
1977  {
1978  unsigned n = this->nrow();
1979  const int* row_start = CR_matrix.row_start();
1980  const int* column_index = CR_matrix.column_index();
1981  const double* value = CR_matrix.value();
1982  double* soln_pt = soln.values_pt();
1983  const double* x_pt = x.values_pt();
1984  // Matrix vector product
1985  for (unsigned long i=0;i<n;i++)
1986  {
1987  for (long k=row_start[i];k<row_start[i+1];k++)
1988  {
1989  unsigned long j=column_index[k];
1990  double a_ij=value[k];
1991  soln_pt[j]+=a_ij*x_pt[i];
1992  }
1993  }
1994  }
1995 }
1996 
1997 //===========================================================================
1998 /// Function to multiply this matrix by the CRDoubleMatrix matrix_in.
 1999 /// For a serial matrix, there are five methods available:
2000 /// Method 1: First runs through this matrix and matrix_in to find the storage
2001 /// requirements for result - arrays of the correct size are
2002 /// then allocated before performing the calculation.
2003 /// Minimises memory requirements but more costly.
2004 /// Method 2: Grows storage for values and column indices of result 'on the
2005 /// fly' using an array of maps. Faster but more memory
2006 /// intensive.
2007 /// Method 3: Grows storage for values and column indices of result 'on the
2008 /// fly' using a vector of vectors. Not particularly impressive
2009 /// on the platforms we tried...
2010 /// Method 4: Trilinos Epetra Matrix Matrix multiply.
 2011 /// Method 5: Trilinos Epetra Matrix Matrix Multiply (ML-based)
2012 /// If Trilinos is installed then Method 4 is employed by default, otherwise
2013 /// Method 2 is employed by default.
 2014 /// For a distributed matrix, only the Trilinos Epetra Matrix Matrix multiply
2015 /// is available.
2016 //=============================================================================
 2017 void CRDoubleMatrix::multiply(const CRDoubleMatrix& matrix_in,
 2018  CRDoubleMatrix& result) const
2019 {
2020 #ifdef PARANOID
2021  // check that this matrix is built
2022  if (!Built)
2023  {
2024  std::ostringstream error_message_stream;
2025  error_message_stream
2026  << "This matrix has not been built";
2027  throw OomphLibError(error_message_stream.str(),
2028  OOMPH_CURRENT_FUNCTION,
2029  OOMPH_EXCEPTION_LOCATION);
2030  }
2031  // check that this matrix is built
2032  if (!matrix_in.built())
2033  {
2034  std::ostringstream error_message_stream;
2035  error_message_stream
2036  << "This matrix matrix_in has not been built";
2037  throw OomphLibError(error_message_stream.str(),
2038  OOMPH_CURRENT_FUNCTION,
2039  OOMPH_EXCEPTION_LOCATION);
2040  }
 2041  // if result is setup then it must have the same distribution as this matrix
2042  if (result.built())
2043  {
2044  if (!(*result.distribution_pt() == *this->distribution_pt()))
2045  {
2046  std::ostringstream error_message_stream;
2047  error_message_stream
2048  << "The matrix result is setup and therefore must have the same "
2049  << "distribution as the vector x";
2050  throw OomphLibError(error_message_stream.str(),
2051  OOMPH_CURRENT_FUNCTION,
2052  OOMPH_EXCEPTION_LOCATION);
2053  }
2054  }
2055 #endif
2056 
2057  // if the result has not been setup, then store the distribution
2058  if (!result.distribution_built())
2059  {
2060  result.build(this->distribution_pt());
2061  }
2062 
2063  // short name for Serial_matrix_matrix_multiply_method
2064  unsigned method = Serial_matrix_matrix_multiply_method;
2065 
2066  // if this matrix is not distributed and matrix in is not distributed
2067  if (!this->distributed() && !matrix_in.distributed() &&
2068  ((method == 1) || (method == 2) || (method == 3)))
2069  {
2070  // NB N is number of rows!
2071  unsigned long N = this->nrow();
2072  unsigned long M = matrix_in.ncol();
2073  unsigned long Nnz = 0;
2074 
2075  // pointers to arrays which store result
2076  int* Row_start = 0;
2077  double* Value = 0;
2078  int* Column_index = 0;
2079 
2080  // get pointers to matrix_in
2081  const int* matrix_in_row_start = matrix_in.row_start();
2082  const int* matrix_in_column_index = matrix_in.column_index();
2083  const double* matrix_in_value = matrix_in.value();
2084 
2085  // get pointers to this matrix
2086  const double* this_value = this->value();
2087  const int* this_row_start = this->row_start();
2088  const int* this_column_index = this->column_index();
2089 
2090  //clock_t clock1 = clock();
2091 
2092  // METHOD 1
2093  // --------
2094  if (method==1)
2095  {
2096  // allocate storage for row starts
2097  Row_start = new int[N+1];
2098  Row_start[0]=0;
2099 
 2100  // a set to store the column indices of the non-zeros in each row of result
2101  std::set<unsigned> columns;
2102 
2103  // run through rows of this matrix and matrix_in to find number of
2104  // non-zero entries in each row of result
2105  for (unsigned long this_row = 0; this_row<N; this_row++)
2106  {
2107  // run through non-zeros in this_row of this matrix
2108  for (int this_ptr = this_row_start[this_row];
2109  this_ptr < this_row_start[this_row+1];
2110  this_ptr++)
2111  {
2112  // find column index for non-zero
2113  int matrix_in_row = this_column_index[this_ptr];
2114 
2115  // run through corresponding row in matrix_in
2116  for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2117  matrix_in_ptr < matrix_in_row_start[matrix_in_row+1];
2118  matrix_in_ptr++)
2119  {
2120  // find column index for non-zero in matrix_in and store in columns
2121  columns.insert(matrix_in_column_index[matrix_in_ptr]);
2122  }
2123  }
2124  // update Row_start
2125  Row_start[this_row+1] = Row_start[this_row] + columns.size();
2126 
2127  // wipe values in columns
2128  columns.clear();
2129  }
2130 
2131  // set Nnz
2132  Nnz = Row_start[N];
2133 
2134  // allocate arrays for result
2135  Value = new double[Nnz];
2136  Column_index = new int[Nnz];
2137 
2138  // set all values of Column_index to -1
2139  for (unsigned long i=0; i<Nnz; i++)
2140  {
2141  Column_index[i] = -1;
2142  }
2143 
2144  // Calculate values for result - first run through rows of this matrix
2145  for (unsigned long this_row = 0; this_row<N; this_row++)
2146  {
2147  // run through non-zeros in this_row
2148  for (int this_ptr = this_row_start[this_row];
2149  this_ptr < this_row_start[this_row+1];
2150  this_ptr++)
2151  {
2152  // find value of non-zero
2153  double this_val = this_value[this_ptr];
2154 
2155  // find column associated with non-zero
2156  int matrix_in_row = this_column_index[this_ptr];
2157 
2158  // run through corresponding row in matrix_in
2159  for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2160  matrix_in_ptr < matrix_in_row_start[matrix_in_row+1];
2161  matrix_in_ptr++)
2162  {
2163  // find column index for non-zero in matrix_in
2164  int col = matrix_in_column_index[matrix_in_ptr];
2165 
2166  // find position in result to insert value
2167  for(int ptr = Row_start[this_row];
2168  ptr <= Row_start[this_row+1];
2169  ptr++)
2170  {
2171  if (ptr == Row_start[this_row+1])
2172  {
2173  // error - have passed end of row without finding
2174  // correct column
2175  std::ostringstream error_message;
2176  error_message << "Error inserting value in result";
2177 
2178  throw OomphLibError(error_message.str(),
2179  OOMPH_CURRENT_FUNCTION,
2180  OOMPH_EXCEPTION_LOCATION);
2181  }
2182  else if ( Column_index[ptr] == -1 )
2183  {
2184  // first entry for this column index
2185  Column_index[ptr] = col;
2186  Value[ptr] = this_val * matrix_in_value[matrix_in_ptr];
2187  break;
2188  }
2189  else if ( Column_index[ptr] == col )
2190  {
2191  // column index already exists - add value
2192  Value[ptr] += this_val * matrix_in_value[matrix_in_ptr];
2193  break;
2194  }
2195  }
2196  }
2197  }
2198  }
2199  }
2200 
2201  // METHOD 2
2202  // --------
2203  else if (method==2)
2204  {
2205  // generate array of maps to store values for result
2206  std::map<int,double>* result_maps = new std::map<int,double>[N];
2207 
2208  // run through rows of this matrix
2209  for (unsigned long this_row = 0; this_row<N; this_row++)
2210  {
2211  // run through non-zeros in this_row
2212  for (int this_ptr = this_row_start[this_row];
2213  this_ptr < this_row_start[this_row+1];
2214  this_ptr++)
2215  {
2216  // find value of non-zero
2217  double this_val = this_value[this_ptr];
2218 
2219  // find column index associated with non-zero
2220  int matrix_in_row = this_column_index[this_ptr];
2221 
2222  // run through corresponding row in matrix_in
2223  for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2224  matrix_in_ptr < matrix_in_row_start[matrix_in_row+1];
2225  matrix_in_ptr++)
2226  {
2227  // find column index for non-zero in matrix_in
2228  int col = matrix_in_column_index[matrix_in_ptr];
2229 
2230  // insert value
2231  result_maps[this_row][col] += this_val * matrix_in_value[matrix_in_ptr];
2232  }
2233  }
2234  }
2235 
2236  // allocate Row_start
2237  Row_start = new int[N+1];
2238 
2239  // copy across row starts
2240  Row_start[0] = 0;
2241  for (unsigned long row=0; row<N; row++)
2242  {
2243  int size = result_maps[row].size();
2244  Row_start[row+1] = Row_start[row] + size;
2245  }
2246 
2247  // set Nnz
2248  Nnz = Row_start[N];
2249 
2250  // allocate other arrays
2251  Value = new double[Nnz];
2252  Column_index = new int[Nnz];
2253 
2254  // copy values and column indices
2255  for (unsigned long row=0; row<N; row++)
2256  {
2257  unsigned ptr = Row_start[row];
2258  for (std::map<int,double>::iterator i = result_maps[row].begin();
2259  i != result_maps[row].end();
2260  i ++)
2261  {
2262  Column_index[ptr]= i->first;
2263  Value[ptr] = i->second;
2264  ptr++;
2265  }
2266  }
2267 
2268  // tidy up memory
2269  delete[] result_maps;
2270  }
2271 
2272  // METHOD 3
2273  // --------
2274  else if (method==3)
2275  {
2276  // vectors of vectors to store results
2277  std::vector< std::vector<int> > result_cols(N);
2278  std::vector< std::vector<double> > result_vals(N);
2279 
2280  // run through the rows of this matrix
2281  for (unsigned long this_row = 0; this_row<N; this_row++)
2282  {
2283  // run through non-zeros in this_row
2284  for (int this_ptr = this_row_start[this_row];
2285  this_ptr < this_row_start[this_row+1];
2286  this_ptr++)
2287  {
2288  // find value of non-zero
2289  double this_val = this_value[this_ptr];
2290 
2291  // find column index associated with non-zero
2292  int matrix_in_row = this_column_index[this_ptr];
2293 
2294  // run through corresponding row in matrix_in
2295  for (int matrix_in_ptr = matrix_in_row_start[matrix_in_row];
2296  matrix_in_ptr < matrix_in_row_start[matrix_in_row+1];
2297  matrix_in_ptr++)
2298  {
2299  // find column index for non-zero in matrix_in
2300  int col = matrix_in_column_index[matrix_in_ptr];
2301 
2302  // insert value
2303  int size = result_cols[this_row].size();
2304  for (int i = 0; i<=size; i++)
2305  {
2306  if (i==size)
2307  {
2308  // first entry for this column
2309  result_cols[this_row].push_back(col);
2310  result_vals[this_row].push_back(
2311  this_val*matrix_in_value[matrix_in_ptr]);
2312  }
2313  else if (col==result_cols[this_row][i])
2314  {
2315  // column already exists
2316  result_vals[this_row][i] += this_val *
2317  matrix_in_value[matrix_in_ptr];
2318  break;
2319  }
2320  }
2321  }
2322  }
2323  }
2324 
2325  // allocate Row_start
2326  Row_start = new int[N+1];
2327 
2328  // copy across row starts
2329  Row_start[0] = 0;
2330  for (unsigned long row=0; row<N; row++)
2331  {
2332  int size = result_cols[row].size();
2333  Row_start[row+1] = Row_start[row] + size;
2334  }
2335 
2336  // set Nnz
2337  Nnz = Row_start[N];
2338 
2339  // allocate other arrays
2340  Value = new double[Nnz];
2341  Column_index = new int[Nnz];
2342 
2343  // copy across values and column indices
2344  for (unsigned long row=0; row<N; row++)
2345  {
2346  unsigned ptr = Row_start[row];
2347  unsigned nnn=result_cols[row].size();
2348  for (unsigned i = 0; i < nnn; i++)
2349  {
2350  Column_index[ptr] = result_cols[row][i];
2351  Value[ptr] = result_vals[row][i];
2352  ptr++;
2353  }
2354  }
2355  }
2356 
2357  // build
2358  result.build_without_copy(M, Nnz, Value, Column_index, Row_start);
2359  }
2360 
2361  // else we have to use trilinos
2362  else
2363  {
2364 #ifdef OOMPH_HAS_TRILINOS
2365  bool use_ml = false;
2366  if (method == 5)
2367  {
2368  use_ml = true;
2369  }
2370  TrilinosEpetraHelpers::multiply(*this,matrix_in,result,use_ml);
2371 #else
2372  std::ostringstream error_message;
2373  error_message << "Serial_matrix_matrix_multiply_method = "
2374  << Serial_matrix_matrix_multiply_method
2375  << " requires trilinos.";
2376  throw OomphLibError(error_message.str(),
2377  OOMPH_CURRENT_FUNCTION,
2378  OOMPH_EXCEPTION_LOCATION);
2379 #endif
2380  }
2381 }
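 
// Example usage (illustrative sketch): form the product C = A*B for two
// compatible serial CRDoubleMatrix objects A and B. The storage scheme
// used for the serial product is selected via the
// Serial_matrix_matrix_multiply_method flag documented above.
//
//  CRDoubleMatrix C;
//  A.multiply(B,C);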
2382 
2383 
2384 
2385 //=================================================================
2386 /// For every row, find the maximum absolute value of the
2387 /// entries in this row. Set all values that are less than alpha times
2388 /// this maximum to zero and return the resulting matrix in
2389 /// reduced_matrix. Note: Diagonal entries are retained regardless
2390 /// of their size.
2391 //=================================================================
2392 void CRDoubleMatrix::matrix_reduction(const double &alpha,
2393  CRDoubleMatrix &reduced_matrix)
2394 {
2395  // number of rows in matrix
2396  long n_row=nrow_local();
2397  double max_row;
2398 
2399  // Here's the packed format for the new matrix
2400  Vector<int> B_row_start(1);
2401  Vector<int> B_column_index;
2402  Vector<double> B_value;
2403 
2404  // get pointers to the underlying data
2405  const int* row_start = CR_matrix.row_start();
2406  const int* column_index = CR_matrix.column_index();
2407  const double* value = CR_matrix.value();
2408 
2409  // k is counter for the number of entries in the reduced matrix
2410  unsigned k=0;
2411 
2412  // Initialise row start
2413  B_row_start[0]=0;
2414 
2415  // Loop over rows
2416  for(long i=0;i<n_row;i++)
2417  {
2418  // Initialise max value in row
2419  max_row=0.0;
2420 
2421  //Loop over entries in columns
2422  for(long j=row_start[i];j<row_start[i+1];j++)
2423  {
2424  // Find max. value in row
2425  if(std::fabs(value[j])>max_row)
2426  {
2427  max_row=std::fabs(value[j]);
2428  }
2429  }
2430 
2431  // Decide if we need to retain the entries in the row
2432  for(long j=row_start[i];j<row_start[i+1];j++)
2433  {
2434  // If we're on the diagonal or the value is sufficiently large: retain
2435  // i.e. copy across.
2436  if(i==column_index[j] || std::fabs(value[j])>alpha*max_row )
2437  {
2438  B_value.push_back(value[j]);
2439  B_column_index.push_back(column_index[j]);
2440  k++;
2441  }
2442  }
 2443  // This writes the row start for the next row -- equal to
 2444  // the number of entries written so far (C++ zero-based indexing!)
2445  B_row_start.push_back(k);
2446  }
2447 
2448  // Build the matrix from the compressed format
2449  dynamic_cast<CRDoubleMatrix&>(reduced_matrix).
2450  build(this->ncol(),B_value,B_column_index,B_row_start);
2451  }
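 
// Example usage (illustrative sketch): drop every off-diagonal entry whose
// magnitude is below 10% of the largest absolute value in its row.
//
//  CRDoubleMatrix reduced(matrix.distribution_pt());
//  matrix.matrix_reduction(0.1,reduced);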
2452 
2453 //=============================================================================
2454 /// if this matrix is distributed then the equivalent global matrix is built
2455 /// using new and returned. The calling method is responsible for the
2456 /// destruction of the new matrix.
2457 //=============================================================================
 2458 CRDoubleMatrix* CRDoubleMatrix::global_matrix() const
 2459 {
2460 #ifdef OOMPH_HAS_MPI
2461  // if this matrix is not distributed then this method is redundant
2462  if (!this->distributed() ||
2463  this->distribution_pt()->communicator_pt()->nproc() == 1)
2464  {
2465  return new CRDoubleMatrix(*this);
2466  }
2467 
2468  // nnz
2469  int nnz = this->nnz();
2470 
2471  // my nrow local
2472  unsigned nrow_local = this->nrow_local();
2473 
2474  // nrow global
2475  unsigned nrow = this->nrow();
2476 
2477  // cache nproc
2478  int nproc = this->distribution_pt()->communicator_pt()->nproc();
2479 
2480  // get the nnzs on the other processors
2481  int* dist_nnz_pt = new int[nproc];
2482  MPI_Allgather(&nnz,1,MPI_INT,
2483  dist_nnz_pt,1,MPI_INT,
2484  this->distribution_pt()->communicator_pt()->mpi_comm());
2485 
2486  // create a int vector of first rows and nrow local and compute nnz global
2487  int* dist_first_row = new int[nproc];
2488  int* dist_nrow_local = new int[nproc];
2489  int nnz_global = 0;
2490  for (int p = 0; p < nproc; p++)
2491  {
2492  nnz_global += dist_nnz_pt[p];
2493  dist_first_row[p] = this->first_row(p);
2494  dist_nrow_local[p] = this->nrow_local(p);
2495  }
2496 
2497  // compute the offset for the values and column index data
2498  int* nnz_offset = new int[nproc];
2499  nnz_offset[0] = 0;
2500  for (int p = 1; p < nproc; p++)
2501  {
2502  nnz_offset[p] = nnz_offset[p-1] + dist_nnz_pt[p-1];
2503  }
2504 
2505  // get pointers to the (current) distributed data
2506  // const_cast required because MPI requires non-const data when sending
2507  // data
2508  int* dist_row_start = const_cast<int*>(this->row_start());
2509  int* dist_column_index = const_cast<int*>(this->column_index());
2510  double* dist_value = const_cast<double*>(this->value());
2511 
2512  // space for the global matrix
2513  int* global_row_start = new int[nrow+1];
2514  int* global_column_index = new int[nnz_global];
2515  double* global_value = new double[nnz_global];
2516 
2517  // get the row starts
2518  MPI_Allgatherv(dist_row_start,nrow_local,MPI_INT,
2519  global_row_start,dist_nrow_local,dist_first_row,MPI_INT,
2520  this->distribution_pt()->communicator_pt()->mpi_comm());
2521 
2522  // get the column indexes
2523  MPI_Allgatherv(dist_column_index,nnz,MPI_INT,
2524  global_column_index,dist_nnz_pt,nnz_offset,MPI_INT,
2525  this->distribution_pt()->communicator_pt()->mpi_comm());
2526 
2527  // get the values
2528  MPI_Allgatherv(dist_value,nnz,MPI_DOUBLE,
2529  global_value,dist_nnz_pt,nnz_offset,MPI_DOUBLE,
2530  this->distribution_pt()->communicator_pt()->mpi_comm());
2531 
2532  // finally the last row start
2533  global_row_start[nrow] = nnz_global;
2534 
2535  // update the other row start
2536  for (int p = 0; p < nproc; p++)
2537  {
2538  for (int i = 0; i < dist_nrow_local[p]; i++)
2539  {
2540  unsigned j = dist_first_row[p] + i;
2541  global_row_start[j]+=nnz_offset[p];
2542  }
2543  }
2544 
2545  // create the global distribution
2546  LinearAlgebraDistribution* dist_pt = new
2547  LinearAlgebraDistribution(this->distribution_pt()->communicator_pt(),nrow,false);
2548 
2549  // create the matrix
2550  CRDoubleMatrix* matrix_pt = new CRDoubleMatrix(dist_pt);
2551 
2552  // copy of distribution taken so delete
2553  delete dist_pt;
2554 
2555  // pass data into matrix
2556  matrix_pt->build_without_copy(this->ncol(),nnz_global,global_value,
2557  global_column_index,global_row_start);
2558 
2559  // clean up
 2560  delete[] dist_first_row;
 2561  delete[] dist_nrow_local;
 2562  delete[] nnz_offset;
 2563  delete[] dist_nnz_pt;
2564 
2565  // and return
2566  return matrix_pt;
2567 #else
2568  return new CRDoubleMatrix(*this);
2569 #endif
2570 }
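 
// Example usage (illustrative sketch): gather a distributed matrix into a
// non-distributed global copy on every processor; the caller owns the
// returned matrix and must delete it.
//
//  CRDoubleMatrix* global_matrix_pt = distributed_matrix.global_matrix();
//  // ... use global_matrix_pt ...
//  delete global_matrix_pt;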
2571 
2572  //============================================================================
2573  /// The contents of the matrix are redistributed to match the new
2574  /// distribution. In a non-MPI build this method does nothing.
2575  /// \b NOTE 1: The current distribution and the new distribution must have
2576  /// the same number of global rows.
2577  /// \b NOTE 2: The current distribution and the new distribution must have
2578  /// the same Communicator.
2579  //============================================================================
 2580  void CRDoubleMatrix::redistribute(const LinearAlgebraDistribution*
 2581  const& dist_pt)
2582  {
2583 #ifdef OOMPH_HAS_MPI
2584 #ifdef PARANOID
 2585  // paranoid check that the number of global rows is the same for
 2586  // both distributions
2587  if (dist_pt->nrow() != this->distribution_pt()->nrow())
2588  {
2589  std::ostringstream error_message;
2590  error_message << "The number of global rows in the new distribution ("
2591  << dist_pt->nrow() << ") is not equal to the number"
2592  << " of global rows in the current distribution ("
2593  << this->distribution_pt()->nrow() << ").\n";
2594  throw OomphLibError(error_message.str(),
2595  OOMPH_CURRENT_FUNCTION,
2596  OOMPH_EXCEPTION_LOCATION);
2597  }
2598  // paranoid check that the current distribution and the new distribution
2599  // have the same Communicator
2600  OomphCommunicator temp_comm(*dist_pt->communicator_pt());
2601  if (!(temp_comm == *this->distribution_pt()->communicator_pt()))
2602  {
2603  std::ostringstream error_message;
2604  error_message << "The new distribution and the current distribution must "
2605  << "have the same communicator.";
2606  throw OomphLibError(error_message.str(),
2607  OOMPH_CURRENT_FUNCTION,
2608  OOMPH_EXCEPTION_LOCATION);
2609  }
 2610  // paranoid check that the matrix is built
2611  if (!this->built())
2612  {
2613  std::ostringstream error_message;
 2614  error_message << "The matrix must be built to be redistributed";
2615  throw OomphLibError(error_message.str(),
2616  OOMPH_CURRENT_FUNCTION,
2617  OOMPH_EXCEPTION_LOCATION);
2618  }
2619 #endif
2620 
2621  // if the two distributions are not the same
2622  // =========================================
2623  if (!((*this->distribution_pt()) == *dist_pt))
2624  {
2625  // Get the number of columns to build the matrix.
2626  unsigned long ncol = this->ncol();
2627 
2628  // current data
2629  int* current_row_start = this->row_start();
2630  int* current_column_index = this->column_index();
2631  double* current_value = this->value();
2632 
2633  // get the rank and the number of processors
2634  int my_rank = this->distribution_pt()->communicator_pt()->my_rank();
2635  int nproc = this->distribution_pt()->communicator_pt()->nproc();
2636 
2637  // if both distributions are distributed
2638  // =====================================
2639  if (this->distributed() && dist_pt->distributed())
2640  {
2641 
2642  // new nrow_local and first_row data
2643  Vector<unsigned> new_first_row(nproc);
2644  Vector<unsigned> new_nrow_local(nproc);
2645  Vector<unsigned> current_first_row(nproc);
2646  Vector<unsigned> current_nrow_local(nproc);
2647  for (int i = 0; i < nproc; i++)
2648  {
2649  new_first_row[i] = dist_pt->first_row(i);
2650  new_nrow_local[i] = dist_pt->nrow_local(i);
2651  current_first_row[i] = this->first_row(i);
2652  current_nrow_local[i] = this->nrow_local(i);
2653  }
2654 
2655  // compute which local rows are expected to be received from each
2656  // processor / sent to each processor
2657  Vector<unsigned> first_row_for_proc(nproc,0);
2658  Vector<unsigned> nrow_local_for_proc(nproc,0);
2659  Vector<unsigned> first_row_from_proc(nproc,0);
2660  Vector<unsigned> nrow_local_from_proc(nproc,0);
2661 
 2662  // for every processor compute the first_row and nrow_local that will
 2663  // be sent and received by this processor
2664  for (int p = 0; p < nproc; p++)
2665  {
2666  // start with data to be sent
2667  if ((new_first_row[p] < (current_first_row[my_rank] +
2668  current_nrow_local[my_rank])) &&
2669  (current_first_row[my_rank] < (new_first_row[p] +
2670  new_nrow_local[p])))
2671  {
2672  first_row_for_proc[p] =
2673  std::max(current_first_row[my_rank],
2674  new_first_row[p]);
2675  nrow_local_for_proc[p] =
2676  std::min((current_first_row[my_rank] +
2677  current_nrow_local[my_rank]),
2678  (new_first_row[p] +
2679  new_nrow_local[p])) - first_row_for_proc[p];
2680  }
2681 
2682  // and data to be received
2683  if ((new_first_row[my_rank] < (current_first_row[p] +
2684  current_nrow_local[p]))
2685  && (current_first_row[p] < (new_first_row[my_rank] +
2686  new_nrow_local[my_rank])))
2687  {
2688  first_row_from_proc[p] =
2689  std::max(current_first_row[p],
2690  new_first_row[my_rank]);
2691  nrow_local_from_proc[p] =
2692  std::min((current_first_row[p] +
2693  current_nrow_local[p]),
2694  (new_first_row[my_rank] +
2695  new_nrow_local[my_rank]))-first_row_from_proc[p];
2696  }
2697  }
2698 
2699  // determine how many nnzs to send to each processor
2700  Vector<unsigned> nnz_for_proc(nproc,0);
2701  for (int p = 0; p < nproc; p++)
2702  {
2703  if (nrow_local_for_proc[p] > 0)
2704  {
2705  nnz_for_proc[p] = (current_row_start[first_row_for_proc[p]-
2706  current_first_row[my_rank]+
2707  nrow_local_for_proc[p]]-
2708  current_row_start[first_row_for_proc[p]-
2709  current_first_row[my_rank]]);
2710  }
2711  }
2712 
2713  // next post non-blocking sends and recv for the nnzs
2714  Vector<unsigned> nnz_from_proc(nproc,0);
2715  Vector<MPI_Request> send_req;
2716  Vector<MPI_Request> nnz_recv_req;
2717  for (int p = 0; p < nproc; p++)
2718  {
2719  if (p != my_rank)
2720  {
2721  // send
2722  if (nrow_local_for_proc[p] > 0)
2723  {
2724  MPI_Request req;
2725  MPI_Isend(&nnz_for_proc[p],1,MPI_UNSIGNED,p,0,
2726  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
2727  send_req.push_back(req);
2728  }
2729 
2730  // recv
2731  if (nrow_local_from_proc[p] > 0)
2732  {
2733  MPI_Request req;
2734  MPI_Irecv(&nnz_from_proc[p],1,MPI_UNSIGNED,p,0,
2735  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
2736  nnz_recv_req.push_back(req);
2737  }
2738  }
2739  // "send to self"
2740  else
2741  {
2742  nnz_from_proc[p] = nnz_for_proc[p];
2743  }
2744  }
2745 
2746  // allocate new storage for the new row_start
2747  int* new_row_start = new int[new_nrow_local[my_rank]+1];
2748 
2749  // wait for recvs to complete
2750  unsigned n_recv_req = nnz_recv_req.size();
2751  if (n_recv_req > 0)
2752  {
2753  Vector<MPI_Status> recv_status(n_recv_req);
2754  MPI_Waitall(n_recv_req,&nnz_recv_req[0],&recv_status[0]);
2755  }
2756 
2757  // compute the nnz offset for each processor
2758  unsigned next_row = 0;
2759  unsigned nnz_count = 0;
2760  Vector<unsigned> nnz_offset(nproc,0);
2761  for (int p = 0; p < nproc; p++)
2762  {
2763  unsigned pp = 0;
2764  while (new_first_row[pp] != next_row) { pp++; }
2765  nnz_offset[pp] = nnz_count;
2766  nnz_count+= nnz_from_proc[pp];
2767  next_row += new_nrow_local[pp];
2768  }
2769 
2770  // allocate storage for the values and column indices
2771  int* new_column_index = new int[nnz_count];
2772  double* new_value = new double[nnz_count];
2773 
2774  // post the sends and recvs for the matrix data
2775  Vector<MPI_Request> recv_req;
2776  MPI_Aint base_address;
2777  MPI_Get_address(new_value,&base_address);
2778  for (int p = 0; p < nproc; p++)
2779  {
2780  // communicated with other processors
2781  if (p != my_rank)
2782  {
2783  // SEND
2784  if (nrow_local_for_proc[p] > 0)
2785  {
2786  // array of datatypes
2787  MPI_Datatype types[3];
2788 
2789  // array of offsets
2790  MPI_Aint offsets[3];
2791 
2792  // array of lengths
2793  int len[3];
2794 
2795  // row start
2796  unsigned first_row_to_send = first_row_for_proc[p] -
2797  current_first_row[my_rank];
2798  MPI_Type_contiguous(nrow_local_for_proc[p],MPI_INT,
2799  &types[0]);
2800  MPI_Type_commit(&types[0]);
2801  len[0] = 1;
2802  MPI_Get_address(current_row_start+first_row_to_send,&offsets[0]);
2803  offsets[0] -= base_address;
2804 
2805  // values
2806  unsigned first_coef_to_send
2807  = current_row_start[first_row_to_send];
2808  MPI_Type_contiguous(nnz_for_proc[p],MPI_DOUBLE,
2809  &types[1]);
2810  MPI_Type_commit(&types[1]);
2811  len[1] = 1;
2812  MPI_Get_address(current_value+first_coef_to_send,&offsets[1]);
2813  offsets[1] -= base_address;
2814 
2815  // column index
 2816  MPI_Type_contiguous(nnz_for_proc[p],MPI_INT,
2817  &types[2]);
2818  MPI_Type_commit(&types[2]);
2819  len[2] = 1;
2820  MPI_Get_address(current_column_index+first_coef_to_send,&offsets[2]);
2821  offsets[2] -= base_address;
2822 
2823  // build the combined datatype
2824  MPI_Datatype send_type;
2825  MPI_Type_create_struct(3,len,offsets,types,&send_type);
2826  MPI_Type_commit(&send_type);
2827  MPI_Type_free(&types[0]);
2828  MPI_Type_free(&types[1]);
2829  MPI_Type_free(&types[2]);
2830 
2831  // and send
2832  MPI_Request req;
2833  MPI_Isend(new_value,1,send_type,p,1,
2834  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
2835  send_req.push_back(req);
2836  MPI_Type_free(&send_type);
2837  }
2838 
2839  // RECV
2840  if (nrow_local_from_proc[p] > 0)
2841  {
2842  // array of datatypes
2843  MPI_Datatype types[3];
2844 
2845  // array of offsets
2846  MPI_Aint offsets[3];
2847 
2848  // array of lengths
2849  int len[3];
2850 
2851  // row start
2852  unsigned first_row_to_recv = first_row_from_proc[p] -
2853  new_first_row[my_rank];
2854  MPI_Type_contiguous(nrow_local_from_proc[p],MPI_INT,
2855  &types[0]);
2856  MPI_Type_commit(&types[0]);
2857  len[0] = 1;
2858  MPI_Get_address(new_row_start+first_row_to_recv,&offsets[0]);
2859  offsets[0] -= base_address;
2860 
2861  // values
2862  unsigned first_coef_to_recv = nnz_offset[p];
2863  MPI_Type_contiguous(nnz_from_proc[p],MPI_DOUBLE,
2864  &types[1]);
2865  MPI_Type_commit(&types[1]);
2866  len[1] = 1;
2867  MPI_Get_address(new_value+first_coef_to_recv,&offsets[1]);
2868  offsets[1] -= base_address;
2869 
2870  // column index
2871  MPI_Type_contiguous(nnz_from_proc[p],MPI_INT,
2872  &types[2]);
2873  MPI_Type_commit(&types[2]);
2874  len[2] = 1;
2875  MPI_Get_address(new_column_index+first_coef_to_recv,&offsets[2]);
2876  offsets[2] -= base_address;
2877 
2878  // build the combined datatype
2879  MPI_Datatype recv_type;
2880  MPI_Type_create_struct(3,len,offsets,types,&recv_type);
2881  MPI_Type_commit(&recv_type);
2882  MPI_Type_free(&types[0]);
2883  MPI_Type_free(&types[1]);
2884  MPI_Type_free(&types[2]);
2885 
 2886  // and receive
2887  MPI_Request req;
2888  MPI_Irecv(new_value,1,recv_type,p,1,
2889  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
2890  recv_req.push_back(req);
2891  MPI_Type_free(&recv_type);
2892  }
2893  }
 2894  // otherwise transfer data internally
2895  else
2896  {
2897  unsigned j = first_row_for_proc[my_rank] -
2898  current_first_row[my_rank];
2899  unsigned k = first_row_from_proc[my_rank] -
2900  new_first_row[my_rank];
2901  for (unsigned i = 0; i < nrow_local_for_proc[my_rank]; i++)
2902  {
2903  new_row_start[k+i] = current_row_start[j+i];
2904  }
2905  unsigned first_coef_to_send = current_row_start[j];
2906  for (unsigned i = 0; i < nnz_for_proc[my_rank]; i++)
2907  {
2908  new_value[nnz_offset[p]+i]=current_value[first_coef_to_send+i];
2909  new_column_index[nnz_offset[p]+i]
2910  =current_column_index[first_coef_to_send+i];
2911  }
2912  }
2913  }
2914 
2915  // wait for all recvs to complete
2916  n_recv_req = recv_req.size();
2917  if (n_recv_req > 0)
2918  {
2919  Vector<MPI_Status> recv_status(n_recv_req);
2920  MPI_Waitall(n_recv_req,&recv_req[0],&recv_status[0]);
2921  }
2922 
2923  // next we need to update the row starts
2924  for (int p = 0; p < nproc; p++)
2925  {
2926  if (nrow_local_from_proc[p] > 0)
2927  {
2928  unsigned first_row = first_row_from_proc[p]-new_first_row[my_rank];
2929  unsigned last_row = first_row + nrow_local_from_proc[p]-1;
2930  int update = nnz_offset[p] - new_row_start[first_row];
2931  for (unsigned i = first_row; i <= last_row; i++)
2932  {
2933  new_row_start[i]+=update;
2934  }
2935  }
2936  }
2937  new_row_start[dist_pt->nrow_local()] = nnz_count;
2938 
2939  // wait for sends to complete
2940  unsigned n_send_req = send_req.size();
 2941  if (n_send_req > 0)
 2942  {
 2943  Vector<MPI_Status> send_status(n_send_req);
2944  MPI_Waitall(n_send_req,&send_req[0],&send_status[0]);
2945  }
2946  // if (my_rank == 0)
2947  // {
2948  // CRDoubleMatrix* m_pt = this->global_matrix();
2949  // m_pt->sparse_indexed_output("m1.dat");
2950  // }
2951 
2952  //
2953  this->build(dist_pt);
2954  this->build_without_copy(ncol,nnz_count,
2955  new_value,new_column_index,
2956  new_row_start);
2957  // if (my_rank == 0)
2958  // {
2959  // CRDoubleMatrix* m_pt = this->global_matrix();
2960  // m_pt->sparse_indexed_output("m2.dat");
2961  // }
2962 // this->sparse_indexed_output(oomph_info);
2964  }
2965 
2966  // if this matrix is distributed but the new distributed matrix is global
2967  // ======================================================================
2968  else if (this->distributed() && !dist_pt->distributed())
2969  {
2970 
2971  // nnz
2972  int nnz = this->nnz();
2973 
2974  // nrow global
2975  unsigned nrow = this->nrow();
2976 
2977  // cache nproc
2978  int nproc = this->distribution_pt()->communicator_pt()->nproc();
2979 
2980  // get the nnzs on the other processors
2981  int* dist_nnz_pt = new int[nproc];
2982  MPI_Allgather(&nnz,1,MPI_INT,
2983  dist_nnz_pt,1,MPI_INT,
2984  this->distribution_pt()->communicator_pt()->mpi_comm());
2985 
2986  // create an int array of first rows and nrow local and
2987  // compute nnz global
2988  int* dist_first_row = new int[nproc];
2989  int* dist_nrow_local = new int[nproc];
2990  for (int p = 0; p < nproc; p++)
2991  {
2992  dist_first_row[p] = this->first_row(p);
2993  dist_nrow_local[p] = this->nrow_local(p);
2994  }
2995 
 2996  // compute the offset for the values and column index data
2997  // compute the nnz offset for each processor
2998  int next_row = 0;
2999  unsigned nnz_count = 0;
3000  Vector<unsigned> nnz_offset(nproc,0);
3001  for (int p = 0; p < nproc; p++)
3002  {
3003  unsigned pp = 0;
3004  while (dist_first_row[pp] != next_row) { pp++; }
3005  nnz_offset[pp] = nnz_count;
3006  nnz_count+=dist_nnz_pt[pp];
3007  next_row+=dist_nrow_local[pp];
3008  }
3009 
3010  // get pointers to the (current) distributed data
3011  int* dist_row_start = this->row_start();
3012  int* dist_column_index = this->column_index();
3013  double* dist_value = this->value();
3014 
3015  // space for the global matrix
3016  int* global_row_start = new int[nrow+1];
3017  int* global_column_index = new int[nnz_count];
3018  double* global_value = new double[nnz_count];
3019 
3020  // post the sends and recvs for the matrix data
3021  Vector<MPI_Request> recv_req;
3022  Vector<MPI_Request> send_req;
3023  MPI_Aint base_address;
3024  MPI_Get_address(global_value,&base_address);
3025 
3026  // SEND
3027  if (dist_nrow_local[my_rank] > 0)
3028  {
3029  // types
3030  MPI_Datatype types[3];
3031 
3032  // offsets
3033  MPI_Aint offsets[3];
3034 
3035  // lengths
3036  int len[3];
3037 
3038  // row start
3039  MPI_Type_contiguous(dist_nrow_local[my_rank],MPI_INT,&types[0]);
3040  MPI_Type_commit(&types[0]);
3041  MPI_Get_address(dist_row_start,&offsets[0]);
3042  offsets[0] -= base_address;
3043  len[0] = 1;
3044 
3045  // value
3046  MPI_Type_contiguous(nnz,MPI_DOUBLE,&types[1]);
3047  MPI_Type_commit(&types[1]);
3048  MPI_Get_address(dist_value,&offsets[1]);
3049  offsets[1] -= base_address;
3050  len[1] = 1;
3051 
3052  // column indices
3053  MPI_Type_contiguous(nnz,MPI_INT,&types[2]);
3054  MPI_Type_commit(&types[2]);
3055  MPI_Get_address(dist_column_index,&offsets[2]);
3056  offsets[2] -= base_address;
3057  len[2] = 1;
3058 
3059  // build the send type
3060  MPI_Datatype send_type;
3061  MPI_Type_create_struct(3,len,offsets,types,&send_type);
3062  MPI_Type_commit(&send_type);
3063  MPI_Type_free(&types[0]);
3064  MPI_Type_free(&types[1]);
3065  MPI_Type_free(&types[2]);
3066 
3067  // and send
3068  for (int p = 0; p < nproc; p++)
3069  {
3070  if (p != my_rank)
3071  {
3072  MPI_Request req;
3073  MPI_Isend(global_value,1,send_type,p,1,
3074  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
3075  send_req.push_back(req);
3076  }
3077  }
3078  MPI_Type_free(&send_type);
3079  }
3080 
3081  // RECV
3082  for (int p = 0; p < nproc; p++)
3083  {
3084  // communicated with other processors
3085  if (p != my_rank)
3086  {
3087  // RECV
3088  if (dist_nrow_local[p] > 0)
3089  {
3090 
3091  // types
3092  MPI_Datatype types[3];
3093 
3094  // offsets
3095  MPI_Aint offsets[3];
3096 
3097  // lengths
3098  int len[3];
3099 
3100  // row start
3101  MPI_Type_contiguous(dist_nrow_local[p],MPI_INT,&types[0]);
3102  MPI_Type_commit(&types[0]);
3103  MPI_Get_address(global_row_start+dist_first_row[p],&offsets[0]);
3104  offsets[0] -= base_address;
3105  len[0] = 1;
3106 
3107  // value
3108  MPI_Type_contiguous(dist_nnz_pt[p],MPI_DOUBLE,&types[1]);
3109  MPI_Type_commit(&types[1]);
3110  MPI_Get_address(global_value+nnz_offset[p],&offsets[1]);
3111  offsets[1] -= base_address;
3112  len[1] = 1;
3113 
3114  // column indices
3115  MPI_Type_contiguous(dist_nnz_pt[p],MPI_INT,&types[2]);
3116  MPI_Type_commit(&types[2]);
3117  MPI_Get_address(global_column_index+nnz_offset[p],&offsets[2]);
3118  offsets[2] -= base_address;
3119  len[2] = 1;
3120 
 3121  // build the receive type
3122  MPI_Datatype recv_type;
3123  MPI_Type_create_struct(3,len,offsets,types,&recv_type);
3124  MPI_Type_commit(&recv_type);
3125  MPI_Type_free(&types[0]);
3126  MPI_Type_free(&types[1]);
3127  MPI_Type_free(&types[2]);
3128 
 3129  // and receive
3130  MPI_Request req;
3131  MPI_Irecv(global_value,1,recv_type,p,1,
3132  this->distribution_pt()->communicator_pt()->mpi_comm(),&req);
3133  recv_req.push_back(req);
3134  MPI_Type_free(&recv_type);
3135  }
3136  }
3137  // otherwise send to self
3138  else
3139  {
3140  unsigned nrow_local = dist_nrow_local[my_rank];
3141  unsigned first_row = dist_first_row[my_rank];
3142  for (unsigned i = 0; i < nrow_local; i++)
3143  {
3144  global_row_start[first_row+i]=dist_row_start[i];
3145  }
3146  unsigned offset = nnz_offset[my_rank];
3147  for (int i = 0; i < nnz; i++)
3148  {
3149  global_value[offset+i]=dist_value[i];
3150  global_column_index[offset+i]=dist_column_index[i];
3151  }
3152  }
3153  }
3154 
3155  // wait for all recvs to complete
3156  unsigned n_recv_req = recv_req.size();
3157  if (n_recv_req > 0)
3158  {
3159  Vector<MPI_Status> recv_status(n_recv_req);
3160  MPI_Waitall(n_recv_req,&recv_req[0],&recv_status[0]);
3161  }
3162 
3163  // finally the last row start
3164  global_row_start[nrow] = nnz_count;
3165 
3166  // update the other row start
3167  for (int p = 0; p < nproc; p++)
3168  {
3169  for (int i = 0; i < dist_nrow_local[p]; i++)
3170  {
3171  unsigned j = dist_first_row[p] + i;
3172  global_row_start[j]+=nnz_offset[p];
3173  }
3174  }
3175 
3176  // wait for sends to complete
3177  unsigned n_send_req = send_req.size();
 3178  if (n_send_req > 0)
 3179  {
 3180  Vector<MPI_Status> send_status(n_send_req);
3181  MPI_Waitall(n_send_req,&send_req[0],&send_status[0]);
3182  }
3183 
3184  // rebuild the matrix
3185  LinearAlgebraDistribution* dist_pt = new
3186  LinearAlgebraDistribution(this->distribution_pt()->communicator_pt(),
3187  nrow,false);
3188  this->build(dist_pt);
3189  this->build_without_copy(ncol,nnz_count,
3190  global_value,global_column_index,
3191  global_row_start);
3192 
3193  // clean up
3194  delete dist_pt;
3195  delete[] dist_first_row;
3196  delete[] dist_nrow_local;
3197  delete[] dist_nnz_pt;
3198  }
3199 
 3200  // otherwise the matrix is not distributed but it needs to be turned
3201  // into a distributed matrix
3202  // =============================================================
3203  else if (!this->distributed() && dist_pt->distributed())
3204  {
3205 
3206  // cache the new nrow_local
3207  unsigned nrow_local = dist_pt->nrow_local();
3208 
3209  // and first_row
3210  unsigned first_row = dist_pt->first_row();
3211 
3212  // get pointers to the (current) distributed data
3213  int* global_row_start = this->row_start();
3214  int* global_column_index = this->column_index();
3215  double* global_value = this->value();
3216 
3217  // determine the number of non zeros required by this processor
3218  unsigned nnz = global_row_start[first_row+nrow_local] -
3219  global_row_start[first_row];
3220 
3221  // allocate
3222  int* dist_row_start = new int[nrow_local+1];
3223  int* dist_column_index = new int[nnz];
3224  double* dist_value = new double[nnz];
3225 
3226  // copy
3227  int offset = global_row_start[first_row];
3228  for (unsigned i = 0; i <= nrow_local; i++)
3229  {
3230  dist_row_start[i] = global_row_start[first_row+i]-offset;
3231  }
3232  for (unsigned i = 0; i < nnz; i++)
3233  {
3234  dist_column_index[i] = global_column_index[offset+i];
3235  dist_value[i] = global_value[offset+i];
3236  }
3237 
3238  // rebuild
3239  this->build(dist_pt);
3240  this->build_without_copy(ncol,nnz,dist_value,
3241  dist_column_index,dist_row_start);
3242  }
3243  }
3244 #endif
3245  }
3246 
3247 //=============================================================================
3248 /// Compute transpose of matrix
3249 //=============================================================================
 3250  void CRDoubleMatrix::get_matrix_transpose(CRDoubleMatrix* result) const
 3251  {
3252  // Get the number of non_zeros
3253  unsigned long nnon_zeros=this->nnz();
3254 
3255  // Find the number of rows and columns in the transposed
3256  // matrix. We differentiate these from those associated
3257  // with the original matrix by appending the characters
3258  // '_t' onto the end
3259  unsigned long n_rows=this->nrow();
3260  unsigned long n_rows_t=this->ncol();
3261 
3262 #ifdef OOMPH_HAS_MPI
3263  // We only need to produce a warning if the matrix is distributed
3264  if (this->distributed())
3265  {
3266  // Create an ostringstream object to store the warning message
3267  std::ostringstream warning_message;
3268 
 3269  // Create the warning message
3270  warning_message << "This method currently works for serial but "
3271  << "has not been tested with MPI!\n";
3272 
3273  // Issue the warning
3274  OomphLibWarning(warning_message.str(),
3275  OOMPH_CURRENT_FUNCTION,
3276  OOMPH_EXCEPTION_LOCATION);
3277  }
3278 #endif
3279 
3280  // Set up the distribution for the transposed matrix
3281  result->distribution_pt()->build(this->distribution_pt()->
3282  communicator_pt(),n_rows_t,false);
3283 
3284  // Acquire access to the value, row_start and column_index
3285  // arrays from the CR matrix
3286  const double* value_pt=this->value();
3287  const int* row_start_pt=this->row_start();
3288  const int* column_index_pt=this->column_index();
3289 
3290  // Allocate space for the row_start and column_index vectors
3291  // associated with the transpose of the current matrix.
3292  Vector<double> value_t(nnon_zeros,0.0);
3293  Vector<int> column_index_t(nnon_zeros,0);
3294  Vector<int> row_start_t(n_rows_t+1,0);
3295 
3296  // Loop over the column index vector and count how many times
3297  // each column number occurs and increment the appropriate
3298  // entry in row_start_t whose i+1'th entry will contain the
3299  // number of non-zeros in the i'th column of the matrix (this
3300  // is done so that the cumulative sum done next returns the
3301  // correct result)
3302  for (unsigned i=0;i<nnon_zeros;i++)
3303  {
3304  // Assign entries to row_start_t (noting the first
3305  // entry is left as 0 for the cumulative sum done later)
3306  row_start_t[*(column_index_pt+i)+1]++;
3307  }
3308 
3309  // Calculate the sum of the first i entries in the row_start_t
3310  // vector and store the values in the i'th entry of row_start_t
3311  for (unsigned i=1;i<n_rows_t+1;i++)
3312  {
3313  // Calculate the cumulative sum
3314  row_start_t[i]+=row_start_t[i-1];
3315  }
3316 
3317  // Allocate space for variables to store the indices of the
3318  // start and end of the non-zeros in a given row of the matrix
3319  unsigned i_row_s=0;
3320  unsigned i_row_e=0;
3321 
3322  // Initialise 3 extra variables for readability of the
3323  // code in the subsequent piece of code
3324  unsigned a=0;
3325  unsigned b=0;
3326  unsigned c=0;
3327 
3328  // Vector needed to count the number of entries added
3329  // to each segment in column_index_t where each segment
3330  // is associated with each row in the transposed matrix
3331  Vector<int> counter(n_rows_t,0);
3332 
3333  // Set the entries in column_index_t. To do this we loop
3334  // over each row of the original matrix (equivalently
3335  // the number of columns in the transpose)
3336  for (unsigned i_row=0;i_row<n_rows;i_row++)
3337  {
3338  // Here we find the indices of the start and end
3339  // of the non-zeros in i_row'th row of the matrix.
3340  // [Note, there should be a -1 on i_row_e but this
3341  // is ignored so that we use a strict inequality
3342  // in the subsequent for-loop!]
3343  i_row_s=*(row_start_pt+i_row);
3344  i_row_e=*(row_start_pt+i_row+1);
3345 
3346  // Loop over the entries in the i_row'th row
3347  // of the matrix
3348  for (unsigned j=i_row_s;j<i_row_e;j++)
3349  {
3350 
3351  // The value of a is the column index of the j'th
3352  // element in the i_row'th row of the original matrix
3353  // (which is also the row index of the associated
3354  // non-zero in the transposed matrix)
3355  a=*(column_index_pt+j);
3356 
3357  // The value of b will be used to find the start
3358  // of the appropriate segment of column_index_t
3359  // that the non-zero belongs in
3360  b=row_start_t[a];
3361 
3362  // Find the number of elements already added to
3363  // this segment (to find which entry of the segment
3364  // the value i_row, the column index of the non-zero
3365  // in the transposed matrix, should be assigned to)
3366  c=counter[*(column_index_pt+j)];
3367 
3368  // Assign the value i_row to the appropriate entry
3369  // in column_index_t
3370  column_index_t[b+c]=i_row;
3371  value_t[b+c]=*(value_pt+j);
3372 
 3373  // Increment the entry of the counter vector associated with this
 3374  // column to indicate that another non-zero has been added into
 3375  // the corresponding segment of column_index_t
3376  counter[*(column_index_pt+j)]++;
3377 
3378  } // End of for-loop over i_row'th row of the matrix
3379 
3380  } // End of for-loop over rows of the matrix
3381 
3382  // Build the matrix (note: the value of n_cols for the
3383  // transposed matrix is n_rows for the original matrix)
3384  result->build(n_rows,value_t,column_index_t,row_start_t);
3385 
3386  } // End of the function
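 
// Worked example (illustrative only, not part of the original source): for
// the 2x2 matrix A = [[1,2],[0,3]] with value = {1,2,3},
// column_index = {0,1,1} and row_start = {0,2,3}, the counting pass gives
// row_start_t = {0,1,2}, which the cumulative sum turns into {0,1,3}.
// Scattering each entry into its column's segment then yields
// value_t = {1,2,3}, column_index_t = {0,0,1} and row_start_t = {0,1,3},
// i.e. A^T = [[1,0],[2,3]].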
3387 
3388 
3389 //=============================================================================
3390 /// Compute infinity (maximum) norm of matrix
3391 //=============================================================================
 3392  double CRDoubleMatrix::inf_norm() const
 3393  {
3394 #ifdef PARANOID
 3395  // paranoid check that the matrix is setup
3396  if (!this->distribution_built())
3397  {
3398  std::ostringstream error_message;
3399  error_message << "This matrix must be setup.";
3400  throw OomphLibError(error_message.str(),
3401  OOMPH_CURRENT_FUNCTION,
3402  OOMPH_EXCEPTION_LOCATION);
3403  }
3404 #endif
3405 
3406  // compute the local norm
3407  unsigned nrow_local = this->nrow_local();
3408  double n = 0;
3409  const int* row_start = CR_matrix.row_start();
3410  const double* value = CR_matrix.value();
3411  for (unsigned i = 0; i < nrow_local; i++)
3412  {
3413  double a = 0;
3414  for (int j = row_start[i]; j < row_start[i+1]; j++)
3415  {
3416  a += fabs(value[j]);
3417  }
3418  n = std::max(n,a);
3419  }
3420 
 3421  // if this matrix is distributed and on multiple processors then gather
3422 #ifdef OOMPH_HAS_MPI
3423  double n2 = n;
3424  if ( this->distributed() &&
3425  this->distribution_pt()->communicator_pt()->nproc() > 1)
3426  {
3427  MPI_Allreduce(&n,&n2,1,MPI_DOUBLE,MPI_MAX,
3428  this->distribution_pt()->communicator_pt()->mpi_comm());
3429  }
3430  n = n2;
3431 #endif
3432 
3433  // and return
3434  return n;
3435  }
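 
// Worked example (illustrative only): for A = [[1,-2],[4,3]] the row sums
// of absolute values are |1|+|-2| = 3 and |4|+|3| = 7, so inf_norm()
// returns 7.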
3436 
3437 //=============================================================================
3438 /// Return the diagonal entries of the matrix.
3439 /// This only works with square matrices. This condition may be relaxed
3440 /// in the future if need be.
3441 //=============================================================================
 3442 Vector<double> CRDoubleMatrix::diagonal_entries() const
 3443 {
3444 
3445 #ifdef PARANOID
3446  // Check if the matrix has been built.
3447  if(!this->built())
3448  {
3449  std::ostringstream error_message;
3450  error_message << "The matrix has not been built.\n"
3451  << "Please build it...\n";
3452  throw OomphLibError(error_message.str(),
3453  OOMPH_CURRENT_FUNCTION,
3454  OOMPH_EXCEPTION_LOCATION);
3455  }
3456 
3457  // Check if this is a square matrix.
3458  if(this->nrow() != this->ncol())
3459  {
3460  std::ostringstream error_message;
3461  error_message << "The matrix is not square. Can only get the diagonal\n"
3462  << "entries of a square matrix.\n";
3463  throw OomphLibError(error_message.str(),
3464  OOMPH_CURRENT_FUNCTION,
3465  OOMPH_EXCEPTION_LOCATION);
3466  }
3467 #endif
3468 
3469  // We get the diagonal entries on this processor.
3470  unsigned nrow_local = this->nrow_local();
3471 
3472  // Create storage for the diagonal entries.
3473  Vector<double> result_vec;
3474  result_vec.reserve(nrow_local);
3475 
3476  // Get the first row for the column offset.
3477  unsigned first_row = this->first_row();
3478 
3479  // Loop through the local rows.
3480  for (unsigned i = 0; i < nrow_local; i++)
3481  {
3482  // The column entries are globally indexed.
3483  unsigned diag_entry_col = first_row + i;
3484 
3485  // Push back the diagonal entry.
3486  result_vec.push_back(CR_matrix.get_entry(i, diag_entry_col));
3487  }
3488 
3489  return result_vec;
3490 }
3491 
3492 //=============================================================================
3493 /// Element-wise addition of this matrix with matrix_in.
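///
/// A minimal usage sketch (hypothetical names "a" and "b"; both matrices
/// must already be built with identical distributions):
/// \code
/// CRDoubleMatrix sum;  // empty result matrix
/// a.add(b,sum);        // sum = a + b (element-wise)
/// \endcode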
3494 //=============================================================================
3495 void CRDoubleMatrix::add(const CRDoubleMatrix &matrix_in,
3496  CRDoubleMatrix &result_matrix) const
3497 {
3498 #ifdef PARANOID
3499  // Check if this matrix is built.
3500  if(!this->built())
3501  {
3502  std::ostringstream error_message;
3503  error_message << "The matrix is not built.\n"
3504  << "Please build the matrix!\n";
3505  throw OomphLibError(error_message.str(),
3506  OOMPH_CURRENT_FUNCTION,
3507  OOMPH_EXCEPTION_LOCATION);
3508  }
3509 
3510  // Check if matrix_in is built.
3511  if(!matrix_in.built())
3512  {
3513  std::ostringstream error_message;
3514  error_message << "The matrix matrix_in is not built.\n"
3515  << "Please build the matrix!\n";
3516  throw OomphLibError(error_message.str(),
3517  OOMPH_CURRENT_FUNCTION,
3518  OOMPH_EXCEPTION_LOCATION);
3519  }
3520 
3521  // Check if the dimensions of this matrix and matrix_in are the same.
3522  unsigned long this_nrow = this->nrow();
3523  unsigned long matrix_in_nrow = matrix_in.nrow();
3524  if(this_nrow != matrix_in_nrow)
3525  {
3526  std::ostringstream error_message;
3527  error_message << "matrix_in has a different number of rows than\n"
3528  << "this matrix.\n";
3529  throw OomphLibError(error_message.str(),
3530  OOMPH_CURRENT_FUNCTION,
3531  OOMPH_EXCEPTION_LOCATION);
3532  }
3533 
3534  unsigned long this_ncol = this->ncol();
3535  unsigned long matrix_in_ncol = matrix_in.ncol();
3536  if(this_ncol != matrix_in_ncol)
3537  {
3538  std::ostringstream error_message;
3539  error_message << "matrix_in has a different number of columns than\n"
3540  << "this matrix.\n";
3541  throw OomphLibError(error_message.str(),
3542  OOMPH_CURRENT_FUNCTION,
3543  OOMPH_EXCEPTION_LOCATION);
3544  }
3545 
3546  // Check if the distribution is the same (Otherwise we may have to send and
3547  // receive data from other processors - which is not implemented!)
3548  if(*(this->distribution_pt()) != *(matrix_in.distribution_pt()))
3549  {
3550  std::ostringstream error_message;
3551  error_message << "matrix_in must have the same distribution as\n"
3552  << "this matrix.\n";
3553  throw OomphLibError(error_message.str(),
3554  OOMPH_CURRENT_FUNCTION,
3555  OOMPH_EXCEPTION_LOCATION);
3556  }
3557 
3558  // If the result matrix is built, check that its existing distribution
3559  // is the same as that of matrix_in, since we'll re-use that
3560  // distribution instead of completely rebuilding it.
3561  if(result_matrix.built() &&
3562  (*result_matrix.distribution_pt() != *matrix_in.distribution_pt()))
3563  {
3564  std::ostringstream error_message;
3565  error_message << "The result_matrix is built. "
3566  << "But has a different distribution from matrix_in \n"
3567  << "They need to be the same.\n";
3568  throw OomphLibError(error_message.str(),
3569  OOMPH_CURRENT_FUNCTION,
3570  OOMPH_EXCEPTION_LOCATION);
3571  }
3572 #endif
3573 
3574  // To add the elements of two CRDoubleMatrices, we need to know the union of
3575  // the sparsity patterns. This is determined by the column indices.
3576  // Per row, we insert the column indices and values (entries) as
3577  // key-value pairs in a map. We then read these out into the column index
3578  // and value vectors for the result matrix.
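  // For example, if row i of this matrix holds the entries
  // {col 0: 1.0, col 2: 3.0} and row i of matrix_in holds
  // {col 2: 4.0, col 5: 1.0}, the map for that row becomes
  // {0: 1.0, 2: 7.0, 5: 1.0}, i.e. three non-zeros in the result.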
3579 
3580  unsigned nrow_local = this->nrow_local();
3581  Vector<int> res_column_indices;
3582  Vector<double> res_values;
3583  Vector<int> res_row_start;
3584  res_row_start.reserve(nrow_local+1);
3585 
3586  // The row_start and column_indices
3587  const int* this_column_indices = this->column_index();
3588  const int* this_row_start = this->row_start();
3589  const int* in_column_indices = matrix_in.column_index();
3590  const int* in_row_start = matrix_in.row_start();
3591 
3592  // Values from this matrix and matrix_in.
3593  const double* this_values = this->value();
3594  const double* in_values = matrix_in.value();
3595 
3596 
3597  // The first entry in row_start is always zero.
3598  res_row_start.push_back(0);
3599 
3600  // Loop through the rows of both matrices and insert the column indices and
3601  // values as a key-value pair.
3602  for (unsigned row_i = 0; row_i < nrow_local; row_i++)
3603  {
3604  // Create the map for this row.
3605  std::map<int,double> res_row_map;
3606 
3607  // Insert the column and value pair for this matrix.
3608  for (int i = this_row_start[row_i]; i < this_row_start[row_i+1]; i++)
3609  {
3610  res_row_map[this_column_indices[i]] = this_values[i];
3611  }
3612 
3613  // Insert the column and value pair for in matrix.
3614  for (int i = in_row_start[row_i]; i < in_row_start[row_i+1]; i++)
3615  {
3616  res_row_map[in_column_indices[i]] += in_values[i];
3617  }
3618 
3619  // Fill in the row start
3620  res_row_start.push_back(res_row_start.back() + res_row_map.size());
3621 
3622  // Now insert the key into res_column_indices and value into res_values
3623  for(std::map<int,double>::iterator it = res_row_map.begin();
3624  it != res_row_map.end(); ++it)
3625  {
3626  res_column_indices.push_back(it->first);
3627  res_values.push_back(it->second);
3628  }
3629  }
3630 
3631  // Finally build the result_matrix.
3632  if(result_matrix.distribution_pt()->built())
3633  {
3634  // Use the existing distribution.
3635  result_matrix.build(this->ncol(),res_values,
3636  res_column_indices,res_row_start);
3637  }
3638  else
3639  {
3640  // Build with THIS distribution
3641  result_matrix.build(this->distribution_pt(),this->ncol(),res_values,
3642  res_column_indices,res_row_start);
3643  }
3644 }
3645 
3646 //=================================================================
3647 /// Namespace for helper functions for CRDoubleMatrices
3648 //=================================================================
3649 namespace CRDoubleMatrixHelpers
3650 {
3651  //============================================================================
3652  /// \short Builds a uniformly distributed matrix.
3653  /// A locally replicated matrix is constructed then redistributed using
3654  /// OOMPH-LIB's default uniform row distribution.
3655  /// This is memory intensive and thus should be used for
3656  /// testing or small problems only.
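 ///
 /// A minimal sketch (serial data for a 2x2 identity matrix; comm_pt is
 /// assumed to point to a valid OomphCommunicator):
 /// \code
 /// Vector<double> values(2,1.0);
 /// Vector<int> column_indices(2); column_indices[0]=0; column_indices[1]=1;
 /// Vector<int> row_start(3); row_start[0]=0; row_start[1]=1; row_start[2]=2;
 /// CRDoubleMatrix matrix;
 /// CRDoubleMatrixHelpers::create_uniformly_distributed_matrix(
 ///  2,2,comm_pt,values,column_indices,row_start,matrix);
 /// \endcode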
3657  //============================================================================
3658  void create_uniformly_distributed_matrix(
3659  const unsigned &nrow, const unsigned &ncol,
3660  const OomphCommunicator* const comm_pt,
3661  const Vector<double> &values,
3662  const Vector<int> &column_indices, const Vector<int> &row_start,
3663  CRDoubleMatrix &matrix_out)
3664  {
3665 #ifdef PARANOID
3666  // Check if the communicator exists.
3667  if(comm_pt == 0)
3668  {
3669  std::ostringstream error_message;
3670  error_message << "Please supply the communicator.\n";
3671  throw OomphLibError(error_message.str(),
3672  OOMPH_CURRENT_FUNCTION,
3673  OOMPH_EXCEPTION_LOCATION);
3674  }
3675  // Is the out matrix built? We need an empty matrix!
3676  if(matrix_out.built())
3677  {
3678  std::ostringstream error_message;
3679  error_message << "The result matrix has been built.\n"
3680  << "Please clear the matrix.\n";
3681  throw OomphLibError(error_message.str(),
3682  OOMPH_CURRENT_FUNCTION,
3683  OOMPH_EXCEPTION_LOCATION);
3684  }
3685 #endif
3686 
3687  // Create the locally replicated distribution.
3688  bool distributed = false;
3689  LinearAlgebraDistribution
3690  locally_replicated_distribution(comm_pt,nrow,distributed);
3691 
3692  // Create the matrix.
3693  matrix_out.build(&locally_replicated_distribution,ncol,
3694  values,column_indices,row_start);
3695 
3696  // Create the distributed distribution.
3697  distributed = true;
3698  LinearAlgebraDistribution
3699  distributed_distribution(comm_pt,nrow,distributed);
3700 
3701  // Redistribute the matrix.
3702  matrix_out.redistribute(&distributed_distribution);
3703  }
3704 
3705  //============================================================================
3706  /// Compute infinity (maximum) norm of sub blocks as if it was one matrix
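 /// (The blocks are viewed as a single assembled matrix and the maximum
 /// absolute row sum of that assembled matrix is returned.)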
3707  //============================================================================
3708  double inf_norm(const DenseMatrix<CRDoubleMatrix*> &matrix_pt)
3709  {
3710 
3711  // The number of block rows and columns
3712  const unsigned nblockrow = matrix_pt.nrow();
3713  const unsigned nblockcol = matrix_pt.ncol();
3714 
3715 #ifdef PARANOID
3716  // Check that there is at least one matrix.
3717  if(matrix_pt.nrow() == 0)
3718  {
3719  std::ostringstream error_message;
3720  error_message << "There are no matrices... \n";
3721  throw OomphLibError(error_message.str(),
3722  OOMPH_CURRENT_FUNCTION,
3723  OOMPH_EXCEPTION_LOCATION);
3724  }
3725 
3726 
3727  // Check that all matrix_pt pointers are not null
3728  // and the matrices are built.
3729  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3730  {
3731  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3732  {
3733  if(matrix_pt(block_row_i,block_col_i) == 0)
3734  {
3735  std::ostringstream error_message;
3736  error_message << "The pointer matrix_pt(" << block_row_i
3737  << "," << block_col_i <<") is null.\n";
3738  throw OomphLibError(error_message.str(),
3739  OOMPH_CURRENT_FUNCTION,
3740  OOMPH_EXCEPTION_LOCATION);
3741  }
3742 
3743  if(!matrix_pt(block_row_i,block_col_i)->built())
3744  {
3745  std::ostringstream error_message;
3746  error_message << "The matrix at matrix_pt(" << block_row_i
3747  << "," << block_col_i <<") is not built.\n";
3748  throw OomphLibError(error_message.str(),
3749  OOMPH_CURRENT_FUNCTION,
3750  OOMPH_EXCEPTION_LOCATION);
3751  }
3752  }
3753  }
3754 #endif
3755 
3756 #ifdef OOMPH_HAS_MPI
3757 
3758  // The communicator pointer from block (0,0)
3759  const OomphCommunicator* const comm_pt
3760  = matrix_pt(0,0)->distribution_pt()->communicator_pt();
3761 
3762 #ifdef PARANOID
3763 
3764 
3765  // Check that all communicators are the same
3766  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3767  {
3768  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3769  {
3770  // Communicator for this block matrix.
3771  const OomphCommunicator current_block_comm
3772  = *(matrix_pt(block_row_i,block_col_i)
3773  ->distribution_pt()->communicator_pt());
3774  if(*comm_pt != current_block_comm)
3775  {
3776  std::ostringstream error_message;
3777  error_message << "The communicator of block matrix_pt("<< block_row_i
3778  << "," << block_col_i <<") is not the same as block "
3779  << "matrix_pt(0,0).\n";
3780  throw OomphLibError(error_message.str(),
3781  OOMPH_CURRENT_FUNCTION,
3782  OOMPH_EXCEPTION_LOCATION);
3783 
3784  }
3785  }
3786  }
3787 
3788  // Check that all distributed booleans are the same (if on more than 1 core)
3789  if(comm_pt->nproc() > 1)
3790  {
3791  // Get the distributed boolean from matrix_pt(0,0)
3792  bool first_distributed = matrix_pt(0,0)->distributed();
3793 
3794  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3795  {
3796  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3797  {
3798  // Is the current block distributed?
3799  bool current_distributed
3800  = matrix_pt(block_row_i,block_col_i)->distributed();
3801 
3802  if(first_distributed != current_distributed)
3803  {
3804  std::ostringstream error_message;
3805  error_message << "Block matrix_pt(" << block_row_i
3806  << ","<<block_col_i<<") and block matrix_pt(0,0) "
3807  << "have a different distributed boolean.\n";
3808  throw OomphLibError(error_message.str(),
3809  OOMPH_CURRENT_FUNCTION,
3810  OOMPH_EXCEPTION_LOCATION);
3811  }
3812  }
3813  }
3814  }
3815 
3816  // Check that all sub matrix dimensions "make sense"
3817  // We need to check that all the matrices in the same row have the same nrow.
3818  // Then repeat for the columns.
3819 
3820  // Check the nrow of each block row.
3821  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3822  {
3823  // Get the nrow to compare against from the first column.
3824  const unsigned first_block_nrow = matrix_pt(block_row_i,0)->nrow();
3825 
3826  // Loop through the block columns.
3827  for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
3828  {
3829  // If the nrow of this block is not the same as the nrow from the first
3830  // block in this block row, throw an error.
3831  const unsigned current_block_nrow
3832  = matrix_pt(block_row_i,block_col_i)->nrow();
3833 
3834  if(first_block_nrow != current_block_nrow)
3835  {
3836  std::ostringstream error_message;
3837  error_message << "First block has nrow = " << first_block_nrow
3838  << ". But matrix_pt(" << block_row_i
3839  << "," << block_col_i <<") has nrow = "
3840  << current_block_nrow << ".\n";
3841  throw OomphLibError(error_message.str(),
3842  OOMPH_CURRENT_FUNCTION,
3843  OOMPH_EXCEPTION_LOCATION);
3844  }
3845 
3846  }
3847  }
3848 
3849  // Check the ncol of each block column.
3850  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3851  {
3852  // Get the ncol from the first block row to compare against.
3853  const unsigned first_block_ncol = matrix_pt(0,block_col_i)->ncol();
3854 
3855  for (unsigned block_row_i = 1; block_row_i < nblockrow; block_row_i++)
3856  {
3857  // Get the ncol for the current block.
3858  const unsigned current_block_ncol
3859  = matrix_pt(block_row_i,block_col_i)->ncol();
3860 
3861  if(first_block_ncol != current_block_ncol)
3862  {
3863  std::ostringstream error_message;
3864  error_message << "First block has ncol = " << first_block_ncol
3865  << ". But matrix_pt(" << block_row_i
3866  << "," << block_col_i <<") has ncol = "
3867  << current_block_ncol << ".\n";
3868  throw OomphLibError(error_message.str(),
3869  OOMPH_CURRENT_FUNCTION,
3870  OOMPH_EXCEPTION_LOCATION);
3871  }
3872  }
3873  }
3874 
3875  // Check that the distribution for each block row is the same.
3876  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3877  {
3878  // The first distribution of this block row.
3879  const LinearAlgebraDistribution first_dist
3880  = *(matrix_pt(block_row_i,0)->distribution_pt());
3881 
3882  // Loop through the rest of the block columns.
3883  for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
3884  {
3885  // Get the distribution from the current block.
3886  const LinearAlgebraDistribution current_dist
3887  = matrix_pt(block_row_i,block_col_i)->distribution_pt();
3888 
3889  // Compare the first distribution against the current.
3890  if(first_dist != current_dist)
3891  {
3892  std::ostringstream error_message;
3893  error_message << "First distribution of block row " << block_row_i
3894  << " is different from the distribution from "
3895  << "matrix_pt(" << block_row_i
3896  << "," << block_col_i <<").\n";
3897  throw OomphLibError(error_message.str(),
3898  OOMPH_CURRENT_FUNCTION,
3899  OOMPH_EXCEPTION_LOCATION);
3900  }
3901  }
3902  }
3903 #endif
3904 
3905 #endif
3906 
3907  // Loop through the block rows, then block columns to
3908  // compute the local inf norm
3909  double inf_norm = 0;
3910  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3911  {
3912  // Get the number of local rows from the first block.
3913  unsigned block_nrow_local = matrix_pt(block_row_i,0)->nrow_local();
3914 
3915  // Loop through the block_nrow_local in this block row
3916  for (unsigned local_row_i = 0; local_row_i < block_nrow_local;
3917  local_row_i++)
3918  {
3919  double abs_sum_of_row = 0;
3920  // Loop through the block columns
3921  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
3922  {
3923  // Locally cache the pointer to the current block.
3924  CRDoubleMatrix* block_pt = matrix_pt(block_row_i,block_col_i);
3925 
3926  const int* row_start = block_pt->row_start();
3927  const double* value = block_pt->value();
3928 
3929  // Loop through the values
3930  for (int val_i = row_start[local_row_i];
3931  val_i < row_start[local_row_i+1]; val_i++)
3932  {
3933  abs_sum_of_row += fabs(value[val_i]);
3934  }
3935  }
3936  // Store the max row
3937  // Store the max row sum
3938  }
3939  }
3940 
3941  // if the matrices are distributed over multiple processors then gather
3942 #ifdef OOMPH_HAS_MPI
3943  double inf_norm_local = inf_norm;
3944  if ( matrix_pt(0,0)->distributed() && comm_pt->nproc() > 1)
3945  {
3946  MPI_Allreduce(&inf_norm,&inf_norm_local,1,MPI_DOUBLE,MPI_MAX,
3947  comm_pt->mpi_comm());
3948  }
3949  inf_norm = inf_norm_local;
3950 #endif
3951 
3952  // and return
3953  return inf_norm;
3954  }
3955 
3956  //============================================================================
3957  /// \short Calculates the largest Gershgorin disc whilst preserving the sign.
3958  /// Let A be an n by n matrix with entries \f$ a_{ij} \f$. For \f$ i \in \{ 1,...,n \} \f$ let
3959  /// \f$ R_i = \sum_{j\neq i} |a_{ij}| \f$ be the sum of the absolute values of the
3960  /// non-diagonal entries in the i-th row. Let \f$ D(a_{ii},R_i) \f$ be the closed
3961  /// disc centered at \f$ a_{ii} \f$ with radius \f$ R_i \f$; such a disc is called a
3962  /// Gershgorin disc.
3963  ///
3964  /// \n
3965  ///
3966  /// We calculate \f$ |D(a_{ii},R_i)|_{max} \f$ and multiply by the sign of the diagonal
3967  /// entry.
3968  ///
3969  /// \n
3970  ///
3971  /// The DenseMatrix of CRDoubleMatrices is treated as if it were one
3972  /// large matrix. Therefore the dimensions of the sub matrices have to
3973  /// "make sense"; there is a paranoid check for this.
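 ///
 /// \n
 ///
 /// For example, for a single (serial) 2x2 block with rows (4, 1) and
 /// (2, 5), the disc edges furthest from zero are 4+1=5 and 5+2=7, so 7
 /// is returned; for a matrix with negative diagonal entries the most
 /// negative disc edge is returned instead.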
3974  //============================================================================
3975  double gershgorin_eigenvalue_estimate(
3976  const DenseMatrix<CRDoubleMatrix*> &matrix_pt)
3977  {
3978 
3979  // The number of block rows and columns
3980  const unsigned nblockrow = matrix_pt.nrow();
3981  const unsigned nblockcol = matrix_pt.ncol();
3982 
3983 #ifdef PARANOID
3984  // Check that there is at least one matrix.
3985  if(matrix_pt.nrow() == 0)
3986  {
3987  std::ostringstream error_message;
3988  error_message << "There are no matrices... \n";
3989  throw OomphLibError(error_message.str(),
3990  OOMPH_CURRENT_FUNCTION,
3991  OOMPH_EXCEPTION_LOCATION);
3992  }
3993 
3994 
3995  // Check that all matrix_pt pointers are not null
3996  // and the matrices are built.
3997  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
3998  {
3999  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4000  {
4001  if(matrix_pt(block_row_i,block_col_i) == 0)
4002  {
4003  std::ostringstream error_message;
4004  error_message << "The pointer matrix_pt(" << block_row_i
4005  << "," << block_col_i <<") is null.\n";
4006  throw OomphLibError(error_message.str(),
4007  OOMPH_CURRENT_FUNCTION,
4008  OOMPH_EXCEPTION_LOCATION);
4009  }
4010 
4011  if(!matrix_pt(block_row_i,block_col_i)->built())
4012  {
4013  std::ostringstream error_message;
4014  error_message << "The matrix at matrix_pt(" << block_row_i
4015  << "," << block_col_i <<") is not built.\n";
4016  throw OomphLibError(error_message.str(),
4017  OOMPH_CURRENT_FUNCTION,
4018  OOMPH_EXCEPTION_LOCATION);
4019  }
4020  }
4021  }
4022 #endif
4023 
4024 
4025 #ifdef OOMPH_HAS_MPI
4026 
4027  // The communicator pointer from block (0,0)
4028  // All communicators should be the same, we check this next.
4029  const OomphCommunicator* const comm_pt
4030  = matrix_pt(0,0)->distribution_pt()->communicator_pt();
4031 
4032 #ifdef PARANOID
4033 
4034  // Check that all communicators are the same
4035  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4036  {
4037  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4038  {
4039  // Communicator for this block matrix.
4040  const OomphCommunicator current_block_comm
4041  = *(matrix_pt(block_row_i,block_col_i)
4042  ->distribution_pt()->communicator_pt());
4043  if(*comm_pt != current_block_comm)
4044  {
4045  std::ostringstream error_message;
4046  error_message << "The communicator of block matrix_pt("<< block_row_i
4047  << "," << block_col_i <<") is not the same as block "
4048  << "matrix_pt(0,0).\n";
4049  throw OomphLibError(error_message.str(),
4050  OOMPH_CURRENT_FUNCTION,
4051  OOMPH_EXCEPTION_LOCATION);
4052 
4053  }
4054  }
4055  }
4056 
4057  // Check that all distributed booleans are the same (if on more than 1 core)
4058  if(comm_pt->nproc() > 1)
4059  {
4060  // Get the distributed boolean from matrix_pt(0,0)
4061  bool first_distributed = matrix_pt(0,0)->distributed();
4062 
4063  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4064  {
4065  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4066  {
4067  // Is the current block distributed?
4068  bool current_distributed
4069  = matrix_pt(block_row_i,block_col_i)->distributed();
4070 
4071  if(first_distributed != current_distributed)
4072  {
4073  std::ostringstream error_message;
4074  error_message << "Block matrix_pt(" << block_row_i
4075  << ","<<block_col_i<<") and block matrix_pt(0,0) "
4076  << "have a different distributed boolean.\n";
4077  throw OomphLibError(error_message.str(),
4078  OOMPH_CURRENT_FUNCTION,
4079  OOMPH_EXCEPTION_LOCATION);
4080  }
4081  }
4082  }
4083  }
4084 
4085  // Check that all sub matrix dimensions "make sense"
4086  // We need to check that all the matrices in the same row have the same nrow.
4087  // Then repeat for the columns.
4088 
4089  // Check the nrow of each block row.
4090  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4091  {
4092  // Get the nrow to compare against from the first column.
4093  const unsigned first_block_nrow = matrix_pt(block_row_i,0)->nrow();
4094 
4095  // Loop through the block columns.
4096  for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
4097  {
4098  // If the nrow of this block is not the same as the nrow from the first
4099  // block in this block row, throw an error.
4100  const unsigned current_block_nrow
4101  = matrix_pt(block_row_i,block_col_i)->nrow();
4102 
4103  if(first_block_nrow != current_block_nrow)
4104  {
4105  std::ostringstream error_message;
4106  error_message << "First block has nrow = " << first_block_nrow
4107  << ". But matrix_pt(" << block_row_i
4108  << "," << block_col_i <<") has nrow = "
4109  << current_block_nrow << ".\n";
4110  throw OomphLibError(error_message.str(),
4111  OOMPH_CURRENT_FUNCTION,
4112  OOMPH_EXCEPTION_LOCATION);
4113  }
4114 
4115  }
4116  }
4117 
4118  // Check the ncol of each block column.
4119  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4120  {
4121  // Get the ncol from the first block row to compare against.
4122  const unsigned first_block_ncol = matrix_pt(0,block_col_i)->ncol();
4123 
4124  for (unsigned block_row_i = 1; block_row_i < nblockrow; block_row_i++)
4125  {
4126  // Get the ncol for the current block.
4127  const unsigned current_block_ncol
4128  = matrix_pt(block_row_i,block_col_i)->ncol();
4129 
4130  if(first_block_ncol != current_block_ncol)
4131  {
4132  std::ostringstream error_message;
4133  error_message << "First block has ncol = " << first_block_ncol
4134  << ". But matrix_pt(" << block_row_i
4135  << "," << block_col_i <<") has ncol = "
4136  << current_block_ncol << ".\n";
4137  throw OomphLibError(error_message.str(),
4138  OOMPH_CURRENT_FUNCTION,
4139  OOMPH_EXCEPTION_LOCATION);
4140  }
4141  }
4142  }
4143 
4144  // Check that the distribution for each block row is the same.
4145  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4146  {
4147  // The first distribution of this block row.
4148  const LinearAlgebraDistribution first_dist
4149  = *(matrix_pt(block_row_i,0)->distribution_pt());
4150 
4151  // Loop through the rest of the block columns.
4152  for (unsigned block_col_i = 1; block_col_i < nblockcol; block_col_i++)
4153  {
4154  // Get the distribution from the current block.
4155  const LinearAlgebraDistribution current_dist
4156  = matrix_pt(block_row_i,block_col_i)->distribution_pt();
4157 
4158  // Compare the first distribution against the current.
4159  if(first_dist != current_dist)
4160  {
4161  std::ostringstream error_message;
4162  error_message << "First distribution of block row " << block_row_i
4163  << " is different from the distribution from "
4164  << "matrix_pt(" << block_row_i
4165  << "," << block_col_i <<").\n";
4166  throw OomphLibError(error_message.str(),
4167  OOMPH_CURRENT_FUNCTION,
4168  OOMPH_EXCEPTION_LOCATION);
4169  }
4170  }
4171  }
4172 
4173 #endif
4174 #endif
4175 
4176  // Loop through the block rows, then block columns, to
4177  // compute the local extreme Gershgorin disc value
4178  double extreme_disc = 0;
4179  for (unsigned block_row_i = 0; block_row_i < nblockrow; block_row_i++)
4180  {
4181  // Get the number of local rows from the first block.
4182  unsigned block_nrow_local = matrix_pt(block_row_i,0)->nrow_local();
4183 
4184  // Loop through the block_nrow_local in this block row
4185  for (unsigned local_row_i = 0; local_row_i < block_nrow_local;
4186  local_row_i++)
4187  {
4188  double abs_sum_of_row = 0;
4189  // Loop through the block columns
4190  for (unsigned block_col_i = 0; block_col_i < nblockcol; block_col_i++)
4191  {
4192  // Locally cache the pointer to the current block.
4193  CRDoubleMatrix* block_pt = matrix_pt(block_row_i,block_col_i);
4194 
4195  const int* row_start = block_pt->row_start();
4196  const double* value = block_pt->value();
4197 
4198  // Loop through the values
4199  for (int val_i = row_start[local_row_i];
4200  val_i < row_start[local_row_i+1]; val_i++)
4201  {
4202  abs_sum_of_row += fabs(value[val_i]);
4203  }
4204  }
4205 
4206  // Now subtract the diagonal entry...
4207  // Locate the diagonal block matrix.
4208  double* s_values = matrix_pt(block_row_i,block_row_i)->value();
4209  int* s_column_index = matrix_pt(block_row_i,block_row_i)->column_index();
4210  int* s_row_start = matrix_pt(block_row_i,block_row_i)->row_start();
4211  //int s_nrow_local = matrix_pt(block_row_i,block_row_i)->nrow_local();
4212  int s_first_row = matrix_pt(block_row_i,block_row_i)->first_row();
4213 
4214  // Get the diagonal value...
4215  double diagonal_value = 0;
4216  bool found = false;
4217  for (int j = s_row_start[local_row_i];
4218  j < s_row_start[local_row_i+1] && !found; j++)
4219  {
4220  if (s_column_index[j] == int(local_row_i + s_first_row))
4221  {
4222  diagonal_value = s_values[j];
4223  found = true;
4224  }
4225  }
4226 
4227  // Check if the diagonal entry is found.
4228  if(!found)
4229  {
4230  std::ostringstream error_message;
4231  error_message << "The diagonal entry for the block("
4232  << block_row_i<<","<<block_row_i<<")\n"
4233  << "on local row " << local_row_i << " does not exist."
4234  << std::endl;
4235  throw OomphLibError(error_message.str(),
4236  OOMPH_CURRENT_FUNCTION,
4237  OOMPH_EXCEPTION_LOCATION);
4238  }
4239 
4240  // This is the disc.
4241  abs_sum_of_row -= fabs(diagonal_value);
4242 
4243  // Now we have to check if the diagonal entry is
4244  // on the left or right side of zero.
4245  if(diagonal_value > 0)
4246  {
4247  double extreme_disc_local = diagonal_value + abs_sum_of_row;
4248  extreme_disc = std::max(extreme_disc_local,extreme_disc);
4249  }
4250  else
4251  {
4252  double extreme_disc_local = diagonal_value - abs_sum_of_row;
4253  extreme_disc = std::min(extreme_disc_local,extreme_disc);
4254  }
4255  } // Loop through local row (of all block column)
4256  } // Loop through block row
4257 
4258  // if the matrices are distributed over multiple processors then gather
4259 #ifdef OOMPH_HAS_MPI
4260  double extreme_disc_local = extreme_disc;
4261  if ( matrix_pt(0,0)->distributed() && comm_pt->nproc() > 1)
4262  {
4263  if(extreme_disc > 0)
4264  {
4265  MPI_Allreduce(&extreme_disc,&extreme_disc_local,1,MPI_DOUBLE,MPI_MAX,
4266  comm_pt->mpi_comm());
4267  }
4268  else
4269  {
4270  MPI_Allreduce(&extreme_disc,&extreme_disc_local,1,MPI_DOUBLE,MPI_MIN,
4271  comm_pt->mpi_comm());
4272  }
4273  }
4274  extreme_disc = extreme_disc_local;
4275 #endif
4276 
4277  // and return
4278  return extreme_disc;
4279  }
4280 
4281  //============================================================================
4282  /// \short Concatenate CRDoubleMatrix matrices.
4283  /// The in matrices are concatenated such that the block structure of the
4284  /// in matrices are preserved in the result matrix. Communication between
4285  /// processors is required. If the block structure of the sub matrices does
4286  /// not need to be preserved, consider using
4287  /// CRDoubleMatrixHelpers::concatenate_without_communication(...).
4288  ///
4289  /// The matrix manipulation functions
4290  /// CRDoubleMatrixHelpers::concatenate(...) and
4291  /// CRDoubleMatrixHelpers::concatenate_without_communication(...)
4292  /// are analogous to the Vector manipulation functions
4293  /// DoubleVectorHelpers::concatenate(...) and
4294  /// DoubleVectorHelpers::concatenate_without_communication(...).
4295  /// Please look at the DoubleVector functions for an illustration of the
4296  /// differences between concatenate(...) and
4297  /// concatenate_without_communication(...).
4298  ///
4299  /// Distribution of the result matrix:
4300  /// If the result matrix does not have a distribution built, then it will be
4301  /// given a uniform row distribution. Otherwise we use the existing
4302  /// distribution. This gives the user the ability to define their own
4303  /// distribution, or save computing power if a distribution has
4304  /// been pre-built.
4305  ///
4306  /// NOTE: ALL the matrices pointed to by matrix_pt have to be built. This is
4307  /// not the case with concatenate_without_communication(...)
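 ///
 /// A minimal usage sketch (hypothetical names; a, b, c and d are built
 /// CRDoubleMatrix objects with compatible dimensions and distributions):
 /// \code
 /// DenseMatrix<CRDoubleMatrix*> matrix_pt(2,2);
 /// matrix_pt(0,0)=&a; matrix_pt(0,1)=&b;
 /// matrix_pt(1,0)=&c; matrix_pt(1,1)=&d;
 /// CRDoubleMatrix result;
 /// CRDoubleMatrixHelpers::concatenate(matrix_pt,result);
 /// \endcode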
4308  //============================================================================
4309  void concatenate(const DenseMatrix<CRDoubleMatrix*> &matrix_pt,
4310  CRDoubleMatrix &result_matrix)
4311  {
4312 
4313  // The number of block rows and block columns.
4314  unsigned matrix_nrow = matrix_pt.nrow();
4315  unsigned matrix_ncol = matrix_pt.ncol();
4316 
4317  // PARANOID checks involving only the in matrices.
4318 #ifdef PARANOID
4319  // Are there matrices to concatenate?
4320  if(matrix_nrow == 0)
4321  {
4322  std::ostringstream error_message;
4323  error_message << "There are no matrices to concatenate.\n";
4324  throw OomphLibError(error_message.str(),
4325  OOMPH_CURRENT_FUNCTION,
4326  OOMPH_EXCEPTION_LOCATION);
4327  }
4328 
4329  // Does this matrix need concatenating?
4330  if((matrix_nrow == 1) && (matrix_ncol == 1))
4331  {
4332  std::ostringstream warning_message;
4333  warning_message << "There is only one matrix to concatenate...\n"
4334  << "This does not require concatenating...\n";
4335  OomphLibWarning(warning_message.str(),
4336  OOMPH_CURRENT_FUNCTION,
4337  OOMPH_EXCEPTION_LOCATION);
4338  }
4339 
4340  // Are all sub matrices built?
4341  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4342  {
4343  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4344  {
4345  if (!(matrix_pt(block_row_i,block_col_i)->built()))
4346  {
4347  std::ostringstream error_message;
4348  error_message
4349  << "The sub matrix (" << block_row_i << "," << block_col_i << ")\n"
4350  << "is not built. \n";
4351  throw OomphLibError(error_message.str(),
4352  OOMPH_CURRENT_FUNCTION,
4353  OOMPH_EXCEPTION_LOCATION);
4354  }
4355  }
4356  }
4357 
4358  // Do all dimensions of sub matrices "make sense"?
4359  // Compare the number of rows of each block matrix in a block row.
4360  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4361  {
4362  // Use the first column to compare against the rest.
4363  unsigned long current_block_nrow = matrix_pt(block_row_i,0)->nrow();
4364 
4365  // Compare against columns 1 to matrix_ncol - 1
4366  for(unsigned block_col_i = 1; block_col_i < matrix_ncol; block_col_i++)
4367  {
4368  // Get the nrow for this sub block.
4369  unsigned long subblock_nrow
4370  = matrix_pt(block_row_i,block_col_i)->nrow();
4371 
4372  if(current_block_nrow != subblock_nrow)
4373  {
4374  std::ostringstream error_message;
4375  error_message
4376  << "The sub matrix (" << block_row_i << "," << block_col_i << ")\n"
4377  << "requires nrow = " << current_block_nrow
4378  << ", but has nrow = " << subblock_nrow <<".\n";
4379  throw OomphLibError(error_message.str(),
4380  OOMPH_CURRENT_FUNCTION,
4381  OOMPH_EXCEPTION_LOCATION);
4382  }
4383  }
4384  }
4385 
4386  // Compare the number of columns of each block matrix in a block column.
4387  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4388  {
4389  // Use the first row to compare against the rest.
4390  unsigned long current_block_ncol = matrix_pt(0,block_col_i)->ncol();
4391 
4392  // Compare against rows 1 to matrix_nrow - 1
4393  for(unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
4394  {
4395  // Get the ncol for this sub block.
4396  unsigned long subblock_ncol
4397  = matrix_pt(block_row_i,block_col_i)->ncol();
4398 
4399  if(current_block_ncol != subblock_ncol)
4400  {
4401  std::ostringstream error_message;
4402  error_message
4403  << "The sub matrix (" << block_row_i << "," << block_col_i << ")\n"
4404  << "requires ncol = " << current_block_ncol
4405  << ", but has ncol = " << subblock_ncol << ".\n";
4406  throw OomphLibError(error_message.str(),
4407  OOMPH_CURRENT_FUNCTION,
4408  OOMPH_EXCEPTION_LOCATION);
4409  }
4410  }
4411  }
4412 #endif
4413 
4414  // The communicator pointer from block (0,0)
4415  const OomphCommunicator* const comm_pt
4416  = matrix_pt(0,0)->distribution_pt()->communicator_pt();
4417 
4418  // Check if the block (0,0) is distributed or not.
4419  bool distributed = matrix_pt(0,0)->distributed();
4420 
4421  // If the result matrix does not have a distribution, we create a uniform
4422  // distribution.
4423  if(!result_matrix.distribution_pt()->built())
4424  {
4425  // Sum of sub matrix nrow. We use the first column.
4426  unsigned tmp_nrow = 0;
4427  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4428  {
4429  tmp_nrow +=matrix_pt(block_row_i,0)->nrow();
4430  }
4431 
4432  LinearAlgebraDistribution tmp_distribution(comm_pt,tmp_nrow,distributed);
4433 
4434  result_matrix.build(&tmp_distribution);
4435  }
4436  else
4437  // A distribution is supplied for the result matrix.
4438  {
4439 #ifdef PARANOID
4440  // Check if the sum of the nrow from the sub matrices is the same as the
4441  // Check if the sum of the nrow from the sub matrices is the same as
4442 
4443  // Sum of sub matrix nrow. We use the first column.
4444  unsigned tmp_nrow = 0;
4445  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4446  {
4447  tmp_nrow +=matrix_pt(block_row_i,0)->nrow();
4448  }
4449 
4450  if(tmp_nrow != result_matrix.nrow())
4451  {
4452  std::ostringstream error_message;
4453  error_message << "The total number of rows from the matrices to\n"
4454  << "concatenate does not match the nrow from the\n"
4455  << "result matrix\n";
4456  throw OomphLibError(error_message.str(),
4457  OOMPH_CURRENT_FUNCTION,
4458  OOMPH_EXCEPTION_LOCATION);
4459  }
4460 #endif
4461  }
4462 
4463 #ifdef PARANOID
4464 
4465  // Are all the communicators the same?
4466  // Compare the communicator for sub matrices (against the result matrix).
4467  {
4468  const OomphCommunicator communicator
4469  = *(result_matrix.distribution_pt()->communicator_pt());
4470 
4471  // Are all communicator pointers the same?
4472  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4473  {
4474  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4475  {
4476  const OomphCommunicator another_communicator
4477  = *(matrix_pt(block_row_i,block_col_i)
4478  ->distribution_pt()->communicator_pt());
4479 
4480  if(!(communicator == another_communicator))
4481  {
4482  std::ostringstream error_message;
4483  error_message
4484  << "The OomphCommunicator of the sub matrix ("
4485  << block_row_i << "," << block_col_i << ")\n"
4486  << "does not have the same communicator as the result matrix. \n";
4487  throw OomphLibError(error_message.str(),
4488  OOMPH_CURRENT_FUNCTION,
4489  OOMPH_EXCEPTION_LOCATION);
4490  }
4491  }
4492  }
4493  }
4494 
4495  // Are all the distributed booleans the same? This only applies if we have
4496  // more than one processor. If there is only one processor, then it does
4497  // not matter if it is distributed or not - they are conceptually the same.
4498  if(comm_pt->nproc() != 1)
4499  {
4500  // Compare distributed for sub matrices (against the result matrix).
4501  const bool res_distributed = result_matrix.distributed();
4502 
4503  // Loop over all sub blocks.
4504  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4505  {
4506  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4507  {
4508  const bool another_distributed
4509  = matrix_pt(block_row_i,block_col_i)->distributed();
4510 
4511  if(res_distributed != another_distributed)
4512  {
4513  std::ostringstream error_message;
4514  error_message
4515  << "The distributed boolean of the sub matrix ("
4516  << block_row_i << "," << block_col_i << ")\n"
4517  << "is not the same as the result matrix. \n";
4518  throw OomphLibError(error_message.str(),
4519  OOMPH_CURRENT_FUNCTION,
4520  OOMPH_EXCEPTION_LOCATION);
4521  }
4522  }
4523  }
4524  }
4525 #endif
4526 
4527 
4528  // Get the number of columns up to each block for offset
4529  // in calculating the result column indices.
4530  // Since the number of columns in each block column is the same,
4531  // we only loop through the first block row (row zero).
4532  Vector<unsigned long> sum_of_ncol_up_to_block(matrix_ncol);
4533 
4534  // Also compute the total number of columns to build the resulting matrix.
4535  unsigned long res_ncol = 0;
4536 
4537  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
4538  {
4539  sum_of_ncol_up_to_block[block_col_i] = res_ncol;
4540  res_ncol += matrix_pt(0,block_col_i)->ncol();
4541  }
4542 
4543  // We begin the process of extracting and ordering the values,
4544  // column_indices and row_start of all the sub blocks.
4545  if((comm_pt->nproc() == 1) || !distributed)
4546  // Serial version of the code.
4547  {
4548  // Get the total number of non zero entries so we can reserve storage for
4549  // the values and column_indices vectors.
4550  unsigned long res_nnz = 0;
4551  for (unsigned block_row_i = 0;
4552  block_row_i < matrix_nrow; block_row_i++)
4553  {
4554  for (unsigned block_col_i = 0;
4555  block_col_i < matrix_ncol; block_col_i++)
4556  {
4557  res_nnz+=matrix_pt(block_row_i,block_col_i)->nnz();
4558  }
4559  }
4560 
4561  // Declare the vectors required to build a CRDoubleMatrix
4562  Vector<double> res_values;
4563  Vector<int> res_column_indices;
4564  Vector<int> res_row_start;
4565 
4566  // Reserve space for the vectors.
4567  res_values.reserve(res_nnz);
4568  res_column_indices.reserve(res_nnz);
4569  res_row_start.reserve(result_matrix.nrow()+1);
4570 
4571  // Now we fill in the data.
4572 
4573  // Running sum of nnz per row.
4574  int nnz_running_sum = 0;
4575 
4576  // Loop through the block rows.
4577  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4578  {
4579  // Get the number of rows in this block row, from the first block.
4580  unsigned long block_row_nrow = matrix_pt(block_row_i,0)->nrow();
4581 
4582  // Loop through the number of rows in this block row
4583  for (unsigned row_i = 0; row_i < block_row_nrow; row_i++)
4584  {
4585  // The row start is the nnz at the start of the row.
4586  res_row_start.push_back(nnz_running_sum);
4587 
4588  // Loop through the block columns
4589  for (unsigned block_col_i = 0;
4590  block_col_i < matrix_ncol; block_col_i++)
4591  {
4592  // Get the current block.
4593  CRDoubleMatrix* current_block_pt
4594  = matrix_pt(block_row_i,block_col_i);
4595 
4596  // Get the values, column_indices and row_start for this block.
4597  double* current_block_values = current_block_pt->value();
4598  int* current_block_column_indices
4599  = current_block_pt->column_index();
4600  int* current_block_row_start = current_block_pt->row_start();
4601 
4602  for (int val_i = current_block_row_start[row_i];
4603  val_i < current_block_row_start[row_i+1]; val_i++)
4604  {
4605  res_values.push_back(current_block_values[val_i]);
4606  res_column_indices.push_back(current_block_column_indices[val_i]
4607  + sum_of_ncol_up_to_block[block_col_i]);
4608  }
4609 
4610  // Update the running sum of nnz per row
4611  nnz_running_sum += current_block_row_start[row_i+1]
4612  - current_block_row_start[row_i];
4613  } // for block cols
4614  } // for rows
4615  } // for block rows
4616 
4617  // Fill in the last row start
4618  res_row_start.push_back(res_nnz);
4619 
4620  // Build the matrix
4621  result_matrix.build(res_ncol,res_values,res_column_indices,res_row_start);
4622  }
4623  // Otherwise we are dealing with a distributed matrix.
4624  else
4625  {
4626 #ifdef OOMPH_HAS_MPI
4627 
4628  // Flag to enable timing. This is for debugging
4629  // and/or testing purposes only.
4630  bool enable_timing = false;
4631 
4632  // Get the number of processors
4633  unsigned nproc = comm_pt->nproc();
4634 
4635  // My rank
4636  unsigned my_rank = comm_pt->my_rank();
4637 
4638  // Storage for the data (per processor) to send.
4639  Vector<Vector<unsigned> > column_indices_to_send(nproc);
4640  Vector<Vector<double> > values_to_send(nproc);
4641 
4642  // The sum of the nrow for the sub blocks (so far). This is used as an
4643  // offset to calculate the global equation number in the result matrix.
4644  unsigned long sum_of_block_nrow = 0;
4645 
4646  double t_prep_data_start;
4647  if(enable_timing)
4648  {
4649  t_prep_data_start = TimingHelpers::timer();
4650  }
4651 
4652  // Get the pointer to the result distribution, for convenience...
4653  LinearAlgebraDistribution* res_distribution_pt
4654  = result_matrix.distribution_pt();
4655 
4656  // loop over the sub blocks to calculate the global_eqn, get the values
4657  // and column indices.
4658  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
4659  {
4660  // Get the number of local rows in this block_row from the first block.
4661  unsigned current_block_nrow_local = matrix_pt(block_row_i,0)
4662  ->nrow_local();
4663 
4664  // Get the first_row for this block_row
4665  unsigned current_block_row_first_row = matrix_pt(block_row_i,0)
4666  ->first_row();
4667 
4668  // Loop through the number of local rows
4669  for(unsigned sub_local_eqn = 0;
4670  sub_local_eqn < current_block_nrow_local; sub_local_eqn++)
4671  {
4672  // Calculate the corresponding (res_global_eqn) equation number
4673  // for this local row number in this block.
4674  unsigned long res_global_eqn = sub_local_eqn
4675  + current_block_row_first_row + sum_of_block_nrow;
4676 
4677  // Get the processor that this global row belongs to.
4678  // The rank_of_global_row(...) function loops through all the
4679  // processors and does two unsigned comparisons. Since we have to do
4680  // this for every row, it may be better to store a lookup table when a
4681  // very large number of processors is used.
4682  unsigned res_p = res_distribution_pt
4683  ->rank_of_global_row(res_global_eqn);
4684 
4685  // With the res_p, we get the res_first_row to
4686  // work out the res_local_eqn
4687  unsigned res_first_row = res_distribution_pt->first_row(res_p);
4688  unsigned res_local_eqn = res_global_eqn - res_first_row;
4689 
4690  // Loop through the block columns, calculate the nnz. This is used to
4691  // reserve space for the value and column_indices Vectors.
4692  unsigned long current_row_nnz = 0;
4693  for (unsigned block_col_i = 0;
4694  block_col_i < matrix_ncol; block_col_i++)
4695  {
4696  // Get the row_start
4697  int* current_block_row_start = matrix_pt(block_row_i,block_col_i)
4698  ->row_start();
4699 
4700  // Update the nnz for this row.
4701  current_row_nnz += current_block_row_start[sub_local_eqn+1]
4702  - current_block_row_start[sub_local_eqn];
4703  } // for block column, get nnz.
4704 
4705  // Reserve space for efficiency.
4706  //unsigned capacity_in_res_p_vec
4707  // = column_indices_to_send[res_p].capacity();
4708 
4709  // Reserve memory for nnz+2, since we need to store the res_local_eqn
4710  // and nnz as well as the data (values/column indices).
4711  // Note: The two reserve functions are called per row.
4712  // If the matrix is very sparse (just a few elements per row), it
4713  // will be more efficient to not reserve and let the STL vector
4714  // handle this. On average, this implementation is more efficient.
4715  //column_indices_to_send[res_p].reserve(capacity_in_res_p_vec
4716  // + current_row_nnz+2);
4717  //values_to_send[res_p].reserve(capacity_in_res_p_vec
4718  // + current_row_nnz+2);
4719 
4720  // Push back the res_local_eqn and nnz
4721  column_indices_to_send[res_p].push_back(res_local_eqn);
4722  column_indices_to_send[res_p].push_back(current_row_nnz);
4723  values_to_send[res_p].push_back(res_local_eqn);
4724  values_to_send[res_p].push_back(current_row_nnz);
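  // The data for each row destined for processor res_p is thus laid out
  // as [res_local_eqn, nnz, entry_1, ..., entry_nnz] in both the column
  // index and the value vectors; this "header" is unpacked again on the
  // receiving side further below.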
4725 
4726  // Loop through the block columns again and get the values
4727  // and column_indices
4728  for (unsigned block_col_i = 0;
4729  block_col_i < matrix_ncol; block_col_i++)
4730  {
4731  // Cache the pointer to the current block for convenience.
4732  CRDoubleMatrix* current_block_pt
4733  = matrix_pt(block_row_i,block_col_i);
4734 
4735  // Values, column indices and row_start for the current block.
4736  double* current_block_values = current_block_pt->value();
4737  int* current_block_column_indices
4738  = current_block_pt->column_index();
4739  int* current_block_row_start = current_block_pt->row_start();
4740 
4741  // Loop through the values and column_indices
4742  for (int val_i = current_block_row_start[sub_local_eqn];
4743  val_i < current_block_row_start[sub_local_eqn+1]; val_i++)
4744  {
4745  // Push back the value.
4746  values_to_send[res_p].push_back(current_block_values[val_i]);
4747 
4748  // Push back the (offset) column index.
4749  column_indices_to_send[res_p].push_back(
4750  current_block_column_indices[val_i]
4751  + sum_of_ncol_up_to_block[block_col_i]);
4752  } // for block columns
4753  } // for block column, get values and column_indices.
4754  } // for sub_local_eqn
4755 
4756  // update the sum_of_block_nrow
4757  sum_of_block_nrow += matrix_pt(block_row_i,0)->nrow();
4758 
4759  } // for block_row
4760 
4761  if(enable_timing)
4762  {
4763  double t_prep_data_finish = TimingHelpers::timer();
4764  double t_prep_data_time = t_prep_data_finish - t_prep_data_start;
4765  oomph_info << "Time for prep data: " << t_prep_data_time << std::endl;
4766  }
4767 
4768  // Prepare to send data!
4769 
4770  // Storage for the number of data to be sent to each processor.
4771  Vector<int> send_n(nproc,0);
4772 
4773  // Storage for all the values/column indices to be sent
4774  // to each processor.
4775  Vector<double> send_values_data;
4776  Vector<unsigned> send_column_indices_data;
4777 
4778  // Storage location within send_values_data
4779  // (and send_column_indices_data) for data to be sent to each processor.
4780  Vector<int> send_displacement(nproc,0);
4781 
4782  double t_total_ndata_start;
4783  if(enable_timing)
4784  t_total_ndata_start = TimingHelpers::timer();
4785 
4786  // Get the total amount of data which needs to be sent, so we can reserve
4787  // space for it.
4788  unsigned total_ndata = 0;
4789  for (unsigned rank = 0; rank < nproc; rank++)
4790  {
4791  if(rank != my_rank)
4792  {
4793  total_ndata += values_to_send[rank].size();
4794  }
4795  }
4796 
4797  if(enable_timing)
4798  {
4799  double t_total_ndata_finish = TimingHelpers::timer();
4800  double t_total_ndata_time= t_total_ndata_finish - t_total_ndata_start;
4801  oomph_info << "Time for total_ndata: " << t_total_ndata_time << std::endl;
4802  }
4803 
4804  double t_flat_pack_start;
4805  if(enable_timing)
4806  t_flat_pack_start = TimingHelpers::timer();
4807 
4808  // Now we don't have to re-allocate data/memory when push_back is called.
4809  // Nb. Using push_back without reserving memory may cause multiple
4810  // re-allocations behind the scenes, which is expensive.
4811  send_values_data.reserve(total_ndata);
4812  send_column_indices_data.reserve(total_ndata);
4813 
4814  // Loop over all the processors to "flat pack" the data for sending.
4815  for (unsigned rank = 0; rank < nproc; rank++)
4816  {
4817  // Set the offset for the current processor
4818  // This only has to be done once for both values and column indices.
4819  send_displacement[rank] = send_values_data.size();
4820 
4821  // Don't bother to do anything if
4822  // the processor in the loop is the current processor.
4823  if(rank != my_rank)
4824  {
4825  // Put the values into the send data vector.
4826  // n_data is the same for both values and column indices.
4827  unsigned n_data = values_to_send[rank].size();
4828  for (unsigned j = 0; j < n_data; j++)
4829  {
4830  send_values_data.push_back(values_to_send[rank][j]);
4831  send_column_indices_data.push_back(
4832  column_indices_to_send[rank][j]);
4833  } // for
4834  } // if rank != my_rank
4835 
4836  // Find the number of data to be added to the vector.
4837  // send_n is the same for both values and column indices.
4838  send_n[rank] = send_values_data.size() - send_displacement[rank];
4839  } // loop over processors
4840 
4841  if(enable_timing)
4842  {
4843  double t_flat_pack_finish = TimingHelpers::timer();
4844  double t_flat_pack_time = t_flat_pack_finish - t_flat_pack_start;
4845  oomph_info << "t_flat_pack_time: " << t_flat_pack_time << std::endl;
4846  }
4847 
4848  double t_sendn_start;
4849  if(enable_timing)
4850  t_sendn_start = TimingHelpers::timer();
4851 
4852  // Storage for the number of data to be received from each processor
4853  Vector<int> receive_n(nproc,0);
4854 
4855  MPI_Alltoall(&send_n[0],1,MPI_INT,&receive_n[0],1,MPI_INT,
4856  comm_pt->mpi_comm());
4857 
4858  if(enable_timing)
4859  {
4860  double t_sendn_finish = TimingHelpers::timer();
4861  double t_sendn_time = t_sendn_finish - t_sendn_start;
4862  oomph_info << "t_sendn_time: " << t_sendn_time << std::endl;
4863  }
4864 
4865 
4866  // Prepare the data to be received
4867  // by working out the displacement from the received data
4868  // receive_displacement is the same for both values and column indices.
4869  Vector<int> receive_displacement(nproc,0);
4870  int receive_data_count = 0;
4871  for (unsigned rank = 0; rank < nproc; rank++)
4872  {
4873  receive_displacement[rank] = receive_data_count;
4874  receive_data_count += receive_n[rank];
4875  }
4876 
4877  // Now resize the receive buffer for all data from all processors.
4878  // Make sure that it has a size of at least one.
4879  if(receive_data_count == 0){receive_data_count++;}
4880  Vector<double> receive_values_data(receive_data_count);
4881  Vector<unsigned> receive_column_indices_data(receive_data_count);
4882 
4883  // Make sure that the send buffer has size at least one
4884  // so that we don't get a segmentation fault.
4885  if(send_values_data.size()==0){send_values_data.resize(1);}
4886 
4887  double t_send_data_start;
4888  if(enable_timing)
4889  t_send_data_start = TimingHelpers::timer();
4890 
4891  // Now send the data between all processors
4892  MPI_Alltoallv(&send_values_data[0],&send_n[0],&send_displacement[0],
4893  MPI_DOUBLE,
4894  &receive_values_data[0],&receive_n[0],
4895  &receive_displacement[0],
4896  MPI_DOUBLE,
4897  comm_pt->mpi_comm());
4898 
4899  // Now send the data between all processors
4900  MPI_Alltoallv(&send_column_indices_data[0],&send_n[0],
4901  &send_displacement[0],
4902  MPI_UNSIGNED,
4903  &receive_column_indices_data[0],&receive_n[0],
4904  &receive_displacement[0],
4905  MPI_UNSIGNED,
4906  comm_pt->mpi_comm());
4907 
4908  if(enable_timing)
4909  {
4910  double t_send_data_finish = TimingHelpers::timer();
4911  double t_send_data_time = t_send_data_finish - t_send_data_start;
4912  oomph_info << "t_send_data_time: " << t_send_data_time << std::endl;
4913  }
4914 
4915  // All the rows for this processor are stored in:
4916  // from other processors:
4917  // receive_column_indices_data and receive_values_data
4918  // from this processor:
4919  // column_indices_to_send[my_rank] and values_to_send[my_rank]
4920  //
4921  // They are in some order determined by the distribution.
4922  // We need to re-arrange them. To do this, we do some pre-processing.
4923 
4924  // nrow_local for this processor.
4925  unsigned res_nrow_local = res_distribution_pt->nrow_local();
4926 
4927  // Per row, store:
4928  // 1) where this row came from, 0 - this proc, 1 - other procs.
4929  // 2) the nnz,
4930  // 3) the offset - where the values/columns in the receive data vectors
4931  // begins. This is different from the offset of where
4932  // the data from a certain processor starts.
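  // e.g. value_column_locations[5] = {1, 3, 42} would mean that local row 5
  // arrived from another processor, has 3 non-zeros, and that its column
  // indices/values start at index 42 of the corresponding data vectors.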
4933  Vector<Vector<unsigned> > value_column_locations(res_nrow_local,
4934  Vector<unsigned>(3,0));
4935 
4936  // Store the local nnz so we can reserve space for
4937  // the values and column indices.
4938  unsigned long res_nnz_local = 0;
4939 
4940  double t_locations_start;
4941  if(enable_timing)
4942  t_locations_start = TimingHelpers::timer();
4943 
4944  // Loop through the data currently on this processor.
4945  unsigned location_i = 0;
4946  unsigned my_column_indices_to_send_size
4947  = column_indices_to_send[my_rank].size();
4948  while(location_i < my_column_indices_to_send_size)
4949  {
4950  unsigned current_local_eqn
4951  = column_indices_to_send[my_rank][location_i++];
4952 
4953  unsigned current_nnz
4954  = column_indices_to_send[my_rank][location_i++];
4955 
4956  // No need to fill [*][0] with 0 since it is already initialised to 0.
4957 
4958  // Store the nnz.
4959  value_column_locations[current_local_eqn][1] = current_nnz;
4960 
4961  // Also increment the res_local_nnz
4962  res_nnz_local += current_nnz;
4963 
4964  // Store the offset.
4965  value_column_locations[current_local_eqn][2] = location_i;
4966 
4967  // Update the location_i so it starts at the next row.
4968  location_i += current_nnz;
4969  }
4970 
4971  // Loop through the data from different processors.
4972 
4973  // Check to see if data has been received.
4974  bool data_has_been_received = false;
4975  unsigned send_rank = 0;
4976  while(send_rank < nproc)
4977  {
4978  if(receive_n[send_rank] > 0)
4979  {
4980  data_has_been_received = true;
4981  break;
4982  }
4983  send_rank++;
4984  }
4985 
4986  location_i = 0; // start at 0.
4987  if(data_has_been_received)
4988  {
4989  unsigned receive_column_indices_data_size
4990  = receive_column_indices_data.size();
4991  while(location_i < receive_column_indices_data_size)
4992  {
4993  unsigned current_local_eqn
4994  = receive_column_indices_data[location_i++];
4995  unsigned current_nnz = receive_column_indices_data[location_i++];
4996 
4997  // This row comes from another processor.
4998  value_column_locations[current_local_eqn][0] = 1;
4999 
5000  // Store the nnz.
5001  value_column_locations[current_local_eqn][1] = current_nnz;
5002 
5003  // Also increment the res_local_nnz
5004  res_nnz_local += current_nnz;
5005 
5006  // Store the offset.
5007  value_column_locations[current_local_eqn][2] = location_i;
5008 
5009  // Update the location_i so it starts at the next row.
5010  location_i += current_nnz;
5011  }
5012  }
5013 
5014  if(enable_timing)
5015  {
5016  double t_locations_finish = TimingHelpers::timer();
5017  double t_locations_time = t_locations_finish - t_locations_start;
5018  oomph_info << "t_locations_time: " << t_locations_time << std::endl;
5019  }
5020 
5021  double t_fillvecs_start;
5022  if(enable_timing)
5023  t_fillvecs_start = TimingHelpers::timer();
5024 
5025  // Now loop through the locations and store the values and
5026  // the column indices in the correct order.
5027  Vector<int> res_column_indices;
5028  Vector<double> res_values;
5029  Vector<int> res_row_start;
5030 
5031  res_column_indices.reserve(res_nnz_local);
5032  res_values.reserve(res_nnz_local);
5033  res_row_start.reserve(res_nrow_local+1);
5034 
5035  // Running sum of nnz for the row_start. Must be int because
5036  // res_row_start is templated with int.
5037  int nnz_running_sum = 0;
5038 
5039  // Now insert the rows.
5040  for (unsigned local_row_i = 0;
5041  local_row_i < res_nrow_local; local_row_i++)
5042  {
5043  // Fill the res_row_start with the nnz so far.
5044  res_row_start.push_back(nnz_running_sum);
5045 
5046  bool data_is_from_other_proc
5047  = bool(value_column_locations[local_row_i][0]);
5048 
5049  unsigned row_i_nnz = value_column_locations[local_row_i][1];
5050 
5051  unsigned row_i_offset = value_column_locations[local_row_i][2];
5052 
5053  if(data_is_from_other_proc)
5054  {
5055  // Insert range [offset, offset+nnz) from
5056  // receive_column_indices_data and receive_values_data into
5057  // res_column_indices and res_values respectively.
5058  res_column_indices.insert(res_column_indices.end(),
5059  receive_column_indices_data.begin()
5060  +row_i_offset,
5061  receive_column_indices_data.begin()
5062  +row_i_offset+row_i_nnz);
5063 
5064  res_values.insert(res_values.end(),
5065  receive_values_data.begin()+row_i_offset,
5066  receive_values_data.begin()+row_i_offset+row_i_nnz);
5067  }
5068  else
5069  {
5070  res_column_indices.insert(res_column_indices.end(),
5071  column_indices_to_send[my_rank].begin()
5072  +row_i_offset,
5073  column_indices_to_send[my_rank].begin()
5074  +row_i_offset+row_i_nnz);
5075 
5076  res_values.insert(res_values.end(),
5077  values_to_send[my_rank].begin()+row_i_offset,
5078  values_to_send[my_rank].begin()+row_i_offset+row_i_nnz);
5079  }
5080 
5081  // Update the running sum of nnz
5082  nnz_running_sum += row_i_nnz;
5083  }
5084 
5085  // Insert the last row_start value
5086  res_row_start.push_back(res_nnz_local);
5087 
5088  if(enable_timing)
5089  {
5090  double t_fillvecs_finish = TimingHelpers::timer();
5091  double t_fillvecs_time = t_fillvecs_finish - t_fillvecs_start;
5092  oomph_info << "t_fillvecs_time: " << t_fillvecs_time << std::endl;
5093  }
5094 
5095  double t_buildres_start;
5096  if(enable_timing)
5097  t_buildres_start = TimingHelpers::timer();
5098 
5099  // build the matrix.
5100  result_matrix.build(res_ncol,res_values,res_column_indices,res_row_start);
5101 
5102  if(enable_timing)
5103  {
5104  double t_buildres_finish = TimingHelpers::timer();
5105  double t_buildres_time = t_buildres_finish - t_buildres_start;
5106  oomph_info << "t_buildres_time: " << t_buildres_time << std::endl;
5107  }
5108  // */
5109 #endif
5110  }
5111  }
5112 
5113  //============================================================================
5114  /// \short Concatenate CRDoubleMatrix matrices.
5115  ///
5116  /// The Vector row_distribution_pt contains the LinearAlgebraDistribution
5117  /// of each block row.
5118  /// The Vector col_distribution_pt contains the LinearAlgebraDistribution
5119  /// of each block column.
5120  /// The DenseMatrix matrix_pt contains pointers to the CRDoubleMatrices
5121  /// to concatenate.
5122  /// The CRDoubleMatrix result_matrix is the result matrix.
5123  ///
5124  /// The result matrix is a permutation of the sub matrices such that the data
5125  /// stays on the same processor when the result matrix is built; there is no
5126  /// communication between processors.
5127  /// Thus the block structure of the sub matrices is NOT preserved in the
5128  /// result matrix. The rows are block-permuted, defined by the concatenation
5129  /// of the distributions in row_distribution_pt. Similarly, the columns are
5130  /// block-permuted, defined by the concatenation of the distributions in
5131  /// col_distribution_pt.
5132  /// For more details on the block-permutation, see
5133  /// LinearAlgebraDistributionHelpers::concatenate(...).
5134  ///
5135  /// If one wishes to preserve the block structure of the sub matrices in the
5136  /// result matrix, consider using CRDoubleMatrixHelpers::concatenate(...),
5137  /// which uses communication between processors to ensure that the block
5138  /// structure of the sub matrices is preserved.
5139  ///
5140  /// The matrix manipulation functions
5141  /// CRDoubleMatrixHelpers::concatenate(...) and
5142  /// CRDoubleMatrixHelpers::concatenate_without_communication(...)
5143  /// are analogous to the Vector manipulation functions
5144  /// DoubleVectorHelpers::concatenate(...) and
5145  /// DoubleVectorHelpers::concatenate_without_communication(...).
5146  /// Please look at the DoubleVector functions for an illustration of the
5147  /// differences between concatenate(...) and
5148  /// concatenate_without_communication(...).
5149  ///
5150  /// Distribution of the result matrix:
5151  /// If the result matrix does not have a distribution built, then it will be
5152  /// given a distribution built from the concatenation of the distributions
5153  /// from row_distribution_pt, see
5154  /// LinearAlgebraDistributionHelpers::concatenate(...) for more detail.
5155  /// Otherwise we use the existing distribution.
5156  /// If there is an existing distribution then it must be the same as the
5157  /// distribution from the concatenation of row distributions as described
5158  /// above.
5159  /// Why don't we always compute the distribution "on the fly"?
5160  /// Because a non-uniform distribution requires communication.
5161  /// All block preconditioner distributions are concatenations of the
5162  /// distributions of the individual blocks.
5163  //============================================================================
5164  void concatenate_without_communication(
5165  const Vector<LinearAlgebraDistribution*> &row_distribution_pt,
5166  const Vector<LinearAlgebraDistribution*> &col_distribution_pt,
5167  const DenseMatrix<CRDoubleMatrix*> &matrix_pt,
5168  CRDoubleMatrix &result_matrix)
5169  {
5170  // The number of block rows and block columns.
5171  unsigned matrix_nrow = matrix_pt.nrow();
5172  unsigned matrix_ncol = matrix_pt.ncol();
5173 
5174  // PARANOID checks involving the input matrices and distributions only.
5175  // PARANOID checks involving the result matrix will come later since
5176  // we have to create the result matrix distribution from the input distributions
5177  // if it does not already exist.
5178 #ifdef PARANOID
5179 
5180  // Are there matrices to concatenate?
5181  if(matrix_nrow == 0 || matrix_ncol == 0)
5182  {
5183  std::ostringstream error_message;
5184  error_message << "There are no matrices to concatenate.\n";
5185  throw OomphLibError(error_message.str(),
5186  OOMPH_CURRENT_FUNCTION,
5187  OOMPH_EXCEPTION_LOCATION);
5188  }
5189 
5190  // Does this matrix need concatenating?
5191  if((matrix_nrow == 1) && (matrix_ncol == 1))
5192  {
5193  std::ostringstream warning_message;
5194  warning_message << "There is only one matrix to concatenate...\n"
5195  << "This does not require concatenating...\n";
5196  OomphLibWarning(warning_message.str(),
5197  OOMPH_CURRENT_FUNCTION,
5198  OOMPH_EXCEPTION_LOCATION);
5199  }
5200 
5201 
5202 
5203  // The distribution for each block row is stored in row_distribution_pt.
5204  // So the number of distributions in row_distribution_pt must be the
5205  // same as matrix_nrow.
5206  if(matrix_nrow != row_distribution_pt.size())
5207  {
5208  std::ostringstream error_message;
5209  error_message << "The number of row distributions must be the same as\n"
5210  << "the number of block rows.";
5211  throw OomphLibError(error_message.str(),
5212  OOMPH_CURRENT_FUNCTION,
5213  OOMPH_EXCEPTION_LOCATION);
5214  }
5215 
5216  // The number of distributions for the columns must match the number of
5217  // block columns.
5218  if(matrix_ncol != col_distribution_pt.size())
5219  {
5220  std::ostringstream error_message;
5221  error_message << "The number of column distributions must be the same as\n"
5222  << "the number of block columns.";
5223  throw OomphLibError(error_message.str(),
5224  OOMPH_CURRENT_FUNCTION,
5225  OOMPH_EXCEPTION_LOCATION);
5226  }
5227 
5228  // Check that all pointers in row_distribution_pt are not null.
5229  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5230  {
5231  if(row_distribution_pt[block_row_i] == 0)
5232  {
5233  std::ostringstream error_message;
5234  error_message << "The row distribution pointer in position "
5235  << block_row_i <<" is null.\n";
5236  throw OomphLibError(error_message.str(),
5237  OOMPH_CURRENT_FUNCTION,
5238  OOMPH_EXCEPTION_LOCATION);
5239  }
5240  }
5241 
5242  // Check that all pointers in col_distribution_pt are not null.
5243  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5244  {
5245  if(col_distribution_pt[block_col_i] == 0)
5246  {
5247  std::ostringstream error_message;
5248  error_message << "The column distribution pointer in position "
5249  << block_col_i <<" is null.\n";
5250  throw OomphLibError(error_message.str(),
5251  OOMPH_CURRENT_FUNCTION,
5252  OOMPH_EXCEPTION_LOCATION);
5253  }
5254  }
5255 
5256  // Check that all distributions are built.
5257  // First the row distributions
5258  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5259  {
5260  if(!row_distribution_pt[block_row_i]->built())
5261  {
5262  std::ostringstream error_message;
5263  error_message << "The row distribution pointer in position "
5264  << block_row_i <<" is not built.\n";
5265  throw OomphLibError(error_message.str(),
5266  OOMPH_CURRENT_FUNCTION,
5267  OOMPH_EXCEPTION_LOCATION);
5268  }
5269  }
5270  // Now the column distributions
5271  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5272  {
5273  if(!col_distribution_pt[block_col_i]->built())
5274  {
5275  std::ostringstream error_message;
5276  error_message << "The column distribution pointer in position "
5277  << block_col_i <<" is not built.\n";
5278  throw OomphLibError(error_message.str(),
5279  OOMPH_CURRENT_FUNCTION,
5280  OOMPH_EXCEPTION_LOCATION);
5281  }
5282  }
5283 
5284  // Check that all communicators in row_distribution_pt are the same.
5285  const OomphCommunicator first_row_comm
5286  = *(row_distribution_pt[0]->communicator_pt());
5287 
5288  for (unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
5289  {
5290  const OomphCommunicator current_comm
5291  = *(row_distribution_pt[block_row_i]->communicator_pt());
5292 
5293  if(first_row_comm != current_comm)
5294  {
5295  std::ostringstream error_message;
5296  error_message <<"The communicator from the row distribution in position "
5297  << block_row_i << " is not the same as the first "
5298  << "communicator from row_distribution_pt";
5299  throw OomphLibError(error_message.str(),
5300  OOMPH_CURRENT_FUNCTION,
5301  OOMPH_EXCEPTION_LOCATION);
5302  }
5303  }
5304 
5305  // Check that all communicators in col_distribution_pt are the same as the
5306  // first row communicator from above.
5307  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5308  {
5309  const OomphCommunicator current_comm
5310  = *(col_distribution_pt[block_col_i]->communicator_pt());
5311 
5312  if(first_row_comm != current_comm)
5313  {
5314  std::ostringstream error_message;
5315  error_message <<"The communicator from the col distribution in position "
5316  << block_col_i << " is not the same as the first "
5317  << "communicator from row_distribution_pt";
5318  throw OomphLibError(error_message.str(),
5319  OOMPH_CURRENT_FUNCTION,
5320  OOMPH_EXCEPTION_LOCATION);
5321  }
5322  }
5323 
5324  // Are all sub matrices built? If the matrix_pt is not null, make sure
5325  // that it is built.
5326  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5327  {
5328  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5329  {
5330  if (matrix_pt(block_row_i,block_col_i) != 0 &&
5331  !(matrix_pt(block_row_i,block_col_i)->built()))
5332  {
5333  std::ostringstream error_message;
5334  error_message
5335  << "The sub matrix_pt(" << block_row_i << "," << block_col_i << ")\n"
5336  << "is not built.\n";
5337  throw OomphLibError(error_message.str(),
5338  OOMPH_CURRENT_FUNCTION,
5339  OOMPH_EXCEPTION_LOCATION);
5340  }
5341  }
5342  }
5343 
5344  // For the matrices which are built, do they have the same communicator as
5345  // the first communicator from row_distribution_pt?
5346  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5347  {
5348  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5349  {
5350  if(matrix_pt(block_row_i,block_col_i) != 0)
5351  {
5352  const OomphCommunicator current_comm
5353  = *(matrix_pt(block_row_i,block_col_i)
5354  ->distribution_pt()
5355  ->communicator_pt());
5356  if(first_row_comm != current_comm)
5357  {
5358  std::ostringstream error_message;
5359  error_message
5360  << "The sub matrix_pt(" << block_row_i << ","<<block_col_i<< ")\n"
5361  << "does not have the same communicator pointer as those in\n"
5362  << "(row|col)_distribution_pt.\n";
5363  throw OomphLibError(error_message.str(),
5364  OOMPH_CURRENT_FUNCTION,
5365  OOMPH_EXCEPTION_LOCATION);
5366  }
5367  }
5368  }
5369  }
5370 
5371  // Do all dimensions of sub matrices "make sense"?
5372  // Compare the number of rows of each block matrix in a block row.
5373  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5374  {
5375  // Get the required number of rows for this block row from its distribution.
5376  unsigned long current_block_nrow = row_distribution_pt[block_row_i]
5377  ->nrow();
5378 
5379  // Compare against columns 0 to matrix_ncol - 1
5380  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5381  {
5382  // Perform the check if the matrix_pt is not null.
5383  if(matrix_pt(block_row_i,block_col_i) != 0)
5384  {
5385  // Get the nrow for this sub block.
5386  unsigned long subblock_nrow
5387  = matrix_pt(block_row_i,block_col_i)->nrow();
5388 
5389  if(current_block_nrow != subblock_nrow)
5390  {
5391  std::ostringstream error_message;
5392  error_message
5393  << "The sub matrix (" << block_row_i << "," << block_col_i << ")\n"
5394  << "requires nrow = " << current_block_nrow
5395  << ", but has nrow = " << subblock_nrow <<".\n"
5396  << "Either the row_distribution_pt is incorrect or "
5397  << "the sub matrices are incorrect.\n";
5398  throw OomphLibError(error_message.str(),
5399  OOMPH_CURRENT_FUNCTION,
5400  OOMPH_EXCEPTION_LOCATION);
5401  }
5402  }
5403  }
5404  }
5405 
5406  // Compare the number of columns of each block matrix in a block column.
5407  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5408  {
5409  // Get the current block ncol from the linear algebra distribution.
5410  // Note: the column distribution's nrow() gives the ncol of this block column.
5411  unsigned current_block_ncol = col_distribution_pt[block_col_i]
5412  ->nrow();
5413 
5414  for(unsigned block_row_i = 0;
5415  block_row_i < matrix_nrow; block_row_i++)
5416  {
5417  if(matrix_pt(block_row_i,block_col_i) != 0)
5418  {
5419  // Get the ncol for this sub block.
5420  unsigned subblock_ncol
5421  = matrix_pt(block_row_i,block_col_i)->ncol();
5422 
5423  if(current_block_ncol != subblock_ncol)
5424  {
5425  std::ostringstream error_message;
5426  error_message
5427  << "The sub matrix (" << block_row_i << "," << block_col_i << ")\n"
5428  << "requires ncol = " << current_block_ncol
5429  << ", but has ncol = " << subblock_ncol << ".\n"
5430  << "Either the col_distribution_pt is incorrect or "
5431  << "the sub matrices are incorrect.\n";
5432  throw OomphLibError(error_message.str(),
5433  OOMPH_CURRENT_FUNCTION,
5434  OOMPH_EXCEPTION_LOCATION);
5435  }
5436  }
5437  }
5438  }
5439 
5440  // Ensure that the distributions for all sub matrices in the same block row
5441  // are the same. This is because we permute the rows across several matrices.
5442 
5443  // Loop through each block row.
5444  for (unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5445  {
5446  // Get the prescribed distribution for this block row.
5447  LinearAlgebraDistribution* block_row_distribution_pt
5448  = row_distribution_pt[block_row_i];
5449 
5450  // Loop through the block columns
5451  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5452  {
5453  if(matrix_pt(block_row_i,block_col_i) != 0)
5454  {
5455  // Get the distribution for this block.
5456  LinearAlgebraDistribution* current_block_distribution_pt
5457  = matrix_pt(block_row_i,block_col_i)->distribution_pt();
5458 
5459  // Ensure that this block has the prescribed row distribution.
5460  if((*block_row_distribution_pt) != (*current_block_distribution_pt))
5461  {
5462  std::ostringstream error_message;
5463  error_message << "Sub block("<<block_row_i<<","<<block_col_i<<") "
5464  << "does not have the same distribution as the "
5465  << "corresponding row distribution.\n"
5466  << "All matrices in a block row must have the same "
5467  << "distribution for this function to concatenate matrices.\n";
5468  throw OomphLibError(error_message.str(),
5469  OOMPH_CURRENT_FUNCTION,
5470  OOMPH_EXCEPTION_LOCATION);
5471  }
5472  }
5473  }
5474  }
5475 #endif
5476 
5477  // The communicator pointer from the first row_distribution_pt
5478  const OomphCommunicator* const comm_pt
5479  = row_distribution_pt[0]->communicator_pt();
5480 
5481  // Renamed so it makes more sense.
5482  unsigned nblock_row = matrix_nrow;
5483 
5484  // If the result matrix does not have a distribution, then we concatenate
5485  // the sub matrix distributions.
5486  if(!result_matrix.distribution_pt()->built())
5487  {
5488  // The result distribution
5489  LinearAlgebraDistribution tmp_distribution;
5490  LinearAlgebraDistributionHelpers::concatenate(row_distribution_pt,
5491  tmp_distribution);
5492 
5493  result_matrix.build(&tmp_distribution);
5494  }
5495  else
5496  // A distribution is supplied for the result matrix.
5497  {
5498 #ifdef PARANOID
5499  // Check that the result distribution is a concatenation of the
5500  // distributions of the sub matrices.
5501 
5502  LinearAlgebraDistribution wanted_distribution;
5503 
5504  LinearAlgebraDistributionHelpers::concatenate(row_distribution_pt,
5505  wanted_distribution);
5506 
5507  if(*(result_matrix.distribution_pt()) != wanted_distribution)
5508  {
5509  std::ostringstream error_message;
5510  error_message << "The result distribution is not correct.\n"
5511  << "Please call the function without a result\n"
5512  << "distribution (clear the result matrix) or check the\n"
5513  << "distribution of the result matrix.\n"
5514  << "The result distribution must be the same as the one \n"
5515  << "created by\n"
5516  << "LinearAlgebraDistributionHelpers::concatenate(...)";
5517  throw OomphLibError(error_message.str(),
5518  OOMPH_CURRENT_FUNCTION,
5519  OOMPH_EXCEPTION_LOCATION);
5520  }
5521 #endif
5522  }
5523 
5524  // The rest of the paranoid checks.
5525 #ifdef PARANOID
5526 
5527  // Make sure that the communicator from the result matrix is the same as
5528  // all the others. This test is redundant if this function created the
5529  // result matrix distribution, since then it is guaranteed that the
5530  // communicators are the same.
5531  {
5532  // Communicator from the result matrix.
5533  const OomphCommunicator res_comm
5534  = *(result_matrix.distribution_pt()->communicator_pt());
5535 
5536  // Is the result communicator pointer the same as the others?
5537  // Since we have already tested the others, we only need to compare against
5538  // one of them. Say the first communicator from row_distribution_pt.
5539  const OomphCommunicator first_comm
5540  = *(row_distribution_pt[0]->communicator_pt());
5541 
5542  if(res_comm != first_comm)
5543  {
5544  std::ostringstream error_message;
5545  error_message
5546  << "The OomphCommunicator of the result matrix is not the same as the "
5547  << "others!";
5548  throw OomphLibError(error_message.str(),
5549  OOMPH_CURRENT_FUNCTION,
5550  OOMPH_EXCEPTION_LOCATION);
5551  }
5552  }
5553 
5554  // Are all the distributed booleans the same? This only applies if we have
5555  // more than one processor. If there is only one processor, then it does
5556  // not matter if it is distributed or not - they are conceptually the same.
5557  if(comm_pt->nproc() != 1)
5558  {
5559  // Compare distributed for sub matrices (against the result matrix).
5560  const bool res_distributed = result_matrix.distributed();
5561 
5562  // Loop over all sub blocks.
5563  for(unsigned block_row_i = 0; block_row_i < matrix_nrow; block_row_i++)
5564  {
5565  for(unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5566  {
5567  if(matrix_pt(block_row_i,block_col_i) != 0)
5568  {
5569  const bool another_distributed
5570  = matrix_pt(block_row_i,block_col_i)->distributed();
5571 
5572  if(res_distributed != another_distributed)
5573  {
5574  std::ostringstream error_message;
5575  error_message
5576  << "The distributed boolean of the sub matrix ("
5577  << block_row_i << "," << block_col_i << ")\n"
5578  << "is not the same as the result matrix. \n";
5579  throw OomphLibError(error_message.str(),
5580  OOMPH_CURRENT_FUNCTION,
5581  OOMPH_EXCEPTION_LOCATION);
5582  }
5583  }
5584  }
5585  }
5586 
5587  // Do this test for row_distribution_pt
5588  const bool first_row_distribution_distributed
5589  = row_distribution_pt[0]->distributed();
5590 
5591  for (unsigned block_row_i = 1; block_row_i < matrix_nrow; block_row_i++)
5592  {
5593  const bool another_distributed
5594  = row_distribution_pt[block_row_i]->distributed();
5595 
5596  if(first_row_distribution_distributed != another_distributed)
5597  {
5598  std::ostringstream error_message;
5599  error_message
5600  << "The distributed boolean of row_distribution_pt["
5601  << block_row_i << "]\n"
5602  << "is not the same as the one from row_distribution_pt[0]. \n";
5603  throw OomphLibError(error_message.str(),
5604  OOMPH_CURRENT_FUNCTION,
5605  OOMPH_EXCEPTION_LOCATION);
5606  }
5607  }
5608 
5609  // Repeat for col_distribution_pt
5610  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5611  {
5612  const bool another_distributed
5613  = col_distribution_pt[block_col_i]->distributed();
5614 
5615  if(first_row_distribution_distributed != another_distributed)
5616  {
5617  std::ostringstream error_message;
5618  error_message
5619  << "The distributed boolean of col_distribution_pt["
5620  << block_col_i << "]\n"
5621  << "is not the same as the one from row_distribution_pt[0]. \n";
5622  throw OomphLibError(error_message.str(),
5623  OOMPH_CURRENT_FUNCTION,
5624  OOMPH_EXCEPTION_LOCATION);
5625  }
5626  }
5627  }
5628 #endif
5629 
5630  ////////////////// END OF PARANOID TESTS ////////////////////////////////////
5631 
5632  // The number of processors.
5633  unsigned nproc = comm_pt->nproc();
5634 
5635  // Cache the result distribution pointer for convenience.
5636  LinearAlgebraDistribution* res_distribution_pt
5637  = result_matrix.distribution_pt();
5638 
5639  // nrow_local for the result matrix
5640  unsigned res_nrow_local = res_distribution_pt->nrow_local();
5641 
5642  // renamed for readability.
5643  unsigned nblock_col = matrix_ncol;
5644 
5645  // construct the block offset
5646 // DenseMatrix<unsigned> col_offset(nproc,nblock_col,0);
5647  std::vector<std::vector<unsigned> > col_offset(
5648  nproc,
5649  std::vector<unsigned>(nblock_col));
5650  unsigned off = 0;
5651  for (unsigned proc_i = 0; proc_i < nproc; proc_i++)
5652  {
5653  for (unsigned block_i = 0; block_i < nblock_col; block_i++)
5654  {
5655  col_offset[proc_i][block_i] = off;
5656  off +=col_distribution_pt[block_i]->nrow_local(proc_i);
5657  }
5658  }
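  // The column ordering of the concatenated matrix is therefore grouped
  // first by processor and then by block column, e.g. with two processors
  // and two block columns the global columns are laid out as
  //   [p0/block0 | p0/block1 | p1/block0 | p1/block1],
  // each segment holding that block's local columns on that processor.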
5659 
5660  // Do some pre-processing to work out which processor a global row number
5661  // lives on. This is required when permuting the column entries.
5662  // We need to do this for each distribution, so we have a vector of
5663  // vectors. First index corresponds to the distribution, the second is
5664  // the processor number.
5665  std::vector<std::vector<unsigned> > p_for_rows(
5666  nblock_col,std::vector<unsigned>());
5667  // initialise 2D vector
5668  for (unsigned blocki = 0; blocki < nblock_col; blocki++)
5669  {
5670  int blockinrow = col_distribution_pt[blocki]->nrow();
5671  p_for_rows[blocki].resize(blockinrow);
5672  // For each global index in the block, work out the corresponding processor.
5673  for (int rowi = 0; rowi < blockinrow; rowi++)
5674  {
5675  unsigned p = 0;
5676  int b_first_row = col_distribution_pt[blocki]->first_row(p);
5677  int b_nrow_local = col_distribution_pt[blocki]->nrow_local(p);
5678 
5679  while (rowi < b_first_row ||
5680  rowi >= b_nrow_local + b_first_row)
5681  {
5682  p++;
5683  b_first_row = col_distribution_pt[blocki]->first_row(p);
5684  b_nrow_local = col_distribution_pt[blocki]->nrow_local(p);
5685  }
5686  p_for_rows[blocki][rowi] = p;
5687  }
5688  }
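  // After this, p_for_rows[j][gi] gives the processor that would hold global
  // index gi under the j-th column distribution, so the rank lookup in the
  // assembly loop below is a simple table access rather than a search.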
5689 
5690  // determine nnz of all blocks on this processor only.
5691  // This is used to create storage space.
5692  unsigned long res_nnz = 0;
5693  for (unsigned row_i = 0; row_i < nblock_row; row_i++)
5694  {
5695  for (unsigned col_i = 0; col_i < nblock_col; col_i++)
5696  {
5697  if(matrix_pt(row_i,col_i) !=0)
5698  {
5699  res_nnz += matrix_pt(row_i,col_i)->nnz();
5700  }
5701  }
5702  }
5703 
5704  // My rank
5705 // unsigned my_rank = comm_pt->my_rank();
5706 // my_rank = my_rank;
5707 
5708  // Turn the above into a string.
5709 // std::ostringstream myrankstream;
5710 // myrankstream << "THISDOESNOTHINGnp" << my_rank << std::endl;
5711 // std::string myrankstring = myrankstream.str();
5712 
5713 
5714 //CALLGRIND_ZERO_STATS;
5715 //CALLGRIND_START_INSTRUMENTATION;
5716 
5717  // storage for the result matrix.
5718  int* res_row_start = new int[res_nrow_local+1];
5719  int* res_column_index = new int[res_nnz];
5720  double* res_value = new double[res_nnz];
5721 
5722  // initialise the zero-th entry
5723  res_row_start[0] = 0;
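  // The three arrays form the usual compressed-row (CSR) storage:
  // res_row_start[r]..res_row_start[r+1] delimit the entries of local row r
  // inside res_column_index and res_value.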
5724 
5725  // loop over the block rows
5726  unsigned long res_i = 0; // index for the result matrix.
5727  unsigned long res_row_i = 0; // index for the row
5728  for (unsigned i = 0; i < nblock_row; i++)
5729  {
5730  // loop over the rows of the current block local rows.
5731  unsigned block_nrow = row_distribution_pt[i]->nrow_local();
5732  for (unsigned k = 0; k < block_nrow; k++)
5733  {
5734  // initialise res_row_start
5735  res_row_start[res_row_i+1] = res_row_start[res_row_i];
5736 
5737  // Loop over the block columns
5738  for (unsigned j = 0; j < nblock_col; j++)
5739  {
5740  // if block(i,j) pointer is not null then
5741  if (matrix_pt(i,j) != 0)
5742  {
5743  // get pointers for the elements in the current block
5744  int* b_row_start = matrix_pt(i,j)->row_start();
5745  int* b_column_index = matrix_pt(i,j)->column_index();
5746  double* b_value = matrix_pt(i,j)->value();
5747 
5748  // memcpy(&dst[dstIdx], &src[srcIdx], numElementsToCopy*sizeof(Element));
5749  // Number of elements to copy for this local row.
5750  int numEleToCopy = b_row_start[k+1] - b_row_start[k];
5751  memcpy(res_value+res_i,b_value+b_row_start[k],numEleToCopy*sizeof(double));
5752  // Loop through the current local row.
5753  for (int l = b_row_start[k]; l < b_row_start[k+1]; l++)
5754  {
5755  // If b_column_index[l] were a row index, which processor
5756  // would it be on?
5757 // unsigned p = col_distribution_pt[j]
5758 // ->rank_of_global_row_map(b_column_index[l]);
5759  unsigned p = p_for_rows[j][b_column_index[l]];
5760 
5761  int b_first_row = col_distribution_pt[j]->first_row(p);
5762 // int b_nrow_local = col_distribution_pt[j]->nrow_local(p);
5763 
5764 // while (b_column_index[l] < b_first_row ||
5765 // b_column_index[l] >= b_nrow_local+b_first_row)
5766 // {
5767 // p++;
5768 // b_first_row = col_distribution_pt[j]->first_row(p);
5769 // b_nrow_local = col_distribution_pt[j]->nrow_local(p);
5770 // }
5771 
5772  // determine the local equation number in the block j/processor p
5773  // "column block"
5774  int eqn = b_column_index[l]-b_first_row;
5775 
5776  // add to the result matrix
5777 // res_value[res_i] = b_value[l];
5778  res_column_index[res_i] = col_offset[p][j]+eqn;
5779  res_row_start[res_row_i+1]++;
5780  res_i++;
5781  }
5782  }
5783  }
5784 
5785  // increment the row pt
5786  res_row_i++;
5787 
5788  }
5789  }
5790 //CALLGRIND_STOP_INSTRUMENTATION;
5791 //CALLGRIND_DUMP_STATS_AT(myrankstring.c_str());
5792 
5793 
5794  // Get the number of columns of the result matrix.
5795  unsigned res_ncol = 0;
5796  for (unsigned block_col_i = 0; block_col_i < matrix_ncol; block_col_i++)
5797  {
5798  res_ncol += col_distribution_pt[block_col_i]->nrow();
5799  }
5800 
5801  // Build the result matrix.
5802  result_matrix.build_without_copy(res_ncol,res_nnz,
5803  res_value,
5804  res_column_index,
5805  res_row_start);
5806  }
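  //--------------------------------------------------------------------------
  // Illustrative usage sketch (hypothetical names, not part of the library
  // source): given four already-built CRDoubleMatrix blocks a00, a01, a10,
  // a11 whose distributions match dist0 (first block row/column) and dist1
  // (second block row/column), the blocks could be concatenated without
  // communication along the following lines:
  //
  //   Vector<LinearAlgebraDistribution*> row_dist_pt(2), col_dist_pt(2);
  //   row_dist_pt[0] = &dist0; row_dist_pt[1] = &dist1;
  //   col_dist_pt[0] = &dist0; col_dist_pt[1] = &dist1;
  //
  //   DenseMatrix<CRDoubleMatrix*> block_pt(2,2,0);
  //   block_pt(0,0) = &a00; block_pt(0,1) = &a01;
  //   block_pt(1,0) = &a10; block_pt(1,1) = &a11;
  //
  //   CRDoubleMatrix result;
  //   CRDoubleMatrixHelpers::concatenate_without_communication(
  //    row_dist_pt,col_dist_pt,block_pt,result);
  //--------------------------------------------------------------------------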
5807 
5808 
5809 
5810  //============================================================================
5811  /// \short Concatenate CRDoubleMatrix matrices.
5812  /// This calls the other concatenate_without_communication(...) function,
5813  /// passing block_distribution_pt as both the row_distribution_pt and
5814  /// col_distribution_pt. This should only be called for block square matrices.
5815  //============================================================================
5816  void concatenate_without_communication(
5817  const Vector<LinearAlgebraDistribution*> &block_distribution_pt,
5818  const DenseMatrix<CRDoubleMatrix*> &matrix_pt,
5819  CRDoubleMatrix &result_matrix)
5820  {
5821 
5822 #ifdef PARANOID
5823  // The number of block rows and block columns.
5824  unsigned matrix_nrow = matrix_pt.nrow();
5825  unsigned matrix_ncol = matrix_pt.ncol();
5826 
5827  // Are there matrices to concatenate?
5828  if(matrix_nrow == 0)
5829  {
5830  std::ostringstream error_message;
5831  error_message << "There are no matrices to concatenate.\n";
5832  throw OomphLibError(error_message.str(),
5833  OOMPH_CURRENT_FUNCTION,
5834  OOMPH_EXCEPTION_LOCATION);
5835  }
5836 
5837  // Ensure that the sub matrices form a square block matrix.
5838  if(matrix_nrow != matrix_ncol)
5839  {
5840  std::ostringstream error_message;
5841  error_message<<"The number of block rows and block columns\n"
5842  <<"must be the same. Otherwise, call the other\n"
5843  <<"concatenate_without_communication function, passing in\n"
5844  <<"a Vector of distributions describing how to permute the\n"
5845  <<"columns.";
5846  throw OomphLibError(error_message.str(),
5847  OOMPH_CURRENT_FUNCTION,
5848  OOMPH_EXCEPTION_LOCATION);
5849  }
5850 #endif
5851 
5852  concatenate_without_communication(
5853  block_distribution_pt,block_distribution_pt,matrix_pt,result_matrix);
5854  }
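  // Usage sketch (hypothetical names): for a square block system the same
  // Vector of distributions is used for both the rows and the columns, e.g.
  //
  //   CRDoubleMatrixHelpers::concatenate_without_communication(
  //    block_dist_pt,block_pt,result);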
5855 
5856 } // CRDoubleMatrixHelpers
5857 
5858 } // namespace oomph
Definition: matrices.h:511