oomph-lib: navier_stokes_preconditioners.cc Source File

Go to the documentation of this file.
 //LIC// ====================================================================
 //LIC// This file forms part of oomph-lib, the object-oriented, 
 //LIC// multi-physics finite-element library, available 
 //LIC// at http://www.oomph-lib.org.
 //LIC// 
 //LIC//    Version 1.0; svn revision $LastChangedRevision$
 //LIC//
 //LIC// $LastChangedDate$
 //LIC// 
 //LIC// Copyright (C) 2006-2016 Matthias Heil and Andrew Hazel
 //LIC// 
 //LIC// This library is free software; you can redistribute it and/or
 //LIC// modify it under the terms of the GNU Lesser General Public
 //LIC// License as published by the Free Software Foundation; either
 //LIC// version 2.1 of the License, or (at your option) any later version.
 //LIC// 
 //LIC// This library is distributed in the hope that it will be useful,
 //LIC// but WITHOUT ANY WARRANTY; without even the implied warranty of
 //LIC// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 //LIC// Lesser General Public License for more details.
 //LIC// 
 //LIC// You should have received a copy of the GNU Lesser General Public
 //LIC// License along with this library; if not, write to the Free Software
 //LIC// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 //LIC// 02110-1301  USA.
 //LIC// 
 //LIC// The authors may be contacted at oomph-lib@maths.man.ac.uk.
 //LIC// 
 //LIC//====================================================================
 #include "navier_stokes_preconditioners.h"
 
 namespace oomph
 {
 
 
 ///////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////
 
 
 //======start_of_namespace============================================
 /// Namespace for exact solution for pressure advection diffusion 
 /// problem
 //====================================================================
  namespace PressureAdvectionDiffusionValidation
  {
   
 
   /// Flag for solution
   unsigned Flag=0;
   
   /// Peclet number -- overwrite with actual Reynolds number
   double Peclet=0.0;
   
   /// Wind
   void wind_function(const Vector<double>& x, Vector<double>& wind)
   {
    if (Flag==0)
     {
      wind[0]=sin(6.0*x[1]);
      wind[1]=cos(6.0*x[0]);
     }
    else
     {
      wind[0]=1.0;
      wind[1]=0.0;
     }
   }
   
   /// Exact solution as a Vector
   void get_exact_u(const Vector<double>& x, Vector<double>& u)
   {
    u.resize(3);
    wind_function(x,u);
    if (Flag==0)
     {
      u[2]=x[0]*x[0]*pow(1.0-x[0],2.0)*x[1]*x[1]*pow(1.0-x[1],2.0);
     }
    else
     {
      u[2]=0.1E1-Peclet*x[0]*(0.1E1-0.5*x[0]);
     }
   }
   
   /// Exact solution as a scalar
   void get_exact_u(const Vector<double>& x, double& u)
   {
    if (Flag==0)
     {
      u=x[0]*x[0]*pow(1.0-x[0],2.0)*x[1]*x[1]*pow(1.0-x[1],2.0);
     }
    else
     {
      u=0.1E1-Peclet*x[0]*(0.1E1-0.5*x[0]);
     }
   }
   
   /// Source function required to make the solution above an exact solution 
   double source_function(const Vector<double>& x_vect)
   {
 
    double x[2];
    x[0]=x_vect[0];
    x[1]=x_vect[1];
 
    
    double source=0.0;
 
    if (Flag==0)
     {
      source=
       Peclet*(sin(0.6E1*x[1])*(2.0*x[0]*pow(1.0-x[0],2.0)*x[1]*x[1]*pow(
                                 1.0-x[1],2.0)-2.0*x[0]*x[0]*(1.0-x[0])*x[1]*x[1]*pow(1.0-x[1],2.0))+cos(0.6E1*x
                                                                                                         [0])*(2.0*x[0]*x[0]*pow(1.0-x[0],2.0)*x[1]*pow(1.0-x[1],2.0)-2.0*x[0]*x[0]*pow(
                                                                                                                1.0-x[0],2.0)*x[1]*x[1]*(1.0-x[1])))-2.0*pow(1.0-x[0],2.0)*x[1]*x[1]*pow(1.0-x
                                                                                                                                                                                         [1],2.0)+8.0*x[0]*(1.0-x[0])*x[1]*x[1]*pow(1.0-x[1],2.0)-2.0*x[0]*x[0]*x[1]*x
       [1]*pow(1.0-x[1],2.0)-2.0*x[0]*x[0]*pow(1.0-x[0],2.0)*pow(1.0-x[1],2.0)+8.0*x
       [0]*x[0]*pow(1.0-x[0],2.0)*x[1]*(1.0-x[1])-2.0*x[0]*x[0]*pow(1.0-x[0],2.0)*x[1]
       *x[1];
     }
    else
     {
      source=Peclet*(-0.1E1*Peclet*(0.1E1-0.5*x[0])+0.5*Peclet*x[0])-0.1E1*Peclet
 ;
 
     }
    
    return source;
   }
   
   
  } // end of namespace
  
  
 ////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////
  
 
 //===========================================================================
 /// Setup the least-squares commutator Navier Stokes preconditioner. This
 /// extracts blocks corresponding to the velocity and pressure unknowns,
 /// creates the matrices actually needed in the application of the
 /// preconditioner and deletes what can be deleted... Note that
 /// this preconditioner needs a CRDoubleMatrix.
 //============================================================================
  void NavierStokesSchurComplementPreconditioner::
  setup()
  {
   // For debugging...
   bool doc_block_matrices=false;
 
   // For output timing results - to be removed soon. Ray
   bool raytime_flag = false;
 
   //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   // NOTE: In the interest of minimising memory usage, several containers
   //       are recycled, therefore their content/meaning changes
   //       throughout this function. The code is carefully annotated
   //       but you'll have to read it line by line!
   //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   double t_clean_up_memory_start = TimingHelpers::timer();
   // make sure any old data is deleted
   clean_up_memory();
   double t_clean_up_memory_end = TimingHelpers::timer();
   double clean_up_memory_time = t_clean_up_memory_end 
                                 - t_clean_up_memory_start;
   if(raytime_flag)
   {
     oomph_info << "LSC: clean_up_memory_time: " 
                << clean_up_memory_time << std::endl; 
   }
   
 
 #ifdef PARANOID
   // paranoid check that the navier stokes mesh pt has been set
   if (Navier_stokes_mesh_pt == 0)
    {
     std::ostringstream error_message;
     error_message << "The navier stokes elements mesh pointer must be set.\n"
                   << "Use method set_navier_stokes_mesh(...)";
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
 #endif
 
 
   // Set the mesh 
   this->set_nmesh(1);
   this->set_mesh(0,Navier_stokes_mesh_pt,
                  Allow_multiple_element_type_in_navier_stokes_mesh);
   
   // Get blocks
   // ----------
 
   // In comes the current Jacobian. Recast it to a CR double matrix;
   // shout if that can't be done.
   CRDoubleMatrix* cr_matrix_pt = dynamic_cast<CRDoubleMatrix*>(matrix_pt());
 
 
 #ifdef PARANOID
   if (cr_matrix_pt==0)
    {
     std::ostringstream error_message;
     error_message 
      << "NavierStokesSchurComplementPreconditioner only works with "
      << "CRDoubleMatrix matrices" << std::endl;
     throw OomphLibError(error_message.str(),
                         OOMPH_CURRENT_FUNCTION,
                         OOMPH_EXCEPTION_LOCATION);
    }
 #endif
 
 
   if (doc_block_matrices)
    {
     std::stringstream junk;
     junk << "j_matrix" << comm_pt()->my_rank()
          << ".dat";
     oomph_info << "About to output " << junk.str() << std::endl;
     cr_matrix_pt->sparse_indexed_output_with_offset(junk.str());
     oomph_info << "Done output of " << junk.str() << std::endl;
    }
 
 
   // Set up block look up schemes (done automatically in the
   // BlockPreconditioner base class, based on the information 
   // provided in the block-preconditionable elements in the problem)
 
   // this preconditioner has two types of block:
   // type 0: velocity - corresponding to DOFs 0 to n-2
   // type 1: pressure - corresponding to DOF n-1
   double t_block_setup_start = TimingHelpers::timer();
   unsigned ndof_types = 0;
 
   if (this->is_subsidiary_block_preconditioner())
    {
     ndof_types = this->ndof_types();
    }
   else
    {
     // This is the upper-most master block preconditioner, the Navier-Stokes
     // mesh is in position 0
     ndof_types = this->ndof_types_in_mesh(0);
    }
 
   Vector<unsigned> dof_to_block_map(ndof_types);
   dof_to_block_map[ndof_types-1]=1;
 
   this->block_setup(dof_to_block_map);
 
   double t_block_setup_finish = TimingHelpers::timer();
   double block_setup_time = t_block_setup_finish - t_block_setup_start;
   if(Doc_time)
    {
     oomph_info << "Time for block_setup(...) [sec]: "
                << block_setup_time << "\n";
    }
 
   if(raytime_flag)
   {
     oomph_info << "LSC: block_setup: " << block_setup_time << std::endl; 
   }
   
 
   // determine whether the F preconditioner is a block preconditioner (and
   // therefore a subsidiary preconditioner)
   BlockPreconditioner<CRDoubleMatrix>* F_block_preconditioner_pt = 
    dynamic_cast<BlockPreconditioner<CRDoubleMatrix>* >(F_preconditioner_pt);
   F_preconditioner_is_block_preconditioner = true;
   if (F_block_preconditioner_pt == 0)
    {
     F_preconditioner_is_block_preconditioner = false;
    }
   
   // Get B (the divergence block)
   double t_get_B_start = TimingHelpers::timer();
   CRDoubleMatrix* b_pt = new CRDoubleMatrix;
   this->get_block(1,0,*b_pt);
 
   double t_get_B_finish = TimingHelpers::timer();
   double get_B_time = t_get_B_finish - t_get_B_start;
   if(Doc_time)
    {
     oomph_info << "Time to get B [sec]: "
                << get_B_time << "\n";
    }
 
   if(raytime_flag)
   {
   oomph_info << "LSC: get block B get_B_time: " << get_B_time << std::endl; 
   }
 
   if (doc_block_matrices)
    {
     std::stringstream junk;
     junk << "b_matrix" << comm_pt()->my_rank()
          << ".dat";
     b_pt->sparse_indexed_output_with_offset(junk.str());
     oomph_info << "Done output of " << junk.str() << std::endl;
    }
   
   
   // get the inverse velocity and pressure mass matrices
   CRDoubleMatrix* inv_v_mass_pt = 0;
   CRDoubleMatrix* inv_p_mass_pt = 0;
 
   double ivmm_assembly_start_t = TimingHelpers::timer();
   if (Use_LSC)
    {   
     // We only need the velocity mass matrix
     assemble_inv_press_and_veloc_mass_matrix_diagonal(inv_p_mass_pt,
                                                       inv_v_mass_pt,
                                                       false);
    }
   else
    {
     // We need both mass matrices
     assemble_inv_press_and_veloc_mass_matrix_diagonal(inv_p_mass_pt,
                                                       inv_v_mass_pt,
                                                       true);
    }
 
   double ivmm_assembly_finish_t = TimingHelpers::timer();
 
     double 
      ivmm_assembly_time = ivmm_assembly_finish_t - ivmm_assembly_start_t;
   if (Doc_time)
    {
 
     oomph_info << "Time to assemble inverse diagonal velocity and pressure"
                << "mass matrices) [sec]: "
                << ivmm_assembly_time << "\n";
    }
   if(raytime_flag)
   {
    oomph_info << "LSC: ivmm_assembly_time: " 
               << ivmm_assembly_time << std::endl; 
   }
 
 
   if (doc_block_matrices)
    {
     std::stringstream junk;
     junk << "inv_v_mass_matrix" 
          << comm_pt()->my_rank()
          << ".dat";
     inv_v_mass_pt->sparse_indexed_output_with_offset(junk.str());
     oomph_info << "Done output of " << junk.str() << std::endl;
    }
   
   
   // Get gradient matrix Bt
   CRDoubleMatrix* bt_pt = new CRDoubleMatrix;
   double t_get_Bt_start = TimingHelpers::timer();
   this->get_block(0,1,*bt_pt);
   double t_get_Bt_finish = TimingHelpers::timer();
   
     double t_get_Bt_time = t_get_Bt_finish - t_get_Bt_start;
   if(Doc_time)
    {
     oomph_info << "Time to get Bt [sec]: "
                << t_get_Bt_time << std::endl;
    }
   if(raytime_flag)
   {
   oomph_info << "LSC: get block Bt: " 
              << t_get_Bt_time << std::endl;  
   }
 
   if (doc_block_matrices)   
    {
     std::stringstream junk;
     junk << "bt_matrix" << comm_pt()->my_rank()
          << ".dat";
     bt_pt->sparse_indexed_output_with_offset(junk.str());
     oomph_info << "Done output of " << junk.str() << std::endl;
    }
   
   
   // Build pressure Poisson matrix 
   CRDoubleMatrix* p_matrix_pt = new CRDoubleMatrix;
 
   // Multiply inverse velocity mass matrix by gradient matrix B^T
   double t_QBt_matrix_start = TimingHelpers::timer();
   CRDoubleMatrix* qbt_pt = new CRDoubleMatrix;
   inv_v_mass_pt->multiply(*bt_pt, *qbt_pt);
   delete bt_pt; bt_pt = 0;
 
   // Store product in bt_pt 
   bt_pt = qbt_pt;
   double t_QBt_matrix_finish = TimingHelpers::timer();
 
   double t_QBt_time = t_QBt_matrix_finish - t_QBt_matrix_start;
   if(Doc_time)
    {
     oomph_info << "Time to generate QBt [sec]: "
                << t_QBt_time << std::endl;
    }
   delete inv_v_mass_pt; inv_v_mass_pt = 0;
   if(raytime_flag)
   {
   oomph_info << "LSC: t_QBt_time (matrix multiplicaton): " 
              << t_QBt_time << std::endl;  
   }
   
   // Multiply B from left by divergence matrix B and store result in 
   // pressure Poisson matrix.
   double t_p_matrix_start = TimingHelpers::timer();
   b_pt->multiply(*bt_pt, *p_matrix_pt);
   double t_p_matrix_finish = TimingHelpers::timer();
 
     double t_p_time = t_p_matrix_finish - t_p_matrix_start;
   if(Doc_time)
    {
     oomph_info << "Time to generate P [sec]: "
                << t_p_time << std::endl;
    }
   // Kill divergence matrix because we don't need it any more
   delete b_pt; b_pt = 0;
  
   if(raytime_flag)
   {
   oomph_info << "LSC: t_p_time (matrix multiplication): " 
              << t_p_time << std::endl;
   }
 
 
   // Build the matvec operator for QBt
   double t_QBt_MV_start = TimingHelpers::timer();
   QBt_mat_vec_pt = new MatrixVectorProduct;
   this->setup_matrix_vector_product(QBt_mat_vec_pt,bt_pt,1);
   double t_QBt_MV_finish = TimingHelpers::timer();
 
   double t_p_time2 = t_QBt_MV_finish - t_QBt_MV_start;
   if(Doc_time)
    {
     oomph_info << "Time to build QBt matrix vector operator [sec]: "
                << t_p_time2 << std::endl;
    }
 
   // Kill gradient matrix B^T (it's been overwritten anyway and
   // needs to be recomputed afresh below)
   delete bt_pt; bt_pt = 0;
   
   if(raytime_flag)
   {
   oomph_info << "LSC: QBt (setup MV product): " << t_p_time2 << std::endl;
   }
 
   // Do we need the Fp stuff?
   if (!Use_LSC)
    {
     // Get pressure advection diffusion matrix Fp and store in 
     // a "big" matrix (same size as the problem's Jacobian)
     double t_get_Fp_start = TimingHelpers::timer();
     CRDoubleMatrix full_fp_matrix;
     get_pressure_advection_diffusion_matrix(full_fp_matrix);
     
     // Now extract the pressure pressure block
     CRDoubleMatrix* fp_matrix_pt = new CRDoubleMatrix;
     this->get_block_other_matrix(1,1,&full_fp_matrix,*fp_matrix_pt);
     double t_get_Fp_finish = TimingHelpers::timer();
     if(Doc_time)
      {
       double t_get_Fp_time = t_get_Fp_finish - t_get_Fp_start;
       oomph_info << "Time to get Fp [sec]: "
                  << t_get_Fp_time << std::endl;
      }
     
     // Build vector product of pressure advection diffusion matrix with
     // inverse pressure mass matrix
     CRDoubleMatrix* fp_qp_inv_pt = new CRDoubleMatrix; 
     fp_matrix_pt->multiply(*inv_p_mass_pt, *fp_qp_inv_pt);
     
     // Build the matvec operator for E = F_p Q_p^{-1}
     double t_Fp_Qp_inv_MV_start = TimingHelpers::timer();
     E_mat_vec_pt = new MatrixVectorProduct;
     this->setup_matrix_vector_product(E_mat_vec_pt,fp_qp_inv_pt,1);
     double t_Fp_Qp_inv_MV_finish = TimingHelpers::timer();
     if(Doc_time)
      {
       double t_p_time = t_Fp_Qp_inv_MV_finish - t_Fp_Qp_inv_MV_start;
       oomph_info << "Time to build Fp Qp^{-1} matrix vector operator [sec]: "
                  << t_p_time << std::endl;
      }
     // Kill pressure advection diffusion and inverse pressure mass matrices
     delete inv_p_mass_pt; inv_p_mass_pt = 0;
     delete fp_qp_inv_pt; fp_qp_inv_pt = 0;
    }
 
 
   // Get momentum block F
   CRDoubleMatrix* f_pt = new CRDoubleMatrix;
   double t_get_F_start = TimingHelpers::timer();
   this->get_block(0,0,*f_pt);
   double t_get_F_finish = TimingHelpers::timer();
 
   double t_get_F_time = t_get_F_finish - t_get_F_start;
   if(Doc_time)
    {
     oomph_info << "Time to get F [sec]: "
                << t_get_F_time << std::endl;
    }
   if(raytime_flag)
   {
     oomph_info << "LSC: get_block t_get_F_time: " 
                << t_get_F_time << std::endl;
   }
   
   // form the matrix vector product helper
   double t_F_MV_start = TimingHelpers::timer();
   F_mat_vec_pt = new MatrixVectorProduct;
   this->setup_matrix_vector_product(F_mat_vec_pt,f_pt,0);
   double t_F_MV_finish = TimingHelpers::timer();
 
     double t_F_MV_time = t_F_MV_finish - t_F_MV_start;
   if(Doc_time)
    {
     oomph_info << "Time to build F Matrix Vector Operator [sec]: "
                << t_F_MV_time << std::endl;
    }
   if(raytime_flag)
   {
    oomph_info << "LSC: MV product setup t_F_MV_time: " 
               << t_F_MV_time << std::endl;
   }
 
   
   // if F is a block preconditioner then we can delete the F matrix
   if (F_preconditioner_is_block_preconditioner)
    {
     delete f_pt; f_pt = 0;
    }
   
   // Rebuild Bt (remember that we temporarily overwrote
   // it by its product with the inverse velocity mass matrix)
   t_get_Bt_start = TimingHelpers::timer();
   bt_pt = new CRDoubleMatrix;
   this->get_block(0,1,*bt_pt);
   t_get_Bt_finish = TimingHelpers::timer();
     double t_get_Bt_time2 = t_get_Bt_finish - t_get_Bt_start;
   if(Doc_time)
    {
 
     oomph_info << "Time to get Bt [sec]: "
                << t_get_Bt_time2 << std::endl;
    }
   if(raytime_flag)
   {
   oomph_info << "LSC: get_block t_get_Bt_time2: " 
              << t_get_Bt_time2 << std::endl;
   }
  
 
   // form the matrix vector operator for Bt
   double t_Bt_MV_start = TimingHelpers::timer();
   Bt_mat_vec_pt = new MatrixVectorProduct;
   this->setup_matrix_vector_product(Bt_mat_vec_pt,bt_pt,1);
 
 //  if(Doc_time)
 //   {
 //    oomph_info << "Time to build Bt Matrix Vector Operator [sec]: "
 //               << t_Bt_MV_time << std::endl;
 //   }
 
   delete bt_pt; bt_pt = 0;
 
   double t_Bt_MV_finish = TimingHelpers::timer();
 
   double t_Bt_MV_time = t_Bt_MV_finish - t_Bt_MV_start;
   if(raytime_flag)
   {
   oomph_info << "LSC: MV product setup t_Bt_MV_time: " 
                << t_Bt_MV_time << std::endl;
   }
 
   // if the P preconditioner has not been setup
   if (P_preconditioner_pt == 0)
    {
     P_preconditioner_pt = new SuperLUPreconditioner;
     Using_default_p_preconditioner = true;
    }
 
   // Setup the preconditioner for the Pressure matrix
   double t_p_prec_start = TimingHelpers::timer();
 
   if (doc_block_matrices)
    {
     std::stringstream junk;
     junk << "p_matrix" << comm_pt()->my_rank()
          << ".dat";
     p_matrix_pt->sparse_indexed_output_with_offset(junk.str());
     oomph_info << "Done output of " << junk.str() << std::endl;
    }
   
   P_preconditioner_pt->setup(p_matrix_pt);
   delete p_matrix_pt; p_matrix_pt = 0;
   double t_p_prec_finish = TimingHelpers::timer();
 
     double t_p_prec_time = t_p_prec_finish - t_p_prec_start;
   if(Doc_time)
    {
     oomph_info << "P sub-preconditioner setup time [sec]: "
                << t_p_prec_time << "\n";
    }
   if(raytime_flag)
   {
   oomph_info << "LSC: p_prec setup time: " << t_p_prec_time << std::endl;
   }
 
   
   // Set up solver for solution of system with momentum matrix
   // ----------------------------------------------------------
 
   // if the F preconditioner has not been setup
   if (F_preconditioner_pt == 0)
    {
     F_preconditioner_pt = new SuperLUPreconditioner;
     Using_default_f_preconditioner = true;
    }
 
   // if F is a block preconditioner
   double t_f_prec_start = TimingHelpers::timer();
   if (F_preconditioner_is_block_preconditioner)
    {
     unsigned nvelocity_dof_types
       = Navier_stokes_mesh_pt->finite_element_pt(0)->dim();
     
     Vector<unsigned> dof_map(nvelocity_dof_types);
     for (unsigned i = 0; i < nvelocity_dof_types; i++)
      {
       dof_map[i] = i;
      }
 
     F_block_preconditioner_pt->
      turn_into_subsidiary_block_preconditioner(this,dof_map);
 
     F_block_preconditioner_pt->setup(matrix_pt());
    }
   // otherwise F is not a block preconditioner
   else
    {
     F_preconditioner_pt->setup(f_pt);
     delete f_pt; f_pt = 0;
    }
   double t_f_prec_finish = TimingHelpers::timer();
     double t_f_prec_time = t_f_prec_finish - t_f_prec_start;
   if(Doc_time)
    {
 
     oomph_info << "F sub-preconditioner setup time [sec]: "
                << t_f_prec_time << "\n";
    }
   if(raytime_flag)
   {
    oomph_info << "LSC: f_prec setup time: " << t_f_prec_time << std::endl; 
   }
 
   // Remember that the preconditioner has been setup so
   // the stored information can be wiped when we
   // come here next...
   Preconditioner_has_been_setup = true;
  }
 
 
 
 //=======================================================================
  /// Apply preconditioner to r.
 //=======================================================================
  void NavierStokesSchurComplementPreconditioner:: 
  preconditioner_solve(const DoubleVector &r, DoubleVector &z)
  {
 #ifdef PARANOID
   if (Preconditioner_has_been_setup==false)
    {
     std::ostringstream error_message;
     error_message << "setup must be called before using preconditioner_solve";
     throw OomphLibError(
      error_message.str(),
      OOMPH_CURRENT_FUNCTION,
      OOMPH_EXCEPTION_LOCATION);
    }
   if (z.built())
    {
     if (z.nrow() != r.nrow())
      {
       std::ostringstream error_message;
       error_message << "The vectors z and r must have the same number of "
                     << "of global rows";
       throw OomphLibError(
        error_message.str(),
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);      
      }
    }
 #endif
 
   // if z is not setup then give it the same distribution
   if (!z.distribution_pt()->built())
    {
     z.build(r.distribution_pt(),0.0);
    }
 
   // Step 1 - apply approximate Schur inverse to pressure unknowns (block 1)
   // -----------------------------------------------------------------------
 
   // Working vectors
   DoubleVector temp_vec;
   DoubleVector another_temp_vec;
   DoubleVector yet_another_temp_vec;
 
   // Copy pressure values from residual vector to temp_vec:
   // Loop over all entries in the global vector (this one
   // includes velocity and pressure dofs in some random fashion)
   this->get_block_vector(1,r,temp_vec);
 
   // NOTE: The vector temp_vec now contains the vector r_p.
 
   // LSC version
   if (Use_LSC)
    {
     // Solve first pressure Poisson system
 #ifdef PARANOID
     // check a solver has been set
     if (P_preconditioner_pt==0)
      {
       std::ostringstream error_message;
       error_message << "P_preconditioner_pt has not been set.";
       throw OomphLibError(
        error_message.str(),
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);
      }
 #endif
     
     // use some Preconditioner's preconditioner_solve function
     P_preconditioner_pt->preconditioner_solve(temp_vec, another_temp_vec);
     
     // NOTE: The vector another_temp_vec now contains the vector P^{-1} r_p
     
     // Multiply another_temp_vec by matrix E and stick the result into temp_vec
     temp_vec.clear();  
     QBt_mat_vec_pt->multiply(another_temp_vec, temp_vec);
     another_temp_vec.clear();
     F_mat_vec_pt->multiply(temp_vec,another_temp_vec);
     temp_vec.clear();
     QBt_mat_vec_pt->multiply_transpose(another_temp_vec, temp_vec);
     
     
     // NOTE: The vector temp_vec now contains E P^{-1} r_p
     
     // Solve second pressure Poisson system using preconditioner_solve
     another_temp_vec.clear();
     P_preconditioner_pt->preconditioner_solve(temp_vec, another_temp_vec);
 
     // NOTE: The vector another_temp_vec now contains z_p = P^{-1} E P^{-1} r_p
     //       as required (apart from the sign which we'll fix in the
     //       next step.
    }
   // Fp version
   else
    {
       
     // Multiply temp_vec by matrix E and stick the result into 
     // yet_another_temp_vec
     E_mat_vec_pt->multiply(temp_vec,yet_another_temp_vec);
         
     // NOTE: The vector yet_another_temp_vec now contains Fp Qp^{-1} r_p
 
     // Solve pressure Poisson system
 #ifdef PARANOID
     // check a solver has been set
     if (P_preconditioner_pt==0)
      {
       std::ostringstream error_message;
       error_message << "P_preconditioner_pt has not been set.";
       throw OomphLibError(
        error_message.str(),
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);
      }
 #endif
     
     // Solve second pressure Poisson system using preconditioner_solve
     another_temp_vec.clear();
     P_preconditioner_pt->preconditioner_solve(yet_another_temp_vec, 
                                               another_temp_vec);
     
     // NOTE: The vector another_temp_vec now contains 
     //       z_p = P^{-1} Fp Qp^{-1} r_p
     //       as required (apart from the sign which we'll fix in the
     //       next step.
 
    }
 
   // Now copy another_temp_vec (i.e. z_p) back into the global vector z.
   // Loop over all entries in the global results vector z:
   temp_vec.build(another_temp_vec.distribution_pt(),0.0);
   temp_vec -= another_temp_vec;
   return_block_vector(1,temp_vec,z);
 
     
   // Step 2 - apply preconditioner to velocity unknowns (block 0)
   // ------------------------------------------------------------
   
   // Recall that another_temp_vec (computed above) contains the
   // negative of the solution of the Schur complement systen, -z_p.
   // Multiply by G (stored in Block_matrix_pt(0,1) and store
   // result in temp_vec (vector resizes itself).
   temp_vec.clear();
   Bt_mat_vec_pt->multiply(another_temp_vec, temp_vec);
 
   // NOTE: temp_vec now contains -G z_p
 
   // The vector another_temp_vec is no longer needed -- re-use it to store
   // velocity quantities:
   another_temp_vec.clear();
 
   // Loop over all enries in the global vector and find the
   // entries associated with the velocities:
   get_block_vector(0,r,another_temp_vec);
   another_temp_vec += temp_vec;
  
   // NOTE:  The vector another_temp_vec now contains r_u - G z_p
 
   // Solve momentum system
 #ifdef PARANOID
   // check a solver has been set
   if (F_preconditioner_pt==0)
    {
     std::ostringstream error_message;
     error_message << "F_preconditioner_pt has not been set."; 
     throw OomphLibError(
      error_message.str(),
      OOMPH_CURRENT_FUNCTION,
      OOMPH_EXCEPTION_LOCATION);
    }
 #endif
 
   // use some Preconditioner's preconditioner solve
   // and return
   if (F_preconditioner_is_block_preconditioner)
    {
     return_block_vector(0,another_temp_vec,z);
     F_preconditioner_pt->preconditioner_solve(z,z);
    }
   else
    {
     F_preconditioner_pt->preconditioner_solve(another_temp_vec, temp_vec);
     return_block_vector(0,temp_vec,z);
    }
  }
 
 
 //========================================================================
 /// Helper function to assemble the inverse diagonals of the pressure and
 /// velocity mass matrix from the elemental contributions defined in
 /// NavierStokesElementWithDiagonalMassMatrices::
 /// get_pressure_and_velocity_mass_matrix_diagonal(...)
 /// If do_both=true, both are computed, otherwise only the velocity
 /// mass matrix (the LSC version of the preconditioner only needs
 /// that one)
 //========================================================================
  void NavierStokesSchurComplementPreconditioner:: 
  assemble_inv_press_and_veloc_mass_matrix_diagonal(
   CRDoubleMatrix*& inv_p_mass_pt,
   CRDoubleMatrix*& inv_v_mass_pt,
   const bool& do_both)
  {
 
   // determine the velocity rows required by this processor
   unsigned v_first_row = this->block_distribution_pt(0)->first_row();
   unsigned v_nrow_local = this->block_distribution_pt(0)->nrow_local();
   unsigned v_nrow = this->block_distribution_pt(0)->nrow();
   
   // create storage for the diagonals
   double* v_values = new double[v_nrow_local];
   for (unsigned i = 0; i < v_nrow_local; i++)
    {
     v_values[i] = 0.0;
    }
 
   // Equivalent information for pressure mass matrix (only needed for 
   // Fp version)
   unsigned p_first_row=0;
   unsigned p_nrow_local=0;
   unsigned p_nrow=0;
   double* p_values = 0;
   if (!Use_LSC)
    {
     // determine the pressure rows required by this processor
     p_first_row = this->block_distribution_pt(1)->first_row();
     p_nrow_local = this->block_distribution_pt(1)->nrow_local();
     p_nrow = this->block_distribution_pt(1)->nrow();
   
     // create storage for the diagonals
     p_values = new double[p_nrow_local];
     for (unsigned i = 0; i < p_nrow_local; i++)
      {
       p_values[i] = 0.0;
      }
    }
 
   // if the problem is distributed
   bool distributed = false;
 #ifdef OOMPH_HAS_MPI
   if (problem_pt()->distributed() ||
       this->master_distribution_pt()->distributed())
    {
     distributed = true;
    }
 #endif
 
   // next we get the diagonal velocity mass matrix data
   if (distributed)
    {
 
 #ifdef OOMPH_HAS_MPI
 
     // the number of processors
     unsigned nproc = comm_pt()->nproc();
 
     // and my rank
     unsigned my_rank = comm_pt()->my_rank();
 
     // determine the rows for which we have lookup rows
 
     // if the problem is NOT distributed then we only classify global equations
     // on this processor to avoid duplication (as every processor holds 
     // every element)
     unsigned first_lookup_row = 0; 
     unsigned last_lookup_row = 0;
     first_lookup_row = this->master_distribution_pt()->first_row();
     last_lookup_row = first_lookup_row + 
      this->master_distribution_pt()->nrow_local() - 1;
 
     // find number of local elements
     unsigned n_el = Navier_stokes_mesh_pt->nelement();
     
     // get the master distribution pt
     const LinearAlgebraDistribution* master_distribution_pt = 
      this->master_distribution_pt();
 
     // Do the two blocks (0: veloc; 1: press)
     unsigned max_block=0;
     if (!Use_LSC) max_block=1;
     for (unsigned block_index=0;block_index<=max_block;block_index++)
      {
       
       // Local working variables: Default to velocity
       unsigned v_or_p_first_row=v_first_row;
       double* v_or_p_values=v_values;
       // Switch to pressure
       if (block_index==1)
        {
         v_or_p_first_row=p_first_row;
         v_or_p_values=p_values;
        }
     
 
       // the diagonal mass matrix contributions that have been
       // classified and should be sent to another processor
       Vector<double>* classified_contributions_send 
        = new Vector<double>[nproc];
       
       // the corresponding block indices
       Vector<unsigned>* classified_indices_send
        = new Vector<unsigned>[nproc];
       
       // the matrix contributions that cannot be classified by this processor
       // and therefore must be sent to another for classification
       Vector<double>* unclassified_contributions_send
        = new Vector<double>[nproc];
       
       // the corresponding global indices that require classification
       Vector<unsigned>* unclassified_indices_send
        = new Vector<unsigned>[nproc];
       
       // get the velocity or pressure distribution pt
       const LinearAlgebraDistribution* velocity_or_press_dist_pt 
        = this->block_distribution_pt(block_index);
       
       // get the contribution for each element
       for (unsigned e = 0; e < n_el; e++)
        {
         
         // Get element
         GeneralisedElement* el_pt=Navier_stokes_mesh_pt->element_pt(e);
 
         // check that the element is not halo
         if (!el_pt->is_halo())
          {
           
           // find number of degrees of freedom in the element
           // (this is slightly too big because it includes the
           // pressure dofs but this doesn't matter)
           unsigned el_dof = el_pt->ndof();
           
           // Allocate local storage for the element's contribution to the
           // mass matrix diagonal
           Vector<double> el_vmm_diagonal(el_dof,0.0);
           Vector<double> el_pmm_diagonal(el_dof,0.0);
           
           unsigned which_one=2;
           if (block_index==1) which_one=1;
 
           NavierStokesElementWithDiagonalMassMatrices* cast_el_pt=0;
           cast_el_pt=dynamic_cast<NavierStokesElementWithDiagonalMassMatrices*>
            (el_pt);
           if (cast_el_pt!=0)
            {
             cast_el_pt->get_pressure_and_velocity_mass_matrix_diagonal( 
              el_pmm_diagonal,el_vmm_diagonal,which_one);
            }
 
           // get the contribution for each dof
           for (unsigned i = 0; i < el_dof; i++)
            {
             
             //Get the equation number
             unsigned eqn_number = el_pt->eqn_number(i);
             
             // if I have lookup information on this processor
             if ((eqn_number >= first_lookup_row) && 
                 (eqn_number <= last_lookup_row)    )
              {
               
               // Only use the dofs that we're dealing with here
               if ( this->block_number(eqn_number)==int(block_index) )
                {
                 
                 // get the index in the block
                 unsigned index = this->index_in_block(eqn_number);
                 
                 // determine which processor requires the block index
                 for (unsigned p = 0; p < nproc; p++)
                  {
                   if ( (index >= velocity_or_press_dist_pt->first_row(p)) &&
                        (index < (velocity_or_press_dist_pt->first_row(p)
                                  +velocity_or_press_dist_pt->nrow_local(p)) ) )
                    {
                     
                     // if it is required by this processor then add the 
                     // contribution
                     if (p == my_rank)
                      {
                       if (block_index==0)
                        {
                         v_or_p_values[index-v_or_p_first_row] 
                          += el_vmm_diagonal[i];
                        }
                       else if (block_index==1)
                        {
                         v_or_p_values[index-v_or_p_first_row] 
                          += el_pmm_diagonal[i];
                        }
                      }
                     // otherwise store it for communication
                     else
                      {
                       if (block_index==0)
                        {
                         classified_contributions_send[p]
                          .push_back(el_vmm_diagonal[i]);
                         classified_indices_send[p].push_back(index);
                        }
                       else if (block_index==1)
                        {
                         classified_contributions_send[p]
                          .push_back(el_pmm_diagonal[i]);
                         classified_indices_send[p].push_back(index);
                        }
                      }
                    }
                  }
                }
              }
             // if we do not have the lookup information on this processor
             // then we send the mass matrix contribution to a processor
             // which we know to have the lookup information
             // the assumption: the processor for which the master block
             // preconditioner distribution will definitely hold the lookup
             // data for eqn_number (although others may)
             else if (problem_pt()->distributed())
              {
               
               // determine which processor requires the block index
               unsigned p = 0;
               while (!(eqn_number >=master_distribution_pt->first_row(p) &&
                        (eqn_number<(master_distribution_pt->first_row(p)
                                     +master_distribution_pt->nrow_local(p)))))
                {
                 p++;
                }
               
               // store the data
               if (block_index==0)
                {
                 unclassified_contributions_send[p]
                  .push_back(el_vmm_diagonal[i]);
                 unclassified_indices_send[p].push_back(eqn_number);
                }
               else if (block_index==1)
                {
                 unclassified_contributions_send[p]
                  .push_back(el_pmm_diagonal[i]);
                 unclassified_indices_send[p].push_back(eqn_number);
                }
               
              }
            }
          }
        }
       
       //next the unclassified contributions are communicated to 
       //processors that can classify them
       
       //first determine how many unclassified rows are to be sent to
       //each processor
       unsigned* n_unclassified_send = new unsigned[nproc];
       for (unsigned p = 0; p < nproc; p++)
        {
         if (p == my_rank)
          {
           n_unclassified_send[p] = 0;
          }
         else
          {
           n_unclassified_send[p] 
            = unclassified_contributions_send[p].size();
          }
        }
       
       //then all-to-all com number of unclassified to be sent / recv
       unsigned* n_unclassified_recv = new unsigned[nproc];
       MPI_Alltoall(n_unclassified_send,1,MPI_UNSIGNED,
                    n_unclassified_recv,1,MPI_UNSIGNED,
                    comm_pt()->mpi_comm());
       
       //the base displacement for the sends
       MPI_Aint base_displacement;
       MPI_Get_address(v_or_p_values,&base_displacement); 
       
       //allocate storage for the data to be received
       //and post the sends and recvs
       Vector<double*> unclassified_contributions_recv(nproc);
       Vector<unsigned*> unclassified_indices_recv(nproc);
       Vector<MPI_Request> unclassified_recv_requests;
       Vector<MPI_Request> unclassified_send_requests;
       Vector<unsigned> unclassified_recv_proc;
       for (unsigned p = 0; p < nproc; p++)
        {
         if (p != my_rank)
          {
           //recv
           if (n_unclassified_recv[p] > 0)
            {
             unclassified_contributions_recv[p] 
              = new double[n_unclassified_recv[p]];
             unclassified_indices_recv[p] = new 
              unsigned[n_unclassified_recv[p]];
               
             //data for the struct data type
             MPI_Datatype recv_types[2];
             MPI_Aint recv_displacements[2];
             int recv_sz[2];
               
             //contributions
             MPI_Type_contiguous(n_unclassified_recv[p],MPI_DOUBLE,
                                 &recv_types[0]);
             MPI_Type_commit(&recv_types[0]);
             MPI_Get_address(unclassified_contributions_recv[p],
                         &recv_displacements[0]);
             recv_displacements[0] -= base_displacement;
             recv_sz[0] = 1;
               
             //indices
             MPI_Type_contiguous(n_unclassified_recv[p],MPI_UNSIGNED,
                                 &recv_types[1]);
             MPI_Type_commit(&recv_types[1]);
             MPI_Get_address(unclassified_indices_recv[p],
                         &recv_displacements[1]);
             recv_displacements[1] -= base_displacement;
             recv_sz[1] = 1;
               
             //build the final recv type
             MPI_Datatype final_recv_type;
             MPI_Type_create_struct(2,recv_sz,recv_displacements,recv_types,
                             &final_recv_type);
             MPI_Type_commit(&final_recv_type);
               
             //and recv
             MPI_Request req;
             MPI_Irecv(v_or_p_values,1,final_recv_type,p,0,
                       comm_pt()->mpi_comm(),&req);
             unclassified_recv_requests.push_back(req);
             unclassified_recv_proc.push_back(p); 
             MPI_Type_free(&recv_types[0]);
             MPI_Type_free(&recv_types[1]);
             MPI_Type_free(&final_recv_type);
            }
             
           //send
           if (n_unclassified_send[p] > 0)
            {
             //data for the struct data type
             MPI_Datatype send_types[2];
             MPI_Aint send_displacements[2];
             int send_sz[2];
               
             //contributions
             MPI_Type_contiguous(n_unclassified_send[p],MPI_DOUBLE,
                                 &send_types[0]);
             MPI_Type_commit(&send_types[0]);
             MPI_Get_address(&unclassified_contributions_send[p][0],
                         &send_displacements[0]);
             send_displacements[0] -= base_displacement;
             send_sz[0] = 1;
               
             //indices
             MPI_Type_contiguous(n_unclassified_send[p],MPI_UNSIGNED,
                                 &send_types[1]);
             MPI_Type_commit(&send_types[1]);
             MPI_Get_address(&unclassified_indices_send[p][0],
                         &send_displacements[1]);
             send_displacements[1] -= base_displacement;
             send_sz[1] = 1;
               
             //build the final send type
             MPI_Datatype final_send_type;
             MPI_Type_create_struct(2,send_sz,send_displacements,send_types,
                             &final_send_type);
             MPI_Type_commit(&final_send_type);
               
             //and send
             MPI_Request req;
             MPI_Isend(v_or_p_values,1,final_send_type,p,0,
                       comm_pt()->mpi_comm(),&req);
             unclassified_send_requests.push_back(req);
             MPI_Type_free(&send_types[0]);
             MPI_Type_free(&send_types[1]);
             MPI_Type_free(&final_send_type);
            }
          }
        }
         
       //next classify the data as it is received
       unsigned n_unclassified_recv_req = unclassified_recv_requests.size();
       while (n_unclassified_recv_req > 0)
        {
         //get the processor number and remove the completed request
         //for the vector of requests
         int req_num;
         MPI_Waitany(n_unclassified_recv_req,&unclassified_recv_requests[0],
                     &req_num,MPI_STATUS_IGNORE);
         unsigned p = unclassified_recv_proc[req_num];
         unclassified_recv_requests.erase(unclassified_recv_requests.begin()
                                          +req_num);    
         unclassified_recv_proc.erase(unclassified_recv_proc.begin()+req_num);
         n_unclassified_recv_req--;
           
         //next classify the dofs 
         //and store them for sending to other processors if required
         unsigned n_recv = n_unclassified_recv[p];
         for (unsigned i = 0; i < n_recv; i++)
          {
           unsigned eqn_number = unclassified_indices_recv[p][i];
           //Only deal with our block unknowns
           if ( this->block_number(eqn_number)==int(block_index) )
            {
               
             //get the index in the block
             unsigned index = this->index_in_block(eqn_number);
               
             //determine which processor requires the block index
             for (unsigned pp = 0; pp < nproc; pp++)
              {
                 
                 
               if ( (index >= velocity_or_press_dist_pt->first_row(pp)) && 
                    (index < (velocity_or_press_dist_pt->first_row(pp)          
                              +velocity_or_press_dist_pt->nrow_local(pp)) ) )
                {
                 
                 //if it is required by this processor then add the 
                 //contribution
                 if (pp == my_rank)
                  {
                   v_or_p_values[index-v_or_p_first_row] 
                    += unclassified_contributions_recv[p][i];
                  }
                 //otherwise store it for communication
                 else
                  {
                   double v = unclassified_contributions_recv[p][i];
                   classified_contributions_send[pp].push_back(v);
                   classified_indices_send[pp].push_back(index);
                  }
                }
              }
            }
          }
           
         //clean up
         delete[] unclassified_contributions_recv[p];
         delete[] unclassified_indices_recv[p];
        }
       delete[] n_unclassified_recv;
         
       //now all indices have been classified
         
       //next the classified contributions are communicated to 
       //processors that require them
         
       //first determine how many classified rows are to be sent to
       //each processor
       unsigned* n_classified_send = new unsigned[nproc];
       for (unsigned p = 0; p < nproc; p++)
        {
         if (p == my_rank)
          {
           n_classified_send[p] = 0;
          }
         else
          {
           n_classified_send[p] 
            = classified_contributions_send[p].size();
          }
        }
         
       //then all-to-all number of classified to be sent / recv
       unsigned* n_classified_recv = new unsigned[nproc];
       MPI_Alltoall(n_classified_send,1,MPI_UNSIGNED,
                    n_classified_recv,1,MPI_UNSIGNED,
                    comm_pt()->mpi_comm());
         
       //allocate storage for the data to be received
       //and post the sends and recvs
       Vector<double*> classified_contributions_recv(nproc);
       Vector<unsigned*> classified_indices_recv(nproc);
       Vector<MPI_Request> classified_recv_requests;
       Vector<MPI_Request> classified_send_requests;
       Vector<unsigned> classified_recv_proc;
       for (unsigned p = 0; p < nproc; p++)
        {
         if (p != my_rank)
          {
           //recv
           if (n_classified_recv[p] > 0)
            {
             classified_contributions_recv[p] 
              = new double[n_classified_recv[p]];
             classified_indices_recv[p] = new unsigned[n_classified_recv[p]];
               
             //data for the struct data type
             MPI_Datatype recv_types[2];
             MPI_Aint recv_displacements[2];
             int recv_sz[2];
               
             //contributions
             MPI_Type_contiguous(n_classified_recv[p],MPI_DOUBLE,
                                 &recv_types[0]);
             MPI_Type_commit(&recv_types[0]);
             MPI_Get_address(classified_contributions_recv[p],
                         &recv_displacements[0]);
             recv_displacements[0] -= base_displacement;
             recv_sz[0] = 1;
               
             //indices
             MPI_Type_contiguous(n_classified_recv[p],MPI_UNSIGNED,
                                 &recv_types[1]);
             MPI_Type_commit(&recv_types[1]);
             MPI_Get_address(classified_indices_recv[p],
                         &recv_displacements[1]);
             recv_displacements[1] -= base_displacement;
             recv_sz[1] = 1;
               
             //build the final recv type
             MPI_Datatype final_recv_type;
             MPI_Type_create_struct(2,recv_sz,recv_displacements,recv_types,
                             &final_recv_type);
             MPI_Type_commit(&final_recv_type);
               
             //and recv
             MPI_Request req;
             MPI_Irecv(v_or_p_values,1,final_recv_type,p,0,
                       comm_pt()->mpi_comm(),&req);
             classified_recv_requests.push_back(req);
             classified_recv_proc.push_back(p);
             MPI_Type_free(&recv_types[0]);
             MPI_Type_free(&recv_types[1]);
             MPI_Type_free(&final_recv_type);
            }
             
           //send
           if (n_classified_send[p] > 0)
            {
             //data for the struct data type
             MPI_Datatype send_types[2];
             MPI_Aint send_displacements[2];
             int send_sz[2];
               
             //contributions
             MPI_Type_contiguous(n_classified_send[p],MPI_DOUBLE,
                                 &send_types[0]);
             MPI_Type_commit(&send_types[0]);
             MPI_Get_address(&classified_contributions_send[p][0],
                         &send_displacements[0]);
             send_displacements[0] -= base_displacement;
             send_sz[0] = 1;
               
             //indices
             MPI_Type_contiguous(n_classified_send[p],MPI_UNSIGNED,
                                 &send_types[1]);
             MPI_Type_commit(&send_types[1]);
             MPI_Get_address(&classified_indices_send[p][0],
                         &send_displacements[1]);
             send_displacements[1] -= base_displacement;
             send_sz[1] = 1;
               
             //build the final send type
             MPI_Datatype final_send_type;
             MPI_Type_create_struct(2,send_sz,send_displacements,send_types,
                             &final_send_type);
             MPI_Type_commit(&final_send_type);
               
             //and send
             MPI_Request req;
             MPI_Isend(v_or_p_values,1,final_send_type,p,0,
                       comm_pt()->mpi_comm(),&req);
             classified_send_requests.push_back(req);
             MPI_Type_free(&send_types[0]);
             MPI_Type_free(&send_types[1]);
             MPI_Type_free(&final_send_type);
            }
          }
        }
         
       //next classify the data as it is received
       unsigned n_classified_recv_req = classified_recv_requests.size();
       while (n_classified_recv_req > 0)
        {
         //get the processor number and remove the completed request
         //for the vector of requests
         int req_num;
         MPI_Waitany(n_classified_recv_req,&classified_recv_requests[0],
                     &req_num,MPI_STATUS_IGNORE);
         unsigned p = classified_recv_proc[req_num];
         classified_recv_requests.erase(classified_recv_requests.begin()
                                        +req_num);    
         classified_recv_proc.erase(classified_recv_proc.begin()+req_num);
         n_classified_recv_req--;
           
         //next classify the dofs 
         //and store them for sending to other processors if required
         unsigned n_recv = n_classified_recv[p];
         for (unsigned i = 0; i < n_recv; i++)
          {
           v_or_p_values[classified_indices_recv[p][i]-v_or_p_first_row] 
            += classified_contributions_recv[p][i];
          }
           
         //clean up
         delete[] classified_contributions_recv[p];
         delete[] classified_indices_recv[p];
        }
         
       //wait for the unclassified sends to complete
       unsigned n_unclassified_send_req = unclassified_send_requests.size();
       if (n_unclassified_send_req > 0)
        {
         MPI_Waitall(n_unclassified_send_req,&unclassified_send_requests[0],
                     MPI_STATUS_IGNORE);
        }
       delete[] unclassified_contributions_send;
       delete[] unclassified_indices_send;
       delete[] n_unclassified_send;
         
       //wait for the classified sends to complete
       unsigned n_classified_send_req = classified_send_requests.size();
       if (n_classified_send_req > 0) 
        {
         MPI_Waitall(n_classified_send_req,&classified_send_requests[0],
                     MPI_STATUS_IGNORE);
        }
       delete[] classified_indices_send;
       delete[] classified_contributions_send;
       delete[] n_classified_recv;
       delete[] n_classified_send;
 
       // Copy the values back where they belong
       if (block_index==0)
        {
         v_values=v_or_p_values;
        }
       else if (block_index==1)
        {
         p_values=v_or_p_values;
        }
         
      }
     
 #endif
     
    }
   // or if the problem is not distributed
   else
    {
 
     // find number of elements
     unsigned n_el = Navier_stokes_mesh_pt->nelement();
     
     // Fp needs pressure and velocity mass matrices
     unsigned which_one=0;
     if (Use_LSC) which_one=2;
 
     // get the contribution for each element
     for (unsigned e = 0; e < n_el; e++)
      {
 
       // Get element
       GeneralisedElement* el_pt=Navier_stokes_mesh_pt->element_pt(e);
       
       // find number of degrees of freedom in the element
       // (this is slightly too big because it includes the
       // pressure dofs but this doesn't matter)
       unsigned el_dof = el_pt->ndof();
       
       // allocate local storage for the element's contribution to the
       // pressure and velocity mass matrix diagonal
       Vector<double> el_vmm_diagonal(el_dof,0.0);
       Vector<double> el_pmm_diagonal(el_dof,0.0);
       
       NavierStokesElementWithDiagonalMassMatrices* cast_el_pt=0;
       cast_el_pt=dynamic_cast<NavierStokesElementWithDiagonalMassMatrices*>(
        el_pt);
       if (cast_el_pt!=0)
        {
         cast_el_pt->get_pressure_and_velocity_mass_matrix_diagonal( 
          el_pmm_diagonal,el_vmm_diagonal,which_one);
        }
       else
        {
 #ifdef PARANOID
         std::ostringstream error_message;
         error_message 
          << "Navier-Stokes mesh contains element that is not of type \n"
          << "NavierStokesElementWithDiagonalMassMatrices. \n"
          << "The element is in fact of type " 
          << typeid(*el_pt).name() 
          << "\nWe'll assume that it does not make a used contribution \n" 
          << "to the inverse diagonal mass matrix used in the preconditioner\n" 
          << "and (to avoid divisions by zero) fill in dummy unit entries.\n"
          << "[This case currently arises with flux control problems\n"
          << "where for simplicity the NetFluxControlElement has been added \n"
          << "to the Navier Stokes mesh -- this should probably be changed at\n"
          << "some point -- if you get this warning in any other context\n"
          << "you should check your code very carefully]\n";
         OomphLibWarning(
          error_message.str(),
          "NavierStokesSchurComplementPreconditioner::assemble_inv_press_and_veloc_mass_matrix_diagonal()",
          OOMPH_EXCEPTION_LOCATION);       
 #endif 
 
         // Fill in dummy entries to stop division by zero below
         for (unsigned j=0;j<el_dof;j++)
          {
           el_vmm_diagonal[j]=1.0;
           el_pmm_diagonal[j]=1.0;
          }
        }
       
       // Get the contribution for each dof
       for (unsigned i = 0; i < el_dof; i++)
        {
         //Get the equation number
         unsigned eqn_number = el_pt->eqn_number(i);
         
         // Get the velocity dofs
         if (this->block_number(eqn_number)==0)
          {
           // get the index in the block
           unsigned index = this->index_in_block(eqn_number);
           
           // if it is required on this processor
           if ((index >= v_first_row) &&
               (index < (v_first_row + v_nrow_local) ) )
            {
             v_values[index-v_first_row] += el_vmm_diagonal[i];
            }
          }
         // Get the pressure dofs
         else if (this->block_number(eqn_number)==1)
          {
           if (!Use_LSC)
            {
             // get the index in the block
             unsigned index = this->index_in_block(eqn_number);
             
             // if it is required on this processor
             if ((index >= p_first_row)&&
                 (index < (p_first_row + p_nrow_local)) )
              {
               p_values[index-p_first_row] += el_pmm_diagonal[i];
              }
            }
          }
        } 
      }
    }
   
   // Create column index and row start for velocity mass matrix
   int* v_column_index = new int[v_nrow_local]; 
   int* v_row_start = new int[v_nrow_local+1];
   for (unsigned i = 0; i < v_nrow_local; i++)
    {
 #ifdef PARANOID
     if (v_values[i]==0.0)
      {
       std::ostringstream error_message;
       error_message << "Zero entry in diagonal of velocity mass matrix\n"
                     << "Index: " << i << std::endl;
       throw OomphLibError(
        error_message.str(),
        OOMPH_CURRENT_FUNCTION,
        OOMPH_EXCEPTION_LOCATION);
      }
 #endif
     v_values[i] = 1.0/v_values[i];   
     v_column_index[i] = v_first_row + i;
     v_row_start[i] = i;
    }
   v_row_start[v_nrow_local] = v_nrow_local;
   
   // Build the velocity mass matrix
   inv_v_mass_pt = new CRDoubleMatrix(this->block_distribution_pt(0));
   inv_v_mass_pt->build_without_copy(v_nrow,v_nrow_local,
                                     v_values,v_column_index,
                                     v_row_start);
   
   // Create pressure mass matrix
   if (!Use_LSC)
    {
     // Create column index and row start for pressure mass matrix
     int* p_column_index = new int[p_nrow_local];
     int* p_row_start = new int[p_nrow_local+1];
     for (unsigned i = 0; i < p_nrow_local; i++)
      {
       
 #ifdef PARANOID
       if (p_values[i]==0.0)
        {
         std::ostringstream error_message;
         error_message << "Zero entry in diagonal of pressure mass matrix\n"
                       << "Index: " << i << std::endl;
         throw OomphLibError(
          error_message.str(),
          OOMPH_CURRENT_FUNCTION,
          OOMPH_EXCEPTION_LOCATION);
        }
 #endif
       p_values[i] = 1.0/p_values[i];
       
       p_column_index[i] = p_first_row + i;
       p_row_start[i] = i;
      }
     p_row_start[p_nrow_local] = p_nrow_local;
     
     // Build the pressure mass matrix
     inv_p_mass_pt = new CRDoubleMatrix(this->block_distribution_pt(1));
     inv_p_mass_pt->build_without_copy(p_nrow,p_nrow_local,
                                       p_values,p_column_index,
                                       p_row_start);
 
    }
   
  }
 
 //=========================================================================
 /// Helper function to delete preconditioner data.
 //=========================================================================
  void NavierStokesSchurComplementPreconditioner::clean_up_memory()
  {
   if (Preconditioner_has_been_setup)
    {
     // delete matvecs
     delete Bt_mat_vec_pt; 
     Bt_mat_vec_pt = 0;
 
     delete F_mat_vec_pt;
     F_mat_vec_pt = 0;
 
     delete QBt_mat_vec_pt;
     QBt_mat_vec_pt = 0;
 
     delete E_mat_vec_pt;
     E_mat_vec_pt = 0;
     
     // delete stuff from velocity solve
     if (Using_default_f_preconditioner)
      {
       delete F_preconditioner_pt;
       F_preconditioner_pt = 0;
      }
     
     // delete stuff from Schur complement approx
     if (Using_default_p_preconditioner)
      {
       delete P_preconditioner_pt;
       P_preconditioner_pt = 0;
      }
    }
  }
 
 
 }// end oomph namespace