Ifpack2 Templated Preconditioning Package  Version 1.0
Ifpack2_Relaxation_def.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ***********************************************************************
40 //@HEADER
41 */
42 
43 #ifndef IFPACK2_RELAXATION_DEF_HPP
44 #define IFPACK2_RELAXATION_DEF_HPP
45 
46 #include <Teuchos_StandardParameterEntryValidators.hpp>
47 #include <Teuchos_TimeMonitor.hpp>
48 #include <Tpetra_ConfigDefs.hpp>
49 #include <Tpetra_CrsMatrix.hpp>
50 #include <Tpetra_Experimental_BlockCrsMatrix.hpp>
51 #include <Ifpack2_Utilities.hpp>
52 #include <Ifpack2_Relaxation_decl.hpp>
53 
54 // mfh 28 Mar 2013: Uncomment out these three lines to compute
55 // statistics on diagonal entries in compute().
56 // #ifndef IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS
57 // # define IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS 1
58 // #endif // IFPACK2_RELAXATION_COMPUTE_DIAGONAL_STATS
59 
60 namespace {
61  // Validate that a given int is nonnegative.
62  class NonnegativeIntValidator : public Teuchos::ParameterEntryValidator {
63  public:
64  // Constructor (does nothing).
65  NonnegativeIntValidator () {}
66 
67  // ParameterEntryValidator wants this method.
68  Teuchos::ParameterEntryValidator::ValidStringsList validStringValues () const {
69  return Teuchos::null;
70  }
71 
72  // Actually validate the parameter's value.
73  void
74  validate (const Teuchos::ParameterEntry& entry,
75  const std::string& paramName,
76  const std::string& sublistName) const
77  {
78  using std::endl;
79  Teuchos::any anyVal = entry.getAny (true);
80  const std::string entryName = entry.getAny (false).typeName ();
81 
82  TEUCHOS_TEST_FOR_EXCEPTION(
83  anyVal.type () != typeid (int),
84  Teuchos::Exceptions::InvalidParameterType,
85  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
86  << "\" has the wrong type." << endl << "Parameter: " << paramName
87  << endl << "Type specified: " << entryName << endl
88  << "Type required: int" << endl);
89 
90  const int val = Teuchos::any_cast<int> (anyVal);
91  TEUCHOS_TEST_FOR_EXCEPTION(
92  val < 0, Teuchos::Exceptions::InvalidParameterValue,
93  "Parameter \"" << paramName << "\" in sublist \"" << sublistName
94  << "\" is negative." << endl << "Parameter: " << paramName
95  << endl << "Value specified: " << val << endl
96  << "Required range: [0, INT_MAX]" << endl);
97  }
98 
99  // ParameterEntryValidator wants this method.
100  const std::string getXMLTypeName () const {
101  return "NonnegativeIntValidator";
102  }
103 
104  // ParameterEntryValidator wants this method.
105  void
106  printDoc (const std::string& docString,
107  std::ostream &out) const
108  {
109  Teuchos::StrUtils::printLines (out, "# ", docString);
110  out << "#\tValidator Used: " << std::endl;
111  out << "#\t\tNonnegativeIntValidator" << std::endl;
112  }
113  };
114 
115  // A way to get a small positive number (eps() for floating-point
116  // types, or 1 for integer types) when Teuchos::ScalarTraits doesn't
117  // define it (for example, for integer values).
118  template<class Scalar, const bool isOrdinal=Teuchos::ScalarTraits<Scalar>::isOrdinal>
119  class SmallTraits {
120  public:
121  // Return eps if Scalar is a floating-point type, else return 1.
122  static const Scalar eps ();
123  };
124 
125  // Partial specialization for when Scalar is not a floating-point type.
126  template<class Scalar>
127  class SmallTraits<Scalar, true> {
128  public:
129  static const Scalar eps () {
130  return Teuchos::ScalarTraits<Scalar>::one ();
131  }
132  };
133 
134  // Partial specialization for when Scalar is a floating-point type.
135  template<class Scalar>
136  class SmallTraits<Scalar, false> {
137  public:
138  static const Scalar eps () {
139  return Teuchos::ScalarTraits<Scalar>::eps ();
140  }
141  };
142 } // namespace (anonymous)
143 
144 namespace Ifpack2 {
145 
146 template<class MatrixType>
147 void Relaxation<MatrixType>::
148 setMatrix (const Teuchos::RCP<const row_matrix_type>& A)
149 {
150  if (A.getRawPtr () != A_.getRawPtr ()) { // it's a different matrix
151  Importer_ = Teuchos::null;
152  Diagonal_ = Teuchos::null; // ??? what if this comes from the user???
153  isInitialized_ = false;
154  IsComputed_ = false;
155  diagOffsets_ = Teuchos::null;
156  savedDiagOffsets_ = false;
157  hasBlockCrsMatrix_ = false;
158  if (! A.is_null ()) {
159  IsParallel_ = (A->getRowMap ()->getComm ()->getSize () > 1);
160  }
161  A_ = A;
162  }
163 }
164 
165 
166 template<class MatrixType>
168 Relaxation (const Teuchos::RCP<const row_matrix_type>& A)
169 : A_ (A),
170  Time_ (Teuchos::rcp (new Teuchos::Time ("Ifpack2::Relaxation"))),
171  NumSweeps_ (1),
172  PrecType_ (Ifpack2::Details::JACOBI),
173  DampingFactor_ (STS::one ()),
174  IsParallel_ ((A.is_null () || A->getRowMap ().is_null () || A->getRowMap ()->getComm ().is_null ()) ?
175  false : // a reasonable default if there's no communicator
176  A->getRowMap ()->getComm ()->getSize () > 1),
177  ZeroStartingSolution_ (true),
178  DoBackwardGS_ (false),
179  DoL1Method_ (false),
180  L1Eta_ (Teuchos::as<magnitude_type> (1.5)),
181  MinDiagonalValue_ (STS::zero ()),
182  fixTinyDiagEntries_ (false),
183  checkDiagEntries_ (false),
184  isInitialized_ (false),
185  IsComputed_ (false),
186  NumInitialize_ (0),
187  NumCompute_ (0),
188  NumApply_ (0),
189  InitializeTime_ (0.0), // Times are double anyway, so no need for ScalarTraits.
190  ComputeTime_ (0.0),
191  ApplyTime_ (0.0),
192  ComputeFlops_ (0.0),
193  ApplyFlops_ (0.0),
194  globalMinMagDiagEntryMag_ (STM::zero ()),
195  globalMaxMagDiagEntryMag_ (STM::zero ()),
196  globalNumSmallDiagEntries_ (0),
197  globalNumZeroDiagEntries_ (0),
198  globalNumNegDiagEntries_ (0),
199  globalDiagNormDiff_(Teuchos::ScalarTraits<magnitude_type>::zero()),
200  savedDiagOffsets_ (false),
201  hasBlockCrsMatrix_ (false)
202 {
203  this->setObjectLabel ("Ifpack2::Relaxation");
204 }
205 
206 //==========================================================================
207 template<class MatrixType>
209 }
210 
211 template<class MatrixType>
212 Teuchos::RCP<const Teuchos::ParameterList>
214 {
215  using Teuchos::Array;
216  using Teuchos::ParameterList;
217  using Teuchos::parameterList;
218  using Teuchos::RCP;
219  using Teuchos::rcp;
220  using Teuchos::rcp_const_cast;
221  using Teuchos::rcp_implicit_cast;
222  using Teuchos::setStringToIntegralParameter;
223  typedef Teuchos::ParameterEntryValidator PEV;
224 
225  if (validParams_.is_null ()) {
226  RCP<ParameterList> pl = parameterList ("Ifpack2::Relaxation");
227 
228  // Set a validator that automatically converts from the valid
229  // string options to their enum values.
230  Array<std::string> precTypes (3);
231  precTypes[0] = "Jacobi";
232  precTypes[1] = "Gauss-Seidel";
233  precTypes[2] = "Symmetric Gauss-Seidel";
234  Array<Details::RelaxationType> precTypeEnums (3);
235  precTypeEnums[0] = Details::JACOBI;
236  precTypeEnums[1] = Details::GS;
237  precTypeEnums[2] = Details::SGS;
238  const std::string defaultPrecType ("Jacobi");
239  setStringToIntegralParameter<Details::RelaxationType> ("relaxation: type",
240  defaultPrecType, "Relaxation method", precTypes (), precTypeEnums (),
241  pl.getRawPtr ());
242 
243  const int numSweeps = 1;
244  RCP<PEV> numSweepsValidator =
245  rcp_implicit_cast<PEV> (rcp (new NonnegativeIntValidator));
246  pl->set ("relaxation: sweeps", numSweeps, "Number of relaxation sweeps",
247  rcp_const_cast<const PEV> (numSweepsValidator));
248 
249  const scalar_type dampingFactor = STS::one ();
250  pl->set ("relaxation: damping factor", dampingFactor);
251 
252  const bool zeroStartingSolution = true;
253  pl->set ("relaxation: zero starting solution", zeroStartingSolution);
254 
255  const bool doBackwardGS = false;
256  pl->set ("relaxation: backward mode", doBackwardGS);
257 
258  const bool doL1Method = false;
259  pl->set ("relaxation: use l1", doL1Method);
260 
261  const magnitude_type l1eta = (STM::one() + STM::one() + STM::one()) /
262  (STM::one() + STM::one()); // 1.5
263  pl->set ("relaxation: l1 eta", l1eta);
264 
265  const scalar_type minDiagonalValue = STS::zero ();
266  pl->set ("relaxation: min diagonal value", minDiagonalValue);
267 
268  const bool fixTinyDiagEntries = false;
269  pl->set ("relaxation: fix tiny diagonal entries", fixTinyDiagEntries);
270 
271  const bool checkDiagEntries = false;
272  pl->set ("relaxation: check diagonal entries", checkDiagEntries);
273 
274  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = Teuchos::null;
275  pl->set("relaxation: local smoothing indices", localSmoothingIndices);
276 
277  validParams_ = rcp_const_cast<const ParameterList> (pl);
278  }
279  return validParams_;
280 }
281 
282 
283 template<class MatrixType>
284 void Relaxation<MatrixType>::setParametersImpl (Teuchos::ParameterList& pl)
285 {
286  using Teuchos::getIntegralValue;
287  using Teuchos::ParameterList;
288  using Teuchos::RCP;
289  typedef scalar_type ST; // just to make code below shorter
290 
291  pl.validateParametersAndSetDefaults (* getValidParameters ());
292 
293  const Details::RelaxationType precType =
294  getIntegralValue<Details::RelaxationType> (pl, "relaxation: type");
295  const int numSweeps = pl.get<int> ("relaxation: sweeps");
296  const ST dampingFactor = pl.get<ST> ("relaxation: damping factor");
297  const bool zeroStartSol = pl.get<bool> ("relaxation: zero starting solution");
298  const bool doBackwardGS = pl.get<bool> ("relaxation: backward mode");
299  const bool doL1Method = pl.get<bool> ("relaxation: use l1");
300  const magnitude_type l1Eta = pl.get<magnitude_type> ("relaxation: l1 eta");
301  const ST minDiagonalValue = pl.get<ST> ("relaxation: min diagonal value");
302  const bool fixTinyDiagEntries = pl.get<bool> ("relaxation: fix tiny diagonal entries");
303  const bool checkDiagEntries = pl.get<bool> ("relaxation: check diagonal entries");
304  Teuchos::ArrayRCP<local_ordinal_type> localSmoothingIndices = pl.get<Teuchos::ArrayRCP<local_ordinal_type> >("relaxation: local smoothing indices");
305 
306 
307  // "Commit" the changes, now that we've validated everything.
308  PrecType_ = precType;
309  NumSweeps_ = numSweeps;
310  DampingFactor_ = dampingFactor;
311  ZeroStartingSolution_ = zeroStartSol;
312  DoBackwardGS_ = doBackwardGS;
313  DoL1Method_ = doL1Method;
314  L1Eta_ = l1Eta;
315  MinDiagonalValue_ = minDiagonalValue;
316  fixTinyDiagEntries_ = fixTinyDiagEntries;
317  checkDiagEntries_ = checkDiagEntries;
318  localSmoothingIndices_ = localSmoothingIndices;
319 }
320 
321 
322 template<class MatrixType>
323 void Relaxation<MatrixType>::setParameters (const Teuchos::ParameterList& pl)
324 {
325  // FIXME (aprokop 18 Oct 2013) Casting away const is bad here.
326  // but otherwise, we will get [unused] in pl
327  this->setParametersImpl(const_cast<Teuchos::ParameterList&>(pl));
328 }
329 
330 
331 template<class MatrixType>
332 Teuchos::RCP<const Teuchos::Comm<int> >
334  TEUCHOS_TEST_FOR_EXCEPTION(
335  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getComm: "
336  "The input matrix A is null. Please call setMatrix() with a nonnull "
337  "input matrix before calling this method.");
338  return A_->getRowMap ()->getComm ();
339 }
340 
341 
342 template<class MatrixType>
343 Teuchos::RCP<const typename Relaxation<MatrixType>::row_matrix_type>
345  return A_;
346 }
347 
348 
349 template<class MatrixType>
350 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
351  typename MatrixType::global_ordinal_type,
352  typename MatrixType::node_type> >
354  TEUCHOS_TEST_FOR_EXCEPTION(
355  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getDomainMap: "
356  "The input matrix A is null. Please call setMatrix() with a nonnull "
357  "input matrix before calling this method.");
358  return A_->getDomainMap ();
359 }
360 
361 
362 template<class MatrixType>
363 Teuchos::RCP<const Tpetra::Map<typename MatrixType::local_ordinal_type,
364  typename MatrixType::global_ordinal_type,
365  typename MatrixType::node_type> >
367  TEUCHOS_TEST_FOR_EXCEPTION(
368  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::getRangeMap: "
369  "The input matrix A is null. Please call setMatrix() with a nonnull "
370  "input matrix before calling this method.");
371  return A_->getRangeMap ();
372 }
373 
374 
375 template<class MatrixType>
377  return true;
378 }
379 
380 
381 template<class MatrixType>
383  return(NumInitialize_);
384 }
385 
386 
387 template<class MatrixType>
389  return(NumCompute_);
390 }
391 
392 
393 template<class MatrixType>
395  return(NumApply_);
396 }
397 
398 
399 template<class MatrixType>
401  return(InitializeTime_);
402 }
403 
404 
405 template<class MatrixType>
407  return(ComputeTime_);
408 }
409 
410 
411 template<class MatrixType>
413  return(ApplyTime_);
414 }
415 
416 
417 template<class MatrixType>
419  return(ComputeFlops_);
420 }
421 
422 
423 template<class MatrixType>
425  return(ApplyFlops_);
426 }
427 
428 
429 template<class MatrixType>
430 void
432 apply (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
433  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
434  Teuchos::ETransp mode,
435  scalar_type alpha,
436  scalar_type beta) const
437 {
438  using Teuchos::as;
439  using Teuchos::RCP;
440  using Teuchos::rcp;
441  using Teuchos::rcpFromRef;
442  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
444  TEUCHOS_TEST_FOR_EXCEPTION(
445  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::apply: "
446  "The input matrix A is null. Please call setMatrix() with a nonnull "
447  "input matrix, then call compute(), before calling this method.");
448  TEUCHOS_TEST_FOR_EXCEPTION(
449  ! isComputed (),
450  std::runtime_error,
451  "Ifpack2::Relaxation::apply: You must call compute() on this Ifpack2 "
452  "preconditioner instance before you may call apply(). You may call "
453  "isComputed() to find out if compute() has been called already.");
454  TEUCHOS_TEST_FOR_EXCEPTION(
455  X.getNumVectors() != Y.getNumVectors(),
456  std::runtime_error,
457  "Ifpack2::Relaxation::apply: X and Y have different numbers of columns. "
458  "X has " << X.getNumVectors() << " columns, but Y has "
459  << Y.getNumVectors() << " columns.");
460  TEUCHOS_TEST_FOR_EXCEPTION(
461  beta != STS::zero (), std::logic_error,
462  "Ifpack2::Relaxation::apply: beta = " << beta << " != 0 case not "
463  "implemented.");
464  {
465  // Reset the timer each time, since Relaxation uses the same Time
466  // object to track times for different methods.
467  Teuchos::TimeMonitor timeMon (*Time_, true);
468 
469  // Special case: alpha == 0.
470  if (alpha == STS::zero ()) {
471  // No floating-point operations, so no need to update a count.
472  Y.putScalar (STS::zero ());
473  }
474  else {
475  // If X and Y alias one another, then we need to create an
476  // auxiliary vector, Xcopy (a deep copy of X).
477  RCP<const MV> Xcopy;
478  // FIXME (mfh 12 Sep 2014) This test for aliasing is incomplete.
479  {
480  auto X_lcl_host = X.template getLocalView<Kokkos::HostSpace> ();
481  auto Y_lcl_host = Y.template getLocalView<Kokkos::HostSpace> ();
482  if (X_lcl_host.ptr_on_device () == Y_lcl_host.ptr_on_device ()) {
483  Xcopy = rcp (new MV (X, Teuchos::Copy));
484  } else {
485  Xcopy = rcpFromRef (X);
486  }
487  }
488 
489  // Each of the following methods updates the flop count itself.
490  // All implementations handle zeroing out the starting solution
491  // (if necessary) themselves.
492  switch (PrecType_) {
493  case Ifpack2::Details::JACOBI:
494  ApplyInverseJacobi(*Xcopy,Y);
495  break;
496  case Ifpack2::Details::GS:
497  ApplyInverseGS(*Xcopy,Y);
498  break;
499  case Ifpack2::Details::SGS:
500  ApplyInverseSGS(*Xcopy,Y);
501  break;
502  default:
503  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
504  "Ifpack2::Relaxation::apply: Invalid preconditioner type enum value "
505  << PrecType_ << ". Please report this bug to the Ifpack2 developers.");
506  }
507  if (alpha != STS::one ()) {
508  Y.scale (alpha);
509  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
510  const double numVectors = as<double> (Y.getNumVectors ());
511  ApplyFlops_ += numGlobalRows * numVectors;
512  }
513  }
514  }
515  ApplyTime_ += Time_->totalElapsedTime ();
516  ++NumApply_;
517 }
518 
519 
520 template<class MatrixType>
521 void
523 applyMat (const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
524  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
525  Teuchos::ETransp mode) const
526 {
527  TEUCHOS_TEST_FOR_EXCEPTION(
528  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
529  "The input matrix A is null. Please call setMatrix() with a nonnull "
530  "input matrix, then call compute(), before calling this method.");
531  TEUCHOS_TEST_FOR_EXCEPTION(
532  ! isComputed (), std::runtime_error, "Ifpack2::Relaxation::applyMat: "
533  "isComputed() must be true before you may call applyMat(). "
534  "Please call compute() before calling this method.");
535  TEUCHOS_TEST_FOR_EXCEPTION(
536  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
537  "Ifpack2::Relaxation::applyMat: X.getNumVectors() = " << X.getNumVectors ()
538  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
539  A_->apply (X, Y, mode);
540 }
541 
542 
543 template<class MatrixType>
545 {
546  TEUCHOS_TEST_FOR_EXCEPTION(
547  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::initialize: "
548  "The input matrix A is null. Please call setMatrix() with a nonnull "
549  "input matrix before calling this method.");
550 
551  if (A_.is_null ()) {
552  hasBlockCrsMatrix_ = false;
553  }
554  else { // A_ is not null
555  Teuchos::RCP<const block_crs_matrix_type> A_bcrs =
556  Teuchos::rcp_dynamic_cast<const block_crs_matrix_type> (A_);
557  if (A_bcrs.is_null ()) {
558  hasBlockCrsMatrix_ = false;
559  }
560  else { // A_ is a block_crs_matrix_type
561  hasBlockCrsMatrix_ = true;
562  }
563  }
564 
565  // Initialization for Relaxation is trivial, so we say it takes zero time.
566  //InitializeTime_ += Time_->totalElapsedTime ();
567  ++NumInitialize_;
568  isInitialized_ = true;
569 }
570 
571 template<class MatrixType>
573 {
574  using Teuchos::Array;
575  using Teuchos::ArrayRCP;
576  using Teuchos::ArrayView;
577  using Teuchos::as;
578  using Teuchos::Comm;
579  using Teuchos::RCP;
580  using Teuchos::rcp;
581  using Teuchos::REDUCE_MAX;
582  using Teuchos::REDUCE_MIN;
583  using Teuchos::REDUCE_SUM;
584  using Teuchos::rcp_dynamic_cast;
585  using Teuchos::reduceAll;
586 
587  {
588  // Reset the timer each time, since Relaxation uses the same Time
589  // object to track times for different methods.
590  Teuchos::TimeMonitor timeMon (*Time_, true);
591 
592  TEUCHOS_TEST_FOR_EXCEPTION(
593  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::"
594  "computeBlockCrs: The input matrix A is null. Please call setMatrix() "
595  "with a nonnull input matrix, then call initialize(), before calling "
596  "this method.");
597  const block_crs_matrix_type* blockCrsA =
598  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
599  TEUCHOS_TEST_FOR_EXCEPTION(
600  blockCrsA == NULL, std::logic_error, "Ifpack2::Relaxation::"
601  "computeBlockCrs: A_ is not a BlockCrsMatrix, but it should be if we "
602  "got this far. Please report this bug to the Ifpack2 developers.");
603 
604  const scalar_type one = STS::one ();
605 
606  // Reset state.
607  IsComputed_ = false;
608 
609  const local_ordinal_type blockSize = blockCrsA->getBlockSize ();
610 
611  if (! savedDiagOffsets_) {
612  BlockDiagonal_ = Teuchos::null;
613  BlockDiagonal_ = rcp(new block_crs_matrix_type(*Ifpack2::Details::computeDiagonalGraph(blockCrsA->getCrsGraph()), blockSize));
614  blockCrsA->getLocalDiagOffsets (diagOffsets_);
615  savedDiagOffsets_ = true;
616  }
617  blockCrsA->getLocalDiagCopy (*BlockDiagonal_, diagOffsets_ ());
618 
619  const size_t numMyRows = A_->getNodeNumRows ();
620 
621  if (DoL1Method_ && IsParallel_) {
622  const scalar_type two = one + one;
623  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
624  Array<local_ordinal_type> indices (maxLength);
625  Array<scalar_type> values (maxLength * blockSize * blockSize);
626  size_t numEntries = 0;
627 
628  for (size_t i = 0; i < numMyRows; ++i) {
629  blockCrsA->getLocalRowCopy (i, indices (), values (), numEntries);
630  scalar_type* diagBlock = (scalar_type*) BlockDiagonal_->getLocalBlock (i,i).getRawPtr ();
631  for (local_ordinal_type subRow = 0; subRow < blockSize; ++subRow) {
632  magnitude_type diagonal_boost = STM::zero ();
633  for (size_t k = 0 ; k < numEntries ; ++k) {
634  if (static_cast<size_t> (indices[k]) > numMyRows) {
635  const size_t offset = blockSize*blockSize*k + subRow*blockSize;
636  for (local_ordinal_type subCol = 0; subCol < blockSize; ++subCol) {
637  diagonal_boost += STS::magnitude (values[offset+subCol] / two);
638  }
639  }
640  }
641  if (STS::magnitude (diagBlock[subRow*(blockSize+1)]) < L1Eta_ * diagonal_boost) {
642  diagBlock[subRow*(blockSize+1)] += diagonal_boost;
643  }
644  }
645  }
646  }
647 
648  blockDiagonalFactorizationPivots.resize (numMyRows * blockSize);
649  int info;
650  for (size_t i = 0 ; i < numMyRows; ++i) {
651  typename block_crs_matrix_type::little_block_type diagBlock =
652  BlockDiagonal_->getLocalBlock (i, i);
653  diagBlock.factorize (&blockDiagonalFactorizationPivots[i*blockSize], info);
654  }
655 
656  Importer_ = A_->getGraph ()->getImporter ();
657  } // end TimeMonitor scope
658 
659  ComputeTime_ += Time_->totalElapsedTime ();
660  ++NumCompute_;
661  IsComputed_ = true;
662 }
663 
664 template<class MatrixType>
666 {
667  using Teuchos::Array;
668  using Teuchos::ArrayRCP;
669  using Teuchos::ArrayView;
670  using Teuchos::as;
671  using Teuchos::Comm;
672  using Teuchos::RCP;
673  using Teuchos::rcp;
674  using Teuchos::REDUCE_MAX;
675  using Teuchos::REDUCE_MIN;
676  using Teuchos::REDUCE_SUM;
677  using Teuchos::rcp_dynamic_cast;
678  using Teuchos::reduceAll;
679  typedef Tpetra::Vector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> vector_type;
680  const scalar_type zero = STS::zero ();
681  const scalar_type one = STS::one ();
682 
683  // We don't count initialization in compute() time.
684  if (! isInitialized ()) {
685  initialize ();
686  }
687 
688  if (hasBlockCrsMatrix_) {
689  computeBlockCrs ();
690  return;
691  }
692 
693  {
694  // Reset the timer each time, since Relaxation uses the same Time
695  // object to track times for different methods.
696  Teuchos::TimeMonitor timeMon (*Time_, true);
697 
698  TEUCHOS_TEST_FOR_EXCEPTION(
699  A_.is_null (), std::runtime_error, "Ifpack2::Relaxation::compute: "
700  "The input matrix A is null. Please call setMatrix() with a nonnull "
701  "input matrix, then call initialize(), before calling this method.");
702 
703  // Reset state.
704  IsComputed_ = false;
705 
706  TEUCHOS_TEST_FOR_EXCEPTION(
707  NumSweeps_ < 0, std::logic_error,
708  "Ifpack2::Relaxation::compute: NumSweeps_ = " << NumSweeps_ << " < 0. "
709  "Please report this bug to the Ifpack2 developers.");
710 
711  Diagonal_ = rcp (new vector_type (A_->getRowMap ()));
712 
713  TEUCHOS_TEST_FOR_EXCEPTION(
714  Diagonal_.is_null (), std::logic_error,
715  "Ifpack2::Relaxation::compute: Vector of diagonal entries has not been "
716  "created yet. Please report this bug to the Ifpack2 developers.");
717 
718  // Extract the diagonal entries. The CrsMatrix static graph
719  // version is faster for subsequent calls to compute(), since it
720  // caches the diagonal offsets.
721  //
722  // isStaticGraph() == true means that the matrix was created with
723  // a const graph. The only requirement is that the structure of
724  // the matrix never changes, so isStaticGraph() == true is a bit
725  // more conservative than we need. However, Tpetra doesn't (as of
726  // 05 Apr 2013) have a way to tell if the graph hasn't changed
727  // since the last time we used it.
728  {
729  // NOTE (mfh 07 Jul 2013): We must cast here to CrsMatrix
730  // instead of MatrixType, because isStaticGraph is a CrsMatrix
731  // method (not inherited from RowMatrix's interface). It's
732  // perfectly valid to do relaxation on a RowMatrix which is not
733  // a CrsMatrix.
734  const crs_matrix_type* crsMat =
735  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
736  if (crsMat == NULL || ! crsMat->isStaticGraph ()) {
737  A_->getLocalDiagCopy (*Diagonal_); // slow path
738  } else {
739  if (! savedDiagOffsets_) { // we haven't precomputed offsets
740  crsMat->getLocalDiagOffsets (diagOffsets_);
741  savedDiagOffsets_ = true;
742  }
743  crsMat->getLocalDiagCopy (*Diagonal_, diagOffsets_ ());
744 #ifdef HAVE_TPETRA_DEBUG
745  // Validate the fast-path diagonal against the slow-path diagonal.
746  vector_type D_copy (A_->getRowMap ());
747  A_->getLocalDiagCopy (D_copy);
748  D_copy.update (STS::one (), *Diagonal_, -STS::one ());
749  const magnitude_type err = D_copy.normInf ();
750  // The two diagonals should be exactly the same, so their
751  // difference should be exactly zero.
752  TEUCHOS_TEST_FOR_EXCEPTION(
753  err != STM::zero(), std::logic_error, "Ifpack2::Relaxation::compute: "
754  "\"fast-path\" diagonal computation failed. \\|D1 - D2\\|_inf = "
755  << err << ".");
756 #endif // HAVE_TPETRA_DEBUG
757  }
758  }
759 
760  // If we're checking the computed inverse diagonal, then keep a
761  // copy of the original diagonal entries for later comparison.
762  RCP<vector_type> origDiag;
763  if (checkDiagEntries_) {
764  origDiag = rcp (new vector_type (A_->getRowMap ()));
765  Tpetra::deep_copy (*origDiag, *Diagonal_);
766  }
767 
768  // "Host view" means that if the Node type is a GPU Node, the
769  // ArrayRCP points to host memory, not device memory. It will
770  // write back to device memory (Diagonal_) at end of scope.
771  ArrayRCP<scalar_type> diagHostView = Diagonal_->get1dViewNonConst ();
772 
773  // The view below is only valid as long as diagHostView is within
774  // scope. We extract a raw pointer in release mode because we
775  // don't trust older versions of compilers (like GCC 4.4.x or
776  // Intel < 13) to optimize away ArrayView::operator[].
777 #ifdef HAVE_IFPACK2_DEBUG
778  ArrayView<scalar_type> diag = diagHostView ();
779 #else
780  scalar_type* KOKKOSCLASSIC_RESTRICT const diag = diagHostView.getRawPtr ();
781 #endif // HAVE_IFPACK2_DEBUG
782 
783  const size_t numMyRows = A_->getNodeNumRows ();
784 
785  // Setup for L1 Methods.
786  // Here we add half the value of the off-processor entries in the row,
787  // but only if diagonal isn't sufficiently large.
788  //
789  // This follows from Equation (6.5) in: Baker, Falgout, Kolev and
790  // Yang. "Multigrid Smoothers for Ultraparallel Computing." SIAM
791  // J. Sci. Comput., Vol. 33, No. 5. (2011), pp. 2864-2887.
792  if (DoL1Method_ && IsParallel_) {
793  const scalar_type two = one + one;
794  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
795  Array<local_ordinal_type> indices (maxLength);
796  Array<scalar_type> values (maxLength);
797  size_t numEntries;
798 
799  for (size_t i = 0; i < numMyRows; ++i) {
800  A_->getLocalRowCopy (i, indices (), values (), numEntries);
801  magnitude_type diagonal_boost = STM::zero ();
802  for (size_t k = 0 ; k < numEntries ; ++k) {
803  if (static_cast<size_t> (indices[k]) > numMyRows) {
804  diagonal_boost += STS::magnitude (values[k] / two);
805  }
806  }
807  if (STS::magnitude (diag[i]) < L1Eta_ * diagonal_boost) {
808  diag[i] += diagonal_boost;
809  }
810  }
811 
812  }
813 
814  //
815  // Invert the diagonal entries of the matrix (not in place).
816  //
817 
818  // Precompute some quantities for "fixing" zero or tiny diagonal
819  // entries. We'll only use them if this "fixing" is enabled.
820  //
821  // SmallTraits covers for the lack of eps() in
822  // Teuchos::ScalarTraits when its template parameter is not a
823  // floating-point type. (Ifpack2 sometimes gets instantiated for
824  // integer Scalar types.)
825  const scalar_type oneOverMinDiagVal = (MinDiagonalValue_ == zero) ?
826  one / SmallTraits<scalar_type>::eps () :
827  one / MinDiagonalValue_;
828  // It's helpful not to have to recompute this magnitude each time.
829  const magnitude_type minDiagValMag = STS::magnitude (MinDiagonalValue_);
830 
831  if (checkDiagEntries_) {
832  //
833  // Check diagonal elements, replace zero elements with the minimum
834  // diagonal value, and store their inverses. Count the number of
835  // "small" and zero diagonal entries while we're at it.
836  //
837  size_t numSmallDiagEntries = 0; // "small" includes zero
838  size_t numZeroDiagEntries = 0; // # zero diagonal entries
839  size_t numNegDiagEntries = 0; // # negative (real parts of) diagonal entries
840 
841  // As we go, keep track of the diagonal entries with the least and
842  // greatest magnitude. We could use the trick of starting the min
843  // with +Inf and the max with -Inf, but that doesn't work if
844  // scalar_type is a built-in integer type. Thus, we have to start
845  // by reading the first diagonal entry redundantly.
846  // scalar_type minMagDiagEntry = zero;
847  // scalar_type maxMagDiagEntry = zero;
848  magnitude_type minMagDiagEntryMag = STM::zero ();
849  magnitude_type maxMagDiagEntryMag = STM::zero ();
850  if (numMyRows > 0) {
851  const scalar_type d_0 = diag[0];
852  const magnitude_type d_0_mag = STS::magnitude (d_0);
853  // minMagDiagEntry = d_0;
854  // maxMagDiagEntry = d_0;
855  minMagDiagEntryMag = d_0_mag;
856  maxMagDiagEntryMag = d_0_mag;
857  }
858 
859  // Go through all the diagonal entries. Compute counts of
860  // small-magnitude, zero, and negative-real-part entries. Invert
861  // the diagonal entries that aren't too small. For those that are
862  // too small in magnitude, replace them with 1/MinDiagonalValue_
863  // (or 1/eps if MinDiagonalValue_ happens to be zero).
864  for (size_t i = 0 ; i < numMyRows; ++i) {
865  const scalar_type d_i = diag[i];
866  const magnitude_type d_i_mag = STS::magnitude (d_i);
867  const magnitude_type d_i_real = STS::real (d_i);
868 
869  // We can't compare complex numbers, but we can compare their
870  // real parts.
871  if (d_i_real < STM::zero ()) {
872  ++numNegDiagEntries;
873  }
874  if (d_i_mag < minMagDiagEntryMag) {
875  // minMagDiagEntry = d_i;
876  minMagDiagEntryMag = d_i_mag;
877  }
878  if (d_i_mag > maxMagDiagEntryMag) {
879  // maxMagDiagEntry = d_i;
880  maxMagDiagEntryMag = d_i_mag;
881  }
882 
883  if (fixTinyDiagEntries_) {
884  // <= not <, in case minDiagValMag is zero.
885  if (d_i_mag <= minDiagValMag) {
886  ++numSmallDiagEntries;
887  if (d_i_mag == STM::zero ()) {
888  ++numZeroDiagEntries;
889  }
890  diag[i] = oneOverMinDiagVal;
891  } else {
892  diag[i] = one / d_i;
893  }
894  }
895  else { // Don't fix zero or tiny diagonal entries.
896  // <= not <, in case minDiagValMag is zero.
897  if (d_i_mag <= minDiagValMag) {
898  ++numSmallDiagEntries;
899  if (d_i_mag == STM::zero ()) {
900  ++numZeroDiagEntries;
901  }
902  }
903  diag[i] = one / d_i;
904  }
905  }
906 
907  // We're done computing the inverse diagonal, so invalidate the view.
908  // This ensures that the operations below, that use methods on Vector,
909  // produce correct results even on a GPU Node.
910  diagHostView = Teuchos::null;
911 
912  // Count floating-point operations of computing the inverse diagonal.
913  //
914  // FIXME (mfh 30 Mar 2013) Shouldn't counts be global, not local?
915  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
916  ComputeFlops_ += 4.0 * numMyRows;
917  } else {
918  ComputeFlops_ += numMyRows;
919  }
920 
921  // Collect global data about the diagonal entries. Only do this
922  // (which involves O(1) all-reduces) if the user asked us to do
923  // the extra work.
924  //
925  // FIXME (mfh 28 Mar 2013) This is wrong if some processes have
926  // zero rows. Fixing this requires one of two options:
927  //
928  // 1. Use a custom reduction operation that ignores processes
929  // with zero diagonal entries.
930  // 2. Split the communicator, compute all-reduces using the
931  // subcommunicator over processes that have a nonzero number
932  // of diagonal entries, and then broadcast from one of those
933  // processes (if there is one) to the processes in the other
934  // subcommunicator.
935  const Comm<int>& comm = * (A_->getRowMap ()->getComm ());
936 
937  // Compute global min and max magnitude of entries.
938  Array<magnitude_type> localVals (2);
939  localVals[0] = minMagDiagEntryMag;
940  // (- (min (- x))) is the same as (max x).
941  localVals[1] = -maxMagDiagEntryMag;
942  Array<magnitude_type> globalVals (2);
943  reduceAll<int, magnitude_type> (comm, REDUCE_MIN, 2,
944  localVals.getRawPtr (),
945  globalVals.getRawPtr ());
946  globalMinMagDiagEntryMag_ = globalVals[0];
947  globalMaxMagDiagEntryMag_ = -globalVals[1];
948 
949  Array<size_t> localCounts (3);
950  localCounts[0] = numSmallDiagEntries;
951  localCounts[1] = numZeroDiagEntries;
952  localCounts[2] = numNegDiagEntries;
953  Array<size_t> globalCounts (3);
954  reduceAll<int, size_t> (comm, REDUCE_SUM, 3,
955  localCounts.getRawPtr (),
956  globalCounts.getRawPtr ());
957  globalNumSmallDiagEntries_ = globalCounts[0];
958  globalNumZeroDiagEntries_ = globalCounts[1];
959  globalNumNegDiagEntries_ = globalCounts[2];
960 
961  // Forestall "set but not used" compiler warnings.
962  // (void) minMagDiagEntry;
963  // (void) maxMagDiagEntry;
964 
965  // Compute and save the difference between the computed inverse
966  // diagonal, and the original diagonal's inverse.
967  vector_type diff (A_->getRowMap ());
968  diff.reciprocal (*origDiag);
969  diff.update (-one, *Diagonal_, one);
970  globalDiagNormDiff_ = diff.norm2 ();
971  }
972  else { // don't check diagonal elements
973  if (fixTinyDiagEntries_) {
974  // Go through all the diagonal entries. Invert those that
975  // aren't too small in magnitude. For those that are too
976  // small in magnitude, replace them with oneOverMinDiagVal.
977  for (size_t i = 0 ; i < numMyRows; ++i) {
978  const scalar_type d_i = diag[i];
979  const magnitude_type d_i_mag = STS::magnitude (d_i);
980 
981  // <= not <, in case minDiagValMag is zero.
982  if (d_i_mag <= minDiagValMag) {
983  diag[i] = oneOverMinDiagVal;
984  } else {
985  diag[i] = one / d_i;
986  }
987  }
988  }
989  else { // don't fix tiny or zero diagonal entries
990  for (size_t i = 0 ; i < numMyRows; ++i) {
991  diag[i] = one / diag[i];
992  }
993  }
994  if (STS::isComplex) { // magnitude: at least 3 flops per diagonal entry
995  ComputeFlops_ += 4.0 * numMyRows;
996  } else {
997  ComputeFlops_ += numMyRows;
998  }
999  }
1000 
1001  if (IsParallel_ && (PrecType_ == Ifpack2::Details::GS ||
1002  PrecType_ == Ifpack2::Details::SGS)) {
1003  Importer_ = A_->getGraph ()->getImporter ();
1004  // mfh 21 Mar 2013: The Import object may be null, but in that
1005  // case, the domain and column Maps are the same and we don't
1006  // need to Import anyway.
1007  }
1008  } // end TimeMonitor scope
1009 
1010  ComputeTime_ += Time_->totalElapsedTime ();
1011  ++NumCompute_;
1012  IsComputed_ = true;
1013 }
1014 
1015 
1016 template<class MatrixType>
1017 void
1019 ApplyInverseJacobi (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1020  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1021 {
1022  using Teuchos::as;
1023  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
1025 
1026  if (hasBlockCrsMatrix_) {
1027  ApplyInverseJacobi_BlockCrsMatrix (X, Y);
1028  return;
1029  }
1030 
1031  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1032  const double numVectors = as<double> (X.getNumVectors ());
1033  if (ZeroStartingSolution_) {
1034  // For the first Jacobi sweep, if we are allowed to assume that
1035  // the initial guess is zero, then Jacobi is just diagonal
1036  // scaling. (A_ij * x_j = 0 for i != j, since x_j = 0.)
1037  // Compute it as Y(i,j) = DampingFactor_ * X(i,j) * D(i).
1038  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, X, STS::zero ());
1039 
1040  // Count (global) floating-point operations. Ifpack2 represents
1041  // this as a floating-point number rather than an integer, so that
1042  // overflow (for a very large number of calls, or a very large
1043  // problem) is approximate instead of catastrophic.
1044  double flopUpdate = 0.0;
1045  if (DampingFactor_ == STS::one ()) {
1046  // Y(i,j) = X(i,j) * D(i): one multiply for each entry of Y.
1047  flopUpdate = numGlobalRows * numVectors;
1048  } else {
1049  // Y(i,j) = DampingFactor_ * X(i,j) * D(i):
1050  // Two multiplies per entry of Y.
1051  flopUpdate = 2.0 * numGlobalRows * numVectors;
1052  }
1053  ApplyFlops_ += flopUpdate;
1054  if (NumSweeps_ == 1) {
1055  return;
1056  }
1057  }
1058  // If we were allowed to assume that the starting guess was zero,
1059  // then we have already done the first sweep above.
1060  const int startSweep = ZeroStartingSolution_ ? 1 : 0;
1061  // We don't need to initialize the result MV, since the sparse
1062  // mat-vec will clobber its contents anyway.
1063  MV A_times_Y (Y.getMap (), as<size_t>(numVectors), false);
1064  for (int j = startSweep; j < NumSweeps_; ++j) {
1065  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1066  applyMat (Y, A_times_Y);
1067  A_times_Y.update (STS::one (), X, -STS::one ());
1068  Y.elementWiseMultiply (DampingFactor_, *Diagonal_, A_times_Y, STS::one ());
1069  }
1070 
1071  // For each column of output, for each pass over the matrix:
1072  //
1073  // - One + and one * for each matrix entry
1074  // - One / and one + for each row of the matrix
1075  // - If the damping factor is not one: one * for each row of the
1076  // matrix. (It's not fair to count this if the damping factor is
1077  // one, since the implementation could skip it. Whether it does
1078  // or not is the implementation's choice.)
1079 
1080  // Floating-point operations due to the damping factor, per matrix
1081  // row, per direction, per columm of output.
1082  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1083  const double dampingFlops = (DampingFactor_ == STS::one ()) ? 0.0 : 1.0;
1084  ApplyFlops_ += as<double> (NumSweeps_ - startSweep) * numVectors *
1085  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1086 }
1087 
1088 template<class MatrixType>
1089 void
1093  Tpetra::MultiVector<scalar_type, local_ordinal_type,
1094  global_ordinal_type,node_type>& Y) const
1095 {
1096  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1098  typedef Tpetra::MultiVector<scalar_type, local_ordinal_type,
1100  typedef typename block_crs_matrix_type::little_block_type little_block_type;
1101  typedef typename block_crs_matrix_type::little_vec_type little_vec_type;
1102 
1103  if (ZeroStartingSolution_) {
1104  Y.putScalar (STS::zero ());
1105  }
1106 
1107  const size_t numVectors = X.getNumVectors ();
1108 
1109  const block_crs_matrix_type* blockMat =
1110  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1111 
1112  const local_ordinal_type blockSize = blockMat->getBlockSize ();
1113  BMV A_times_Y_Block (* (blockMat->getRowMap ()), * (Y.getMap ()),
1114  blockSize, numVectors);
1115  MV A_times_Y = A_times_Y_Block.getMultiVectorView();
1116  BMV yBlock (Y, * (blockMat->getRowMap ()), blockSize);
1117  for (int j = 0; j < NumSweeps_; ++j) {
1118  blockMat->apply (Y, A_times_Y);
1119  A_times_Y.update (STS::one (), X, -STS::one ());
1120  const size_t numRows = blockMat->getNodeNumRows ();
1121  for (size_t i = 0; i < numVectors; ++i) {
1122  for (size_t k = 0; k < numRows; ++k) {
1123  // Each iteration: Y = Y + \omega D^{-1} (X - A*Y)
1124  little_block_type factorizedBlockDiag = BlockDiagonal_->getLocalBlock (k, k);
1125  little_vec_type xloc = A_times_Y_Block.getLocalBlock (k, i);
1126  little_vec_type yloc = yBlock.getLocalBlock (k, i);
1127  factorizedBlockDiag.solve (xloc, &blockDiagonalFactorizationPivots[i*blockSize]);
1128  yloc.update (DampingFactor_, xloc);
1129  }
1130  }
1131  }
1132 }
1133 
1134 template<class MatrixType>
1135 void
1137 ApplyInverseGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1138  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1139 {
1140  typedef Relaxation<MatrixType> this_type;
1141  // The CrsMatrix version is faster, because it can access the sparse
1142  // matrix data directly, rather than by copying out each row's data
1143  // in turn. Thus, we check whether the RowMatrix is really a
1144  // CrsMatrix.
1145  //
1146  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
1147  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
1148  // will still be correct if the cast fails, but it will use an
1149  // unoptimized kernel.
1150  const block_crs_matrix_type* blockCrsMat =
1151  dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr ());
1152  const crs_matrix_type* crsMat =
1153  dynamic_cast<const crs_matrix_type*> (A_.getRawPtr ());
1154  if (blockCrsMat != NULL) {
1155  const_cast<this_type*> (this)->ApplyInverseGS_BlockCrsMatrix (*blockCrsMat, X, Y);
1156  } else if (crsMat != NULL) {
1157  ApplyInverseGS_CrsMatrix (*crsMat, X, Y);
1158  } else {
1159  ApplyInverseGS_RowMatrix (X, Y);
1160  }
1161 }
1162 
1163 
1164 template<class MatrixType>
1165 void
1167 ApplyInverseGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1168  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1169 {
1170  using Teuchos::Array;
1171  using Teuchos::ArrayRCP;
1172  using Teuchos::ArrayView;
1173  using Teuchos::as;
1174  using Teuchos::RCP;
1175  using Teuchos::rcp;
1176  using Teuchos::rcpFromRef;
1177  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> MV;
1178 
1179  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
1180  // starting multivector itself. The generic RowMatrix version here
1181  // does not, so we have to zero out Y here.
1182  if (ZeroStartingSolution_) {
1183  Y.putScalar (STS::zero ());
1184  }
1185 
1186  const size_t NumVectors = X.getNumVectors ();
1187  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
1188  Array<local_ordinal_type> Indices (maxLength);
1189  Array<scalar_type> Values (maxLength);
1190 
1191  // Local smoothing stuff
1192  const size_t numMyRows = A_->getNodeNumRows();
1193  const local_ordinal_type* rowInd = 0;
1194  size_t numActive = numMyRows;
1195  bool do_local = ! localSmoothingIndices_.is_null ();
1196  if (do_local) {
1197  rowInd = localSmoothingIndices_.getRawPtr ();
1198  numActive = localSmoothingIndices_.size ();
1199  }
1200 
1201  RCP<MV> Y2;
1202  if (IsParallel_) {
1203  if (Importer_.is_null ()) { // domain and column Maps are the same.
1204  // We will copy Y into Y2 below, so no need to fill with zeros here.
1205  Y2 = rcp (new MV (Y.getMap (), NumVectors, false));
1206  } else {
1207  // FIXME (mfh 21 Mar 2013) We probably don't need to fill with
1208  // zeros here, since we are doing an Import into Y2 below
1209  // anyway. However, it doesn't hurt correctness.
1210  Y2 = rcp (new MV (Importer_->getTargetMap (), NumVectors));
1211  }
1212  }
1213  else {
1214  Y2 = rcpFromRef (Y);
1215  }
1216 
1217  // Diagonal
1218  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView ();
1219  ArrayView<const scalar_type> d_ptr = d_rcp();
1220 
1221  // Constant stride check
1222  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
1223 
1224  if (constant_stride) {
1225  // extract 1D RCPs
1226  size_t x_stride = X.getStride();
1227  size_t y2_stride = Y2->getStride();
1228  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
1229  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
1230  ArrayView<scalar_type> y2_ptr = y2_rcp();
1231  ArrayView<const scalar_type> x_ptr = x_rcp();
1232  Array<scalar_type> dtemp(NumVectors,STS::zero());
1233 
1234  for (int j = 0; j < NumSweeps_; ++j) {
1235  // data exchange is here, once per sweep
1236  if (IsParallel_) {
1237  if (Importer_.is_null ()) {
1238  *Y2 = Y; // just copy, since domain and column Maps are the same
1239  } else {
1240  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1241  }
1242  }
1243 
1244  if (! DoBackwardGS_) { // Forward sweep
1245  for (size_t ii = 0; ii < numActive; ++ii) {
1246  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1247  size_t NumEntries;
1248  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1249  dtemp.assign(NumVectors,STS::zero());
1250 
1251  for (size_t k = 0; k < NumEntries; ++k) {
1252  const local_ordinal_type col = Indices[k];
1253  for (size_t m = 0; m < NumVectors; ++m) {
1254  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1255  }
1256  }
1257 
1258  for (size_t m = 0; m < NumVectors; ++m) {
1259  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1260  }
1261  }
1262  }
1263  else { // Backward sweep
1264  // ptrdiff_t is the same size as size_t, but is signed. Being
1265  // signed is important so that i >= 0 is not trivially true.
1266  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1267  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1268  size_t NumEntries;
1269  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1270  dtemp.assign (NumVectors, STS::zero ());
1271 
1272  for (size_t k = 0; k < NumEntries; ++k) {
1273  const local_ordinal_type col = Indices[k];
1274  for (size_t m = 0; m < NumVectors; ++m) {
1275  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1276  }
1277  }
1278 
1279  for (size_t m = 0; m < NumVectors; ++m) {
1280  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1281  }
1282  }
1283  }
1284  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1285  if (IsParallel_) {
1286  Tpetra::deep_copy (Y, *Y2);
1287  }
1288  }
1289  }
1290  else {
1291  // extract 2D RCPS
1292  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
1293  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
1294 
1295  for (int j = 0; j < NumSweeps_; ++j) {
1296  // data exchange is here, once per sweep
1297  if (IsParallel_) {
1298  if (Importer_.is_null ()) {
1299  *Y2 = Y; // just copy, since domain and column Maps are the same
1300  } else {
1301  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1302  }
1303  }
1304 
1305  if (! DoBackwardGS_) { // Forward sweep
1306  for (size_t ii = 0; ii < numActive; ++ii) {
1307  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1308  size_t NumEntries;
1309  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1310 
1311  for (size_t m = 0; m < NumVectors; ++m) {
1312  scalar_type dtemp = STS::zero ();
1313  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1314  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1315 
1316  for (size_t k = 0; k < NumEntries; ++k) {
1317  const local_ordinal_type col = Indices[k];
1318  dtemp += Values[k] * y2_local[col];
1319  }
1320  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1321  }
1322  }
1323  }
1324  else { // Backward sweep
1325  // ptrdiff_t is the same size as size_t, but is signed. Being
1326  // signed is important so that i >= 0 is not trivially true.
1327  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1328  local_ordinal_type i = as<local_ordinal_type> (do_local ? rowInd[ii] : ii);
1329 
1330  size_t NumEntries;
1331  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1332 
1333  for (size_t m = 0; m < NumVectors; ++m) {
1334  scalar_type dtemp = STS::zero ();
1335  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1336  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1337 
1338  for (size_t k = 0; k < NumEntries; ++k) {
1339  const local_ordinal_type col = Indices[k];
1340  dtemp += Values[k] * y2_local[col];
1341  }
1342  y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp);
1343  }
1344  }
1345  }
1346 
1347  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1348  if (IsParallel_) {
1349  Tpetra::deep_copy (Y, *Y2);
1350  }
1351  }
1352  }
1353 
1354 
1355  // See flop count discussion in implementation of ApplyInverseGS_CrsMatrix().
1356  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1357  const double numVectors = as<double> (X.getNumVectors ());
1358  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1359  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1360  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1361  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1362 }
1363 
1364 
1365 template<class MatrixType>
1366 void
1368 ApplyInverseGS_CrsMatrix (const crs_matrix_type& A,
1369  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1370  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1371 {
1372  using Teuchos::as;
1373  const Tpetra::ESweepDirection direction =
1374  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1375  if (localSmoothingIndices_.is_null ()) {
1376  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
1377  NumSweeps_, ZeroStartingSolution_);
1378  }
1379  else {
1380  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
1381  DampingFactor_, direction,
1382  NumSweeps_, ZeroStartingSolution_);
1383  }
1384 
1385  // For each column of output, for each sweep over the matrix:
1386  //
1387  // - One + and one * for each matrix entry
1388  // - One / and one + for each row of the matrix
1389  // - If the damping factor is not one: one * for each row of the
1390  // matrix. (It's not fair to count this if the damping factor is
1391  // one, since the implementation could skip it. Whether it does
1392  // or not is the implementation's choice.)
1393 
1394  // Floating-point operations due to the damping factor, per matrix
1395  // row, per direction, per columm of output.
1396  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1397  const double numVectors = as<double> (X.getNumVectors ());
1398  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1399  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1400  ApplyFlops_ += NumSweeps_ * numVectors *
1401  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1402 }
1403 
1404 template<class MatrixType>
1405 void
1407 ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A,
1408  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1409  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
1410 {
1411  using Teuchos::RCP;
1412  using Teuchos::rcp;
1413  using Teuchos::rcpFromRef;
1414  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1416  typedef Tpetra::MultiVector<scalar_type,
1418 
1419  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
1420  //
1421  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
1422  // multiple right-hand sides, unless the input or output MultiVector
1423  // does not have constant stride. We should check for that case
1424  // here, in case it doesn't work in localGaussSeidel (which is
1425  // entirely possible).
1426  BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ());
1427  const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ());
1428 
1429  bool performImport = false;
1430  RCP<BMV> yBlockCol;
1431  if (Importer_.is_null ()) {
1432  yBlockCol = rcpFromRef (yBlock);
1433  }
1434  else {
1435  if (yBlockColumnPointMap_.is_null () ||
1436  yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () ||
1437  yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) {
1438  yBlockColumnPointMap_ =
1439  rcp (new BMV (* (A.getColMap ()), A.getBlockSize (),
1440  static_cast<local_ordinal_type> (yBlock.getNumVectors ())));
1441  }
1442  yBlockCol = yBlockColumnPointMap_;
1443  performImport = true;
1444  }
1445 
1446  if (ZeroStartingSolution_) {
1447  yBlockCol->putScalar (STS::zero ());
1448  }
1449  else if (performImport) {
1450  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1451  }
1452 
1453  const Tpetra::ESweepDirection direction =
1454  DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward;
1455 
1456  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
1457  if (performImport && sweep > 0) {
1458  yBlockCol->doImport(yBlock, *Importer_, Tpetra::INSERT);
1459  }
1460  A.localGaussSeidel (xBlock, *yBlockCol, *BlockDiagonal_,
1461  &blockDiagonalFactorizationPivots[0],
1462  DampingFactor_, direction);
1463  if (performImport) {
1464  RCP<const MV> yBlockColPointDomain =
1465  yBlockCol->getMultiVectorView ().offsetView (A.getDomainMap (), 0);
1466  Tpetra::deep_copy (Y, *yBlockColPointDomain);
1467  }
1468  }
1469 }
1470 
1471 
1472 template<class MatrixType>
1473 void
1475 ApplyInverseSGS (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1476  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1477 {
1478  typedef Relaxation<MatrixType> this_type;
1479  // The CrsMatrix version is faster, because it can access the sparse
1480  // matrix data directly, rather than by copying out each row's data
1481  // in turn. Thus, we check whether the RowMatrix is really a
1482  // CrsMatrix.
1483  //
1484  // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef
1485  // declaration in Ifpack2_Relaxation_decl.hpp header file. The code
1486  // will still be correct if the cast fails, but it will use an
1487  // unoptimized kernel.
1488  const block_crs_matrix_type* blockCrsMat = dynamic_cast<const block_crs_matrix_type*> (A_.getRawPtr());
1489  const crs_matrix_type* crsMat = dynamic_cast<const crs_matrix_type*> (&(*A_));
1490  if (blockCrsMat != NULL) {
1491  const_cast<this_type*> (this)->ApplyInverseSGS_BlockCrsMatrix(*blockCrsMat, X, Y);
1492  }
1493  else if (crsMat != NULL) {
1494  ApplyInverseSGS_CrsMatrix (*crsMat, X, Y);
1495  } else {
1496  ApplyInverseSGS_RowMatrix (X, Y);
1497  }
1498 }
1499 
1500 
1501 template<class MatrixType>
1502 void
1504 ApplyInverseSGS_RowMatrix (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1505  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1506 {
1507  using Teuchos::Array;
1508  using Teuchos::ArrayRCP;
1509  using Teuchos::ArrayView;
1510  using Teuchos::as;
1511  using Teuchos::RCP;
1512  using Teuchos::rcp;
1513  using Teuchos::rcpFromRef;
1514  typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> MV;
1515 
1516  // Tpetra's GS implementation for CrsMatrix handles zeroing out the
1517  // starting multivector itself. The generic RowMatrix version here
1518  // does not, so we have to zero out Y here.
1519  if (ZeroStartingSolution_) {
1520  Y.putScalar (STS::zero ());
1521  }
1522 
1523  const size_t NumVectors = X.getNumVectors ();
1524  const size_t maxLength = A_->getNodeMaxNumRowEntries ();
1525  Array<local_ordinal_type> Indices (maxLength);
1526  Array<scalar_type> Values (maxLength);
1527 
1528  // Local smoothing stuff
1529  const size_t numMyRows = A_->getNodeNumRows();
1530  const local_ordinal_type * rowInd = 0;
1531  size_t numActive = numMyRows;
1532  bool do_local = localSmoothingIndices_.is_null();
1533  if(do_local) {
1534  rowInd = localSmoothingIndices_.getRawPtr();
1535  numActive = localSmoothingIndices_.size();
1536  }
1537 
1538 
1539  RCP<MV> Y2;
1540  if (IsParallel_) {
1541  if (Importer_.is_null ()) { // domain and column Maps are the same.
1542  // We will copy Y into Y2 below, so no need to fill with zeros here.
1543  Y2 = rcp (new MV (Y.getMap (), NumVectors, false));
1544  } else {
1545  // FIXME (mfh 21 Mar 2013) We probably don't need to fill with
1546  // zeros here, since we are doing an Import into Y2 below
1547  // anyway. However, it doesn't hurt correctness.
1548  Y2 = rcp (new MV (Importer_->getTargetMap (), NumVectors));
1549  }
1550  }
1551  else {
1552  Y2 = rcpFromRef (Y);
1553  }
1554 
1555  // Diagonal
1556  ArrayRCP<const scalar_type> d_rcp = Diagonal_->get1dView();
1557  ArrayView<const scalar_type> d_ptr = d_rcp();
1558 
1559  // Constant stride check
1560  bool constant_stride = X.isConstantStride() && Y2->isConstantStride();
1561 
1562  if(constant_stride) {
1563  // extract 1D RCPs
1564  size_t x_stride = X.getStride();
1565  size_t y2_stride = Y2->getStride();
1566  ArrayRCP<scalar_type> y2_rcp = Y2->get1dViewNonConst();
1567  ArrayRCP<const scalar_type> x_rcp = X.get1dView();
1568  ArrayView<scalar_type> y2_ptr = y2_rcp();
1569  ArrayView<const scalar_type> x_ptr = x_rcp();
1570  Array<scalar_type> dtemp(NumVectors,STS::zero());
1571  for (int iter = 0; iter < NumSweeps_; ++iter) {
1572  // only one data exchange per sweep
1573  if (IsParallel_) {
1574  if (Importer_.is_null ()) {
1575  // just copy, since domain and column Maps are the same
1576  Tpetra::deep_copy (*Y2, Y);
1577  } else {
1578  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1579  }
1580  }
1581 
1582  for (int j = 0; j < NumSweeps_; j++) {
1583  // data exchange is here, once per sweep
1584  if (IsParallel_) {
1585  if (Importer_.is_null ()) {
1586  // just copy, since domain and column Maps are the same
1587  Tpetra::deep_copy (*Y2, Y);
1588  } else {
1589  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1590  }
1591  }
1592 
1593  for (size_t ii = 0; ii < numActive; ++ii) {
1594  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1595  size_t NumEntries;
1596  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1597  dtemp.assign(NumVectors,STS::zero());
1598 
1599  for (size_t k = 0; k < NumEntries; ++k) {
1600  const local_ordinal_type col = Indices[k];
1601  for (size_t m = 0; m < NumVectors; ++m) {
1602  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1603  }
1604  }
1605 
1606  for (size_t m = 0; m < NumVectors; ++m) {
1607  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1608  }
1609  }
1610 
1611  // ptrdiff_t is the same size as size_t, but is signed. Being
1612  // signed is important so that i >= 0 is not trivially true.
1613  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1614  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1615  size_t NumEntries;
1616  A_->getLocalRowCopy (i, Indices (), Values (), NumEntries);
1617  dtemp.assign(NumVectors,STS::zero());
1618 
1619  for (size_t k = 0; k < NumEntries; ++k) {
1620  const local_ordinal_type col = Indices[k];
1621  for (size_t m = 0; m < NumVectors; ++m) {
1622  dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m];
1623  }
1624  }
1625 
1626  for (size_t m = 0; m < NumVectors; ++m) {
1627  y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]);
1628  }
1629  }
1630  }
1631 
1632  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1633  if (IsParallel_) {
1634  Tpetra::deep_copy (Y, *Y2);
1635  }
1636  }
1637  }
1638  else {
1639  // extract 2D RCPs
1640  ArrayRCP<ArrayRCP<scalar_type> > y2_ptr = Y2->get2dViewNonConst ();
1641  ArrayRCP<ArrayRCP<const scalar_type> > x_ptr = X.get2dView ();
1642 
1643  for (int iter = 0; iter < NumSweeps_; ++iter) {
1644  // only one data exchange per sweep
1645  if (IsParallel_) {
1646  if (Importer_.is_null ()) {
1647  // just copy, since domain and column Maps are the same
1648  Tpetra::deep_copy (*Y2, Y);
1649  } else {
1650  Y2->doImport (Y, *Importer_, Tpetra::INSERT);
1651  }
1652  }
1653 
1654  for (size_t ii = 0; ii < numActive; ++ii) {
1655  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1656  const scalar_type diag = d_ptr[i];
1657  size_t NumEntries;
1658  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
1659 
1660  for (size_t m = 0; m < NumVectors; ++m) {
1661  scalar_type dtemp = STS::zero ();
1662  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1663  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1664 
1665  for (size_t k = 0; k < NumEntries; ++k) {
1666  const local_ordinal_type col = Indices[k];
1667  dtemp += Values[k] * y2_local[col];
1668  }
1669  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
1670  }
1671  }
1672 
1673  // ptrdiff_t is the same size as size_t, but is signed. Being
1674  // signed is important so that i >= 0 is not trivially true.
1675  for (ptrdiff_t ii = as<ptrdiff_t> (numActive) - 1; ii >= 0; --ii) {
1676  local_ordinal_type i = as<local_ordinal_type>(do_local ? rowInd[ii] : ii);
1677  const scalar_type diag = d_ptr[i];
1678  size_t NumEntries;
1679  A_->getLocalRowCopy (as<local_ordinal_type> (i), Indices (), Values (), NumEntries);
1680 
1681  for (size_t m = 0; m < NumVectors; ++m) {
1682  scalar_type dtemp = STS::zero ();
1683  ArrayView<const scalar_type> x_local = (x_ptr())[m]();
1684  ArrayView<scalar_type> y2_local = (y2_ptr())[m]();
1685 
1686  for (size_t k = 0; k < NumEntries; ++k) {
1687  const local_ordinal_type col = Indices[k];
1688  dtemp += Values[k] * y2_local[col];
1689  }
1690  y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag;
1691  }
1692  }
1693 
1694  // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map.
1695  if (IsParallel_) {
1696  Tpetra::deep_copy (Y, *Y2);
1697  }
1698  }
1699  }
1700 
1701  // See flop count discussion in implementation of ApplyInverseSGS_CrsMatrix().
1702  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1703  const double numVectors = as<double> (X.getNumVectors ());
1704  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1705  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1706  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1707  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1708 }
1709 
1710 
1711 template<class MatrixType>
1712 void
1714 ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A,
1715  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1716  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y) const
1717 {
1718  using Teuchos::as;
1719  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
1720  if (localSmoothingIndices_.is_null ()) {
1721  A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction,
1722  NumSweeps_, ZeroStartingSolution_);
1723  }
1724  else {
1725  A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (),
1726  DampingFactor_, direction,
1727  NumSweeps_, ZeroStartingSolution_);
1728  }
1729 
1730  // For each column of output, for each sweep over the matrix:
1731  //
1732  // - One + and one * for each matrix entry
1733  // - One / and one + for each row of the matrix
1734  // - If the damping factor is not one: one * for each row of the
1735  // matrix. (It's not fair to count this if the damping factor is
1736  // one, since the implementation could skip it. Whether it does
1737  // or not is the implementation's choice.)
1738  //
1739  // Each sweep of symmetric Gauss-Seidel / SOR counts as two sweeps,
1740  // one forward and one backward.
1741 
1742  // Floating-point operations due to the damping factor, per matrix
1743  // row, per direction, per columm of output.
1744  const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0;
1745  const double numVectors = as<double> (X.getNumVectors ());
1746  const double numGlobalRows = as<double> (A_->getGlobalNumRows ());
1747  const double numGlobalNonzeros = as<double> (A_->getGlobalNumEntries ());
1748  ApplyFlops_ += 2.0 * NumSweeps_ * numVectors *
1749  (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops);
1750 }
1751 
1752 template<class MatrixType>
1753 void
1755 ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A,
1756  const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1757  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y)
1758 {
1759  using Teuchos::RCP;
1760  using Teuchos::rcp;
1761  using Teuchos::rcpFromRef;
1762  typedef Tpetra::Experimental::BlockMultiVector<scalar_type,
1764  typedef Tpetra::MultiVector<scalar_type,
1766 
1767  //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides
1768  //
1769  // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for
1770  // multiple right-hand sides, unless the input or output MultiVector
1771  // does not have constant stride. We should check for that case
1772  // here, in case it doesn't work in localGaussSeidel (which is
1773  // entirely possible).
1774  BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ());
1775  const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ());
1776 
1777  bool performImport = false;
1778  RCP<BMV> yBlockCol;
1779  if (Importer_.is_null ()) {
1780  yBlockCol = Teuchos::rcpFromRef (yBlock);
1781  }
1782  else {
1783  if (yBlockColumnPointMap_.is_null () ||
1784  yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () ||
1785  yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) {
1786  yBlockColumnPointMap_ =
1787  rcp (new BMV (* (A.getColMap ()), A.getBlockSize (),
1788  static_cast<local_ordinal_type> (yBlock.getNumVectors ())));
1789  }
1790  yBlockCol = yBlockColumnPointMap_;
1791  performImport = true;
1792  }
1793 
1794  if (ZeroStartingSolution_) {
1795  yBlockCol->putScalar (STS::zero ());
1796  }
1797  else if (performImport) {
1798  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1799  }
1800 
1801  // FIXME (mfh 12 Sep 2014) Shouldn't this come from the user's parameter?
1802  const Tpetra::ESweepDirection direction = Tpetra::Symmetric;
1803 
1804  for (int sweep = 0; sweep < NumSweeps_; ++sweep) {
1805  if (performImport && sweep > 0) {
1806  yBlockCol->doImport (yBlock, *Importer_, Tpetra::INSERT);
1807  }
1808  A.localGaussSeidel (xBlock, *yBlockCol, *BlockDiagonal_,
1809  &blockDiagonalFactorizationPivots[0],
1810  DampingFactor_, direction);
1811  if (performImport) {
1812  RCP<const MV> yBlockColPointDomain =
1813  yBlockCol->getMultiVectorView ().offsetView (A.getDomainMap (), 0);
1814  MV yBlockView = yBlock.getMultiVectorView ();
1815  Tpetra::deep_copy (yBlockView, *yBlockColPointDomain);
1816  }
1817  }
1818 }
1819 
1820 
1821 template<class MatrixType>
1823 {
1824  std::ostringstream os;
1825 
1826  // Output is a valid YAML dictionary in flow style. If you don't
1827  // like everything on a single line, you should call describe()
1828  // instead.
1829  os << "\"Ifpack2::Relaxation\": {";
1830 
1831  os << "Initialized: " << (isInitialized () ? "true" : "false") << ", "
1832  << "Computed: " << (isComputed () ? "true" : "false") << ", ";
1833 
1834  // It's useful to print this instance's relaxation method (Jacobi,
1835  // Gauss-Seidel, or symmetric Gauss-Seidel). If you want more info
1836  // than that, call describe() instead.
1837  os << "Type: ";
1838  if (PrecType_ == Ifpack2::Details::JACOBI) {
1839  os << "Jacobi";
1840  } else if (PrecType_ == Ifpack2::Details::GS) {
1841  os << "Gauss-Seidel";
1842  } else if (PrecType_ == Ifpack2::Details::SGS) {
1843  os << "Symmetric Gauss-Seidel";
1844  } else {
1845  os << "INVALID";
1846  }
1847 
1848  os << ", " << "sweeps: " << NumSweeps_ << ", "
1849  << "damping factor: " << DampingFactor_ << ", ";
1850  if (DoL1Method_) {
1851  os << "use l1: " << DoL1Method_ << ", "
1852  << "l1 eta: " << L1Eta_ << ", ";
1853  }
1854 
1855  if (A_.is_null ()) {
1856  os << "Matrix: null";
1857  }
1858  else {
1859  os << "Global matrix dimensions: ["
1860  << A_->getGlobalNumRows () << ", " << A_->getGlobalNumCols () << "]"
1861  << ", Global nnz: " << A_->getGlobalNumEntries();
1862  }
1863 
1864  os << "}";
1865  return os.str ();
1866 }
1867 
1868 
1869 template<class MatrixType>
1870 void
1872 describe (Teuchos::FancyOStream &out,
1873  const Teuchos::EVerbosityLevel verbLevel) const
1874 {
1875  using Teuchos::OSTab;
1876  using Teuchos::TypeNameTraits;
1877  using Teuchos::VERB_DEFAULT;
1878  using Teuchos::VERB_NONE;
1879  using Teuchos::VERB_LOW;
1880  using Teuchos::VERB_MEDIUM;
1881  using Teuchos::VERB_HIGH;
1882  using Teuchos::VERB_EXTREME;
1883  using std::endl;
1884 
1885  const Teuchos::EVerbosityLevel vl =
1886  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
1887 
1888  const int myRank = this->getComm ()->getRank ();
1889 
1890  // none: print nothing
1891  // low: print O(1) info from Proc 0
1892  // medium:
1893  // high:
1894  // extreme:
1895 
1896  if (vl != VERB_NONE && myRank == 0) {
1897  // Describable interface asks each implementation to start with a tab.
1898  OSTab tab1 (out);
1899  // Output is valid YAML; hence the quotes, to protect the colons.
1900  out << "\"Ifpack2::Relaxation\":" << endl;
1901  OSTab tab2 (out);
1902  out << "MatrixType: \"" << TypeNameTraits<MatrixType>::name () << "\""
1903  << endl;
1904  if (this->getObjectLabel () != "") {
1905  out << "Label: " << this->getObjectLabel () << endl;
1906  }
1907  out << "Initialized: " << (isInitialized () ? "true" : "false") << endl
1908  << "Computed: " << (isComputed () ? "true" : "false") << endl
1909  << "Parameters: " << endl;
1910  {
1911  OSTab tab3 (out);
1912  out << "\"relaxation: type\": ";
1913  if (PrecType_ == Ifpack2::Details::JACOBI) {
1914  out << "Jacobi";
1915  } else if (PrecType_ == Ifpack2::Details::GS) {
1916  out << "Gauss-Seidel";
1917  } else if (PrecType_ == Ifpack2::Details::SGS) {
1918  out << "Symmetric Gauss-Seidel";
1919  } else {
1920  out << "INVALID";
1921  }
1922  // We quote these parameter names because they contain colons.
1923  // YAML uses the colon to distinguish key from value.
1924  out << endl
1925  << "\"relaxation: sweeps\": " << NumSweeps_ << endl
1926  << "\"relaxation: damping factor\": " << DampingFactor_ << endl
1927  << "\"relaxation: min diagonal value\": " << MinDiagonalValue_ << endl
1928  << "\"relaxation: zero starting solution\": " << ZeroStartingSolution_ << endl
1929  << "\"relaxation: backward mode\": " << DoBackwardGS_ << endl
1930  << "\"relaxation: use l1\": " << DoL1Method_ << endl
1931  << "\"relaxation: l1 eta\": " << L1Eta_ << endl;
1932  }
1933  out << "Computed quantities:" << endl;
1934  {
1935  OSTab tab3 (out);
1936  out << "Global number of rows: " << A_->getGlobalNumRows () << endl
1937  << "Global number of columns: " << A_->getGlobalNumCols () << endl;
1938  }
1939  if (checkDiagEntries_ && isComputed ()) {
1940  out << "Properties of input diagonal entries:" << endl;
1941  {
1942  OSTab tab3 (out);
1943  out << "Magnitude of minimum-magnitude diagonal entry: "
1944  << globalMinMagDiagEntryMag_ << endl
1945  << "Magnitude of maximum-magnitude diagonal entry: "
1946  << globalMaxMagDiagEntryMag_ << endl
1947  << "Number of diagonal entries with small magnitude: "
1948  << globalNumSmallDiagEntries_ << endl
1949  << "Number of zero diagonal entries: "
1950  << globalNumZeroDiagEntries_ << endl
1951  << "Number of diagonal entries with negative real part: "
1952  << globalNumNegDiagEntries_ << endl
1953  << "Abs 2-norm diff between computed and actual inverse "
1954  << "diagonal: " << globalDiagNormDiff_ << endl;
1955  }
1956  }
1957  if (isComputed ()) {
1958  out << "Saved diagonal offsets: "
1959  << (savedDiagOffsets_ ? "true" : "false") << endl;
1960  }
1961  out << "Call counts and total times (in seconds): " << endl;
1962  {
1963  OSTab tab3 (out);
1964  out << "initialize: " << endl;
1965  {
1966  OSTab tab4 (out);
1967  out << "Call count: " << NumInitialize_ << endl;
1968  out << "Total time: " << InitializeTime_ << endl;
1969  }
1970  out << "compute: " << endl;
1971  {
1972  OSTab tab4 (out);
1973  out << "Call count: " << NumCompute_ << endl;
1974  out << "Total time: " << ComputeTime_ << endl;
1975  }
1976  out << "apply: " << endl;
1977  {
1978  OSTab tab4 (out);
1979  out << "Call count: " << NumApply_ << endl;
1980  out << "Total time: " << ApplyTime_ << endl;
1981  }
1982  }
1983  }
1984 }
1985 
1986 } // namespace Ifpack2
1987 
1988 #define IFPACK2_RELAXATION_INSTANT(S,LO,GO,N) \
1989  template class Ifpack2::Relaxation< Tpetra::RowMatrix<S, LO, GO, N> >;
1990 
1991 #endif // IFPACK2_RELAXATION_DEF_HPP
bool hasTransposeApply() const
Whether apply() and applyMat() let you apply the transpose or conjugate transpose.
Definition: Ifpack2_Relaxation_def.hpp:376
double getComputeFlops() const
Total number of floating-point operations over all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:418
double getApplyFlops() const
Total number of floating-point operations over all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:424
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object's attributes to the given output stream.
Definition: Ifpack2_Relaxation_def.hpp:1872
void compute()
Compute the preconditioner.
Definition: Ifpack2_Relaxation_def.hpp:665
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix and vectors are distributed.
Definition: Ifpack2_Relaxation_def.hpp:333
int getNumApply() const
Total number of calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:394
Ifpack2 implementation details.
double getComputeTime() const
Total time in seconds spent in all calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:406
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:246
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_Relaxation_def.hpp:353
void setParameters(const Teuchos::ParameterList &params)
Set the relaxation / preconditioner parameters.
Definition: Ifpack2_Relaxation_def.hpp:323
File for utility functions.
std::string description() const
A simple one-line description of this object.
Definition: Ifpack2_Relaxation_def.hpp:1822
int getNumInitialize() const
Total number of calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:382
bool isComputed() const
Return true if compute() has been called.
Definition: Ifpack2_Relaxation_decl.hpp:412
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_Relaxation_def.hpp:366
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:243
double getInitializeTime() const
Total time in seconds spent in all calls to initialize().
Definition: Ifpack2_Relaxation_def.hpp:400
void applyMat(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS) const
Apply the preconditioner to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:523
Definition: Ifpack2_Details_Amesos2Wrapper_decl.hpp:63
Teuchos::RCP< const row_matrix_type > getMatrix() const
The matrix to be preconditioned.
Definition: Ifpack2_Relaxation_def.hpp:344
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:240
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_Relaxation_decl.hpp:237
virtual ~Relaxation()
Destructor.
Definition: Ifpack2_Relaxation_def.hpp:208
int getNumCompute() const
Total number of calls to compute().
Definition: Ifpack2_Relaxation_def.hpp:388
bool isInitialized() const
Returns true if the preconditioner has been successfully initialized.
Definition: Ifpack2_Relaxation_decl.hpp:397
Relaxation preconditioners for Tpetra::RowMatrix and Tpetra::CrsMatrix sparse matrices.
Definition: Ifpack2_Relaxation_decl.hpp:222
Preconditioners and smoothers for Tpetra sparse matrices.
Definition: Ifpack2_AdditiveSchwarz_decl.hpp:72
double getApplyTime() const
Total time in seconds spent in all calls to apply().
Definition: Ifpack2_Relaxation_def.hpp:412
void initialize()
Initialize the preconditioner.
Definition: Ifpack2_Relaxation_def.hpp:544
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the preconditioner to X, returning the result in Y.
Definition: Ifpack2_Relaxation_def.hpp:432
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
Return a list of all the parameters that this class accepts.
Definition: Ifpack2_Relaxation_def.hpp:213
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_Relaxation_decl.hpp:249