Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 
51 namespace Tpetra {
52 namespace Experimental {
53 
54  template<class Scalar, class LO, class GO, class Node>
55  std::ostream&
56  BlockCrsMatrix<Scalar, LO, GO, Node>::
57  markLocalErrorAndGetStream ()
58  {
59  * (this->localError_) = true;
60  if ((*errs_).is_null ()) {
61  *errs_ = Teuchos::rcp (new std::ostringstream ());
62  }
63  return **errs_;
64  }
65 
66  template<class Scalar, class LO, class GO, class Node>
69  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
70  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
71  blockSize_ (static_cast<LO> (0)),
72  ind_ (NULL),
73  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
74  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
75  columnPadding_ (0), // no padding by default
76  rowMajor_ (true), // row major blocks by default
77  localError_ (new bool (false)),
78  errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
79  computedDiagonalGraph_(false)
80  {
81  }
82 
83  template<class Scalar, class LO, class GO, class Node>
86  const LO blockSize) :
87  dist_object_type (graph.getMap ()),
88  graph_ (graph),
89  rowMeshMap_ (* (graph.getRowMap ())),
90  blockSize_ (blockSize),
91  ind_ (NULL), // to be initialized below
92  val_ (NULL), // to be initialized below
93  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
94  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
95  columnPadding_ (0), // no padding by default
96  rowMajor_ (true), // row major blocks by default
97  localError_ (new bool (false)),
98  errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
99  computedDiagonalGraph_(false)
100  {
101  TEUCHOS_TEST_FOR_EXCEPTION(
102  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
103  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
104  "rows (isSorted() is false). This class assumes sorted rows.");
105 
106  graphRCP_ = Teuchos::rcpFromRef(graph_);
107 
108  // Trick to test whether LO is nonpositive, without a compiler
109  // warning in case LO is unsigned (which is generally a bad idea
110  // anyway). I don't promise that the trick works, but it
111  // generally does with gcc at least, in my experience.
112  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
113  TEUCHOS_TEST_FOR_EXCEPTION(
114  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
115  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
116  " <= 0. The block size must be positive.");
117 
118  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
119  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
120 
121  {
122  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
123  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
124 
125  row_map_type ptr_d = graph.getLocalGraph ().row_map;
126  // FIXME (mfh 23 Mar 2015) Once we write a Kokkos kernel for the
127  // mat-vec, we won't need a host version of this.
128  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
129  Kokkos::deep_copy (ptr_h_nc, ptr_d);
130  ptr_ = ptr_h_nc;
131  }
132  ind_ = graph.getNodePackedIndices ().getRawPtr ();
133  valView_.resize (graph.getNodeNumEntries () * offsetPerBlock ());
134  val_ = valView_.getRawPtr ();
135  }
136 
137  template<class Scalar, class LO, class GO, class Node>
140  const map_type& domainPointMap,
141  const map_type& rangePointMap,
142  const LO blockSize) :
143  dist_object_type (graph.getMap ()),
144  graph_ (graph),
145  rowMeshMap_ (* (graph.getRowMap ())),
146  domainPointMap_ (domainPointMap),
147  rangePointMap_ (rangePointMap),
148  blockSize_ (blockSize),
149  ind_ (NULL), // to be initialized below
150  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
151  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
152  columnPadding_ (0), // no padding by default
153  rowMajor_ (true), // row major blocks by default
154  localError_ (new bool (false)),
155  errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
156  computedDiagonalGraph_(false)
157  {
158  TEUCHOS_TEST_FOR_EXCEPTION(
159  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
160  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
161  "rows (isSorted() is false). This class assumes sorted rows.");
162 
163  graphRCP_ = Teuchos::rcpFromRef(graph_);
164 
165  // Trick to test whether LO is nonpositive, without a compiler
166  // warning in case LO is unsigned (which is generally a bad idea
167  // anyway). I don't promise that the trick works, but it
168  // generally does with gcc at least, in my experience.
169  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
170  TEUCHOS_TEST_FOR_EXCEPTION(
171  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
172  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
173  " <= 0. The block size must be positive.");
174 
175  {
176  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
177  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
178 
179  row_map_type ptr_d = graph.getLocalGraph ().row_map;
180  // FIXME (mfh 23 Mar 2015) Once we write a Kokkos kernel for the
181  // mat-vec, we won't need a host version of this.
182  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
183  Kokkos::deep_copy (ptr_h_nc, ptr_d);
184  ptr_ = ptr_h_nc;
185  }
186  ind_ = graph.getNodePackedIndices ().getRawPtr ();
187  valView_.resize (graph.getNodeNumEntries () * offsetPerBlock ());
188  val_ = valView_.getRawPtr ();
189  }
190 
191  template<class Scalar, class LO, class GO, class Node>
192  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
195  { // Copy constructor of map_type does a shallow copy.
196  // We're only returning by RCP for backwards compatibility.
197  return Teuchos::rcp (new map_type (domainPointMap_));
198  }
199 
200  template<class Scalar, class LO, class GO, class Node>
201  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
203  getRangeMap () const
204  { // Copy constructor of map_type does a shallow copy.
205  // We're only returning by RCP for backwards compatibility.
206  return Teuchos::rcp (new map_type (rangePointMap_));
207  }
208 
209  template<class Scalar, class LO, class GO, class Node>
210  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
212  getRowMap () const
213  {
214  return graph_.getRowMap();
215  }
216 
217  template<class Scalar, class LO, class GO, class Node>
218  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
220  getColMap () const
221  {
222  return graph_.getColMap();
223  }
224 
225  template<class Scalar, class LO, class GO, class Node>
229  {
230  return graph_.getGlobalNumRows();
231  }
232 
233  template<class Scalar, class LO, class GO, class Node>
234  size_t
237  {
238  return graph_.getNodeNumRows();
239  }
240 
241  template<class Scalar, class LO, class GO, class Node>
242  size_t
245  {
246  return graph_.getNodeMaxNumRowEntries();
247  }
248 
249  template<class Scalar, class LO, class GO, class Node>
250  void
252  apply (const mv_type& X,
253  mv_type& Y,
254  Teuchos::ETransp mode,
255  Scalar alpha,
256  Scalar beta) const
257  {
259  TEUCHOS_TEST_FOR_EXCEPTION(
260  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
261  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
262  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
263  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
264 
265  BMV X_view;
266  BMV Y_view;
267  const LO blockSize = getBlockSize ();
268  try {
269  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
270  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
271  }
272  catch (std::invalid_argument& e) {
273  TEUCHOS_TEST_FOR_EXCEPTION(
274  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
275  "apply: Either the input MultiVector X or the output MultiVector Y "
276  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
277  "graph. BlockMultiVector's constructor threw the following exception: "
278  << e.what ());
279  }
280 
281  try {
282  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
283  // either that or mark fields of this class as 'mutable'. The
284  // problem is that applyBlock wants to do lazy initialization of
285  // temporary block multivectors.
286  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
287  } catch (std::invalid_argument& e) {
288  TEUCHOS_TEST_FOR_EXCEPTION(
289  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
290  "apply: The implementation method applyBlock complained about having "
291  "an invalid argument. It reported the following: " << e.what ());
292  } catch (std::logic_error& e) {
293  TEUCHOS_TEST_FOR_EXCEPTION(
294  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
295  "apply: The implementation method applyBlock complained about a "
296  "possible bug in its implementation. It reported the following: "
297  << e.what ());
298  } catch (std::exception& e) {
299  TEUCHOS_TEST_FOR_EXCEPTION(
300  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
301  "apply: The implementation method applyBlock threw an exception which "
302  "is neither std::invalid_argument nor std::logic_error, but is a "
303  "subclass of std::exception. It reported the following: "
304  << e.what ());
305  } catch (...) {
306  TEUCHOS_TEST_FOR_EXCEPTION(
307  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
308  "apply: The implementation method applyBlock threw an exception which "
309  "is not an instance of a subclass of std::exception. This probably "
310  "indicates a bug. Please report this to the Tpetra developers.");
311  }
312  }
313 
314  template<class Scalar, class LO, class GO, class Node>
315  void
319  Teuchos::ETransp mode,
320  const Scalar alpha,
321  const Scalar beta)
322  {
323  TEUCHOS_TEST_FOR_EXCEPTION(
324  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
325  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
326  "X and Y have different block sizes. X.getBlockSize() = "
327  << X.getBlockSize () << " != Y.getBlockSize() = "
328  << Y.getBlockSize () << ".");
329 
330  if (mode == Teuchos::NO_TRANS) {
331  applyBlockNoTrans (X, Y, alpha, beta);
332  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
333  applyBlockTrans (X, Y, mode, alpha, beta);
334  } else {
335  TEUCHOS_TEST_FOR_EXCEPTION(
336  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
337  "applyBlock: Invalid 'mode' argument. Valid values are "
338  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
339  }
340  }
341 
342  template<class Scalar, class LO, class GO, class Node>
343  void
345  setAllToScalar (const Scalar& alpha)
346  {
347  const LO numLocalMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
348  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
349  const size_t meshBeg = ptr_[lclRow];
350  const size_t meshEnd = ptr_[lclRow+1];
351  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
352  little_block_type A_cur = getNonConstLocalBlockFromAbsOffset (absBlkOff);
353  A_cur.fill (static_cast<impl_scalar_type> (alpha));
354  }
355  }
356  }
357 
358  template<class Scalar, class LO, class GO, class Node>
359  LO
361  replaceLocalValues (const LO localRowInd,
362  const LO colInds[],
363  const Scalar vals[],
364  const LO numColInds) const
365  {
366  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
367  // We modified no values, because the input local row index is
368  // invalid on the calling process. That may not be an error, if
369  // numColInds is zero anyway; it doesn't matter. This is the
370  // advantage of returning the number of valid indices.
371  return static_cast<LO> (0);
372  }
373  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
374 
375  const size_t absRowBlockOffset = ptr_[localRowInd];
376  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
377  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
378  size_t hint = 0; // Guess for the relative offset into the current row
379  size_t pointOffset = 0; // Current offset into input values
380  LO validCount = 0; // number of valid column indices in colInds
381 
382  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
383  const size_t relBlockOffset =
384  findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
385  if (relBlockOffset != STINV) {
386  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
387  little_block_type A_old =
388  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
390  getConstLocalBlockFromInput (vIn, pointOffset);
391  A_old.assign (A_new);
392  hint = relBlockOffset + 1;
393  ++validCount;
394  }
395  }
396  return validCount;
397  }
398 
399  template <class Scalar, class LO, class GO, class Node>
400  void
402  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
403  {
404 
405  const map_type& rowMap = * (graph_.getRowMap());
406  const map_type& colMap = * (graph_.getColMap ());
407 
408  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
409  if (static_cast<size_t> (offsets.size ()) != myNumRows) {
410  offsets.resize (static_cast<size_t> (myNumRows));
411  }
412 
413 #ifdef HAVE_TPETRA_DEBUG
414  bool allRowMapDiagEntriesInColMap = true;
415  bool allDiagEntriesFound = true;
416 #endif // HAVE_TPETRA_DEBUG
417 
418  for (size_t r = 0; r < myNumRows; ++r) {
419  const GO rgid = rowMap.getGlobalElement (r);
420  const LO rlid = colMap.getLocalElement (rgid);
421 
422 #ifdef HAVE_TPETRA_DEBUG
423  if (rlid == Teuchos::OrdinalTraits<LO>::invalid ()) {
424  allRowMapDiagEntriesInColMap = false;
425  }
426 #endif // HAVE_TPETRA_DEBUG
427 
428  if (rlid != Teuchos::OrdinalTraits<LO>::invalid ()) {
429  RowInfo rowinfo = graph_.getRowInfo (r);
430  if (rowinfo.numEntries > 0) {
431  offsets[r] = graph_.findLocalIndex (rowinfo, rlid);
432  }
433  else {
434  offsets[r] = Teuchos::OrdinalTraits<size_t>::invalid ();
435 #ifdef HAVE_TPETRA_DEBUG
436  allDiagEntriesFound = false;
437 #endif // HAVE_TPETRA_DEBUG
438  }
439  }
440  }
441 
442 #ifdef HAVE_TPETRA_DEBUG
443  using Teuchos::reduceAll;
444  using std::endl;
445  const char tfecfFuncName[] = "getLocalDiagOffsets";
446 
447  const bool localSuccess =
448  allRowMapDiagEntriesInColMap && allDiagEntriesFound;
449  int localResults[3];
450  localResults[0] = allRowMapDiagEntriesInColMap ? 1 : 0;
451  localResults[1] = allDiagEntriesFound ? 1 : 0;
452  // min-all-reduce will compute least rank of all the processes
453  // that didn't succeed.
454  localResults[2] =
455  ! localSuccess ? getComm ()->getRank () : getComm ()->getSize ();
456  int globalResults[3];
457  globalResults[0] = 0;
458  globalResults[1] = 0;
459  globalResults[2] = 0;
460  reduceAll<int, int> (* (getComm ()), Teuchos::REDUCE_MIN,
461  3, localResults, globalResults);
462  if (globalResults[0] == 0 || globalResults[1] == 0) {
463  std::ostringstream os; // build error message
464  const bool both =
465  globalResults[0] == 0 && globalResults[1] == 0;
466  os << ": At least one process (including Process " << globalResults[2]
467  << ") had the following issue" << (both ? "s" : "") << ":" << endl;
468  if (globalResults[0] == 0) {
469  os << " - The column Map does not contain at least one diagonal entry "
470  "of the matrix." << endl;
471  }
472  if (globalResults[1] == 0) {
473  os << " - There is a row on that / those process(es) that does not "
474  "contain a diagonal entry." << endl;
475  }
476  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error, os.str());
477  }
478 #endif // HAVE_TPETRA_DEBUG
479  }
480 
481  template <class Scalar, class LO, class GO, class Node>
482  void
485  {
486  using Teuchos::rcp;
487 
488  if (computedDiagonalGraph_) {
489  // FIXME (mfh 12 Aug 2014) Consider storing the "diagonal graph"
490  // separately from the matrix. It should really go in the
491  // preconditioner, not here. We could do this by adding a
492  // method that accepts a nonconst diagonal graph, and updates
493  // it. btw it would probably be a better idea to use a
494  // BlockMultiVector to store the diagonal, not a graph.
495  return;
496  }
497 
498  const size_t maxDiagEntPerRow = 1;
499  // NOTE (mfh 12 Aug 2014) We could also pass in the column Map
500  // here. However, we still would have to do LID->GID lookups to
501  // make sure that we are using the correct diagonal column
502  // indices, so it probably wouldn't help much.
503  diagonalGraph_ =
504  rcp (new crs_graph_type (graph_.getRowMap (), maxDiagEntPerRow,
506  const map_type& meshRowMap = * (graph_.getRowMap ());
507 
508  Teuchos::Array<GO> diagGblColInds (maxDiagEntPerRow);
509 
510  for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
511  lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
512  const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
513  diagGblColInds[0] = gblRowInd;
514  diagonalGraph_->insertGlobalIndices (gblRowInd, diagGblColInds ());
515  }
516  diagonalGraph_->fillComplete (graph_.getDomainMap (),
517  graph_.getRangeMap ());
518  computedDiagonalGraph_ = true;
519  }
520 
521  template <class Scalar, class LO, class GO, class Node>
522  Teuchos::RCP<CrsGraph<LO, GO, Node> >
524  getDiagonalGraph () const
525  {
526  TEUCHOS_TEST_FOR_EXCEPTION(
527  ! computedDiagonalGraph_, std::runtime_error, "Tpetra::Experimental::"
528  "BlockCrsMatrix::getDiagonalGraph: You must call computeDiagonalGraph() "
529  "before calling this method.");
530  return diagonalGraph_;
531  }
532 
533  template <class Scalar, class LO, class GO, class Node>
534  void
538  BlockCrsMatrix<Scalar, LO, GO, Node> & factorizedDiagonal,
539  const int * factorizationPivots,
540  const Scalar omega,
541  const ESweepDirection direction) const
542  {
543  const LO numLocalMeshRows =
544  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
545  const LO numVecs = static_cast<LO> (X.getNumVectors ());
546 
547  // If using (new) Kokkos, replace localMem with thread-local
548  // memory. Note that for larger block sizes, this will affect the
549  // two-level parallelization. Look to Stokhos for best practice
550  // on making this fast for GPUs.
551  const LO blockSize = getBlockSize ();
552  Teuchos::Array<impl_scalar_type> localMem (blockSize);
553  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
554  little_vec_type X_lcl (localMem.getRawPtr (), blockSize, 1);
555 
556  const LO * columnIndices;
557  Scalar * Dmat;
558  LO numIndices;
559 
560  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
561  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
562  if (direction == Forward) {
563  rowBegin = 1;
564  rowEnd = numLocalMeshRows+1;
565  rowStride = 1;
566  }
567  else if (direction == Backward) {
568  rowBegin = numLocalMeshRows;
569  rowEnd = 0;
570  rowStride = -1;
571  }
572  else if (direction == Symmetric) {
573  this->localGaussSeidel (B, X, factorizedDiagonal, factorizationPivots, omega, Forward);
574  this->localGaussSeidel (B, X, factorizedDiagonal, factorizationPivots, omega, Backward);
575  return;
576  }
577 
578  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
579  const Scalar minus_omega = -omega;
580 
581  if (numVecs == 1) {
582  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
583  const LO actlRow = lclRow - 1;
584 
585  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
586  X_lcl.assign (B_cur);
587  X_lcl.scale (omega);
588 
589  const size_t meshBeg = ptr_[actlRow];
590  const size_t meshEnd = ptr_[actlRow+1];
591  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
592  const LO meshCol = ind_[absBlkOff];
594  getConstLocalBlockFromAbsOffset (absBlkOff);
595 
596  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
597 
598  // X_lcl += alpha*A_cur*X_cur
599  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
600  X_lcl.matvecUpdate (alpha, A_cur, X_cur);
601  } // for each entry in the current local row of the matrx
602 
603  factorizedDiagonal.getLocalRowView (actlRow, columnIndices,
604  Dmat, numIndices);
605  little_block_type D_lcl =
606  getNonConstLocalBlockFromInput (reinterpret_cast<impl_scalar_type*> (Dmat), 0);
607 
608  D_lcl.solve (X_lcl, &factorizationPivots[actlRow*blockSize_]);
609  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
610  X_update.assign(X_lcl);
611  } // for each local row of the matrix
612  }
613  else {
614  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
615  for (LO j = 0; j < numVecs; ++j) {
616  LO actlRow = lclRow-1;
617 
618  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
619  X_lcl.assign (B_cur);
620  X_lcl.scale (omega);
621 
622  const size_t meshBeg = ptr_[actlRow];
623  const size_t meshEnd = ptr_[actlRow+1];
624  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
625  const LO meshCol = ind_[absBlkOff];
627  getConstLocalBlockFromAbsOffset (absBlkOff);
628 
629  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
630 
631  // X_lcl += alpha*A_cur*X_cur
632  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
633  X_lcl.matvecUpdate (alpha, A_cur, X_cur);
634  } // for each entry in the current local row of the matrx
635 
636  factorizedDiagonal.getLocalRowView (actlRow, columnIndices,
637  Dmat, numIndices);
638  little_block_type D_lcl =
639  getNonConstLocalBlockFromInput (reinterpret_cast<impl_scalar_type*> (Dmat), 0);
640 
641  D_lcl.solve (X_lcl, &factorizationPivots[actlRow*blockSize_]);
642 
643  little_vec_type X_update = X.getLocalBlock (actlRow, j);
644  X_update.assign(X_lcl);
645  } // for each entry in the current local row of the matrix
646  } // for each local row of the matrix
647  }
648  }
649 
650  template <class Scalar, class LO, class GO, class Node>
651  void
656  const Scalar& dampingFactor,
657  const ESweepDirection direction,
658  const int numSweeps,
659  const bool zeroInitialGuess) const
660  {
661  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
662  // interface for block Gauss-Seidel.
663  TEUCHOS_TEST_FOR_EXCEPTION(
664  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
665  "gaussSeidelCopy: Not implemented.");
666  }
667 
668  template <class Scalar, class LO, class GO, class Node>
669  void
674  const ArrayView<LO>& rowIndices,
675  const Scalar& dampingFactor,
676  const ESweepDirection direction,
677  const int numSweeps,
678  const bool zeroInitialGuess) const
679  {
680  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
681  // interface for block Gauss-Seidel.
682  TEUCHOS_TEST_FOR_EXCEPTION(
683  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
684  "reorderedGaussSeidelCopy: Not implemented.");
685  }
686 
687  template <class Scalar, class LO, class GO, class Node>
688  void
691  const Teuchos::ArrayView<const size_t>& offsets) const
692  {
693  using Teuchos::ArrayRCP;
694  using Teuchos::ArrayView;
695  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
696 
697  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
698  const LO* columnIndices;
699  Scalar* vals;
700  LO numColumns;
701  Teuchos::Array<LO> cols(1);
702 
703  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
704  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
705  for (size_t i = 0; i < myNumRows; ++i) {
706  cols[0] = i;
707  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
708  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
709  }
710  else {
711  getLocalRowView (i, columnIndices, vals, numColumns);
712  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
713  }
714  }
715  }
716 
717 
718  template<class Scalar, class LO, class GO, class Node>
719  LO
721  absMaxLocalValues (const LO localRowInd,
722  const LO colInds[],
723  const Scalar vals[],
724  const LO numColInds) const
725  {
726  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
727  // We modified no values, because the input local row index is
728  // invalid on the calling process. That may not be an error, if
729  // numColInds is zero anyway; it doesn't matter. This is the
730  // advantage of returning the number of valid indices.
731  return static_cast<LO> (0);
732  }
733  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
734 
735  const size_t absRowBlockOffset = ptr_[localRowInd];
736  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
737  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
738  size_t hint = 0; // Guess for the relative offset into the current row
739  size_t pointOffset = 0; // Current offset into input values
740  LO validCount = 0; // number of valid column indices in colInds
741 
742  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
743  const size_t relBlockOffset =
744  findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
745  if (relBlockOffset != STINV) {
746  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
747  little_block_type A_old =
748  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
750  getConstLocalBlockFromInput (vIn, pointOffset);
751  A_old.absmax (A_new);
752  hint = relBlockOffset + 1;
753  ++validCount;
754  }
755  }
756  return validCount;
757  }
758 
759 
760  template<class Scalar, class LO, class GO, class Node>
761  LO
763  sumIntoLocalValues (const LO localRowInd,
764  const LO colInds[],
765  const Scalar vals[],
766  const LO numColInds) const
767  {
768  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
769  // We modified no values, because the input local row index is
770  // invalid on the calling process. That may not be an error, if
771  // numColInds is zero anyway; it doesn't matter. This is the
772  // advantage of returning the number of valid indices.
773  return static_cast<LO> (0);
774  }
775  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
776 
777  const size_t absRowBlockOffset = ptr_[localRowInd];
778  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
779  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
780  size_t hint = 0; // Guess for the relative offset into the current row
781  size_t pointOffset = 0; // Current offset into input values
782  LO validCount = 0; // number of valid column indices in colInds
783 
784  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
785  const size_t relBlockOffset =
786  findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
787  if (relBlockOffset != STINV) {
788  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
789  little_block_type A_old =
790  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
792  getConstLocalBlockFromInput (vIn, pointOffset);
793  A_old.update (static_cast<impl_scalar_type> (STS::one ()), A_new);
794  hint = relBlockOffset + 1;
795  ++validCount;
796  }
797  }
798  return validCount;
799  }
800 
801  template<class Scalar, class LO, class GO, class Node>
802  LO
804  getLocalRowView (const LO localRowInd,
805  const LO*& colInds,
806  Scalar*& vals,
807  LO& numInds) const
808  {
809  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
810  colInds = NULL;
811  vals = NULL;
812  numInds = 0;
813  return Teuchos::OrdinalTraits<LO>::invalid ();
814  }
815  else {
816  const size_t absBlockOffsetStart = ptr_[localRowInd];
817  colInds = ind_ + absBlockOffsetStart;
818 
819  impl_scalar_type* const vOut = val_ + absBlockOffsetStart * offsetPerBlock ();
820  vals = reinterpret_cast<Scalar*> (vOut);
821 
822  numInds = ptr_[localRowInd + 1] - absBlockOffsetStart;
823  return 0; // indicates no error
824  }
825  }
826 
827  template<class Scalar, class LO, class GO, class Node>
828  void
830  getLocalRowCopy (LO LocalRow,
831  const Teuchos::ArrayView<LO>& Indices,
832  const Teuchos::ArrayView<Scalar>& Values,
833  size_t &NumEntries) const
834  {
835  const LO *colInds;
836  Scalar *vals;
837  LO numInds;
838  getLocalRowView(LocalRow,colInds,vals,numInds);
839  if (numInds > Indices.size() || numInds > Values.size()) {
840  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
841  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
842  << numInds << " row entries");
843  }
844  for (LO i=0; i<numInds; ++i) {
845  Indices[i] = colInds[i];
846  Values[i] = vals[i];
847  }
848  NumEntries = numInds;
849  }
850 
851  template<class Scalar, class LO, class GO, class Node>
852  LO
854  getLocalRowOffsets (const LO localRowInd,
855  ptrdiff_t offsets[],
856  const LO colInds[],
857  const LO numColInds) const
858  {
859  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
860  // We got no offsets, because the input local row index is
861  // invalid on the calling process. That may not be an error, if
862  // numColInds is zero anyway; it doesn't matter. This is the
863  // advantage of returning the number of valid indices.
864  return static_cast<LO> (0);
865  }
866 
867  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
868  size_t hint = 0; // Guess for the relative offset into the current row
869  LO validCount = 0; // number of valid column indices in colInds
870 
871  for (LO k = 0; k < numColInds; ++k) {
872  const size_t relBlockOffset =
873  findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
874  if (relBlockOffset != STINV) {
875  offsets[k] = relBlockOffset;
876  hint = relBlockOffset + 1;
877  ++validCount;
878  }
879  }
880  return validCount;
881  }
882 
883 
884  template<class Scalar, class LO, class GO, class Node>
885  LO
887  replaceLocalValuesByOffsets (const LO localRowInd,
888  const ptrdiff_t offsets[],
889  const Scalar vals[],
890  const LO numOffsets) const
891  {
892  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
893  // We modified no values, because the input local row index is
894  // invalid on the calling process. That may not be an error, if
895  // numColInds is zero anyway; it doesn't matter. This is the
896  // advantage of returning the number of valid indices.
897  return static_cast<LO> (0);
898  }
899  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
900 
901  const size_t absRowBlockOffset = ptr_[localRowInd];
902  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
903  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
904  size_t pointOffset = 0; // Current offset into input values
905  LO validCount = 0; // number of valid offsets
906 
907  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
908  const size_t relBlockOffset = offsets[k];
909  if (relBlockOffset != STINV) {
910  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
911  little_block_type A_old =
912  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
914  getConstLocalBlockFromInput (vIn, pointOffset);
915  A_old.assign (A_new);
916  ++validCount;
917  }
918  }
919  return validCount;
920  }
921 
922 
923  template<class Scalar, class LO, class GO, class Node>
924  LO
926  absMaxLocalValuesByOffsets (const LO localRowInd,
927  const ptrdiff_t offsets[],
928  const Scalar vals[],
929  const LO numOffsets) const
930  {
931  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
932  // We modified no values, because the input local row index is
933  // invalid on the calling process. That may not be an error, if
934  // numColInds is zero anyway; it doesn't matter. This is the
935  // advantage of returning the number of valid indices.
936  return static_cast<LO> (0);
937  }
938  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
939 
940  const size_t absRowBlockOffset = ptr_[localRowInd];
941  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
942  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
943  size_t pointOffset = 0; // Current offset into input values
944  LO validCount = 0; // number of valid offsets
945 
946  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
947  const size_t relBlockOffset = offsets[k];
948  if (relBlockOffset != STINV) {
949  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
950  little_block_type A_old =
951  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
953  getConstLocalBlockFromInput (vIn, pointOffset);
954  A_old.absmax (A_new);
955  ++validCount;
956  }
957  }
958  return validCount;
959  }
960 
961 
962  template<class Scalar, class LO, class GO, class Node>
963  LO
965  sumIntoLocalValuesByOffsets (const LO localRowInd,
966  const ptrdiff_t offsets[],
967  const Scalar vals[],
968  const LO numOffsets) const
969  {
970  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
971  // We modified no values, because the input local row index is
972  // invalid on the calling process. That may not be an error, if
973  // numColInds is zero anyway; it doesn't matter. This is the
974  // advantage of returning the number of valid indices.
975  return static_cast<LO> (0);
976  }
977  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
978 
979  const size_t absRowBlockOffset = ptr_[localRowInd];
980  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
981  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
982  size_t pointOffset = 0; // Current offset into input values
983  LO validCount = 0; // number of valid offsets
984 
985  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
986  const size_t relBlockOffset = offsets[k];
987  if (relBlockOffset != STINV) {
988  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
989  little_block_type A_old =
990  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
992  getConstLocalBlockFromInput (vIn, pointOffset);
993  A_old.update (static_cast<impl_scalar_type> (STS::one ()), A_new);
994  ++validCount;
995  }
996  }
997  return validCount;
998  }
999 
1000 
1001  template<class Scalar, class LO, class GO, class Node>
1002  size_t
1004  getNumEntriesInLocalRow (const LO localRowInd) const
1005  {
1006  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1007  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1008  return static_cast<LO> (0); // the calling process doesn't have that row
1009  } else {
1010  return static_cast<LO> (numEntInGraph);
1011  }
1012  }
1013 
1014  template<class Scalar, class LO, class GO, class Node>
1015  void
1019  const Teuchos::ETransp mode,
1020  const Scalar alpha,
1021  const Scalar beta)
1022  {
1023  (void) X;
1024  (void) Y;
1025  (void) mode;
1026  (void) alpha;
1027  (void) beta;
1028 
1029  TEUCHOS_TEST_FOR_EXCEPTION(
1030  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1031  "transpose and conjugate transpose modes are not yet implemented.");
1032  }
1033 
1034  template<class Scalar, class LO, class GO, class Node>
1035  void
1039  const Scalar alpha,
1040  const Scalar beta)
1041  {
1042  using Teuchos::RCP;
1043  using Teuchos::rcp;
1044  typedef Tpetra::Import<LO, GO, Node> import_type;
1045  typedef Tpetra::Export<LO, GO, Node> export_type;
1046  const Scalar zero = STS::zero ();
1047  const Scalar one = STS::one ();
1048  RCP<const import_type> import = graph_.getImporter ();
1049  // "export" is a reserved C++ keyword, so we can't use it.
1050  RCP<const export_type> theExport = graph_.getExporter ();
1051 
1052  // FIXME (mfh 20 May 2014) X.mv_ and Y.mv_ requires a friend
1053  // declaration, which is useful only for debugging.
1054  TEUCHOS_TEST_FOR_EXCEPTION(
1055  X.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1056  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1057  "The input BlockMultiVector X has deep copy semantics, "
1058  "not view semantics (as it should).");
1059  TEUCHOS_TEST_FOR_EXCEPTION(
1060  Y.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1061  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1062  "The output BlockMultiVector Y has deep copy semantics, "
1063  "not view semantics (as it should).");
1064 
1065  if (alpha == zero) {
1066  if (beta == zero) {
1067  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1068  } else if (beta != one) {
1069  Y.scale (beta);
1070  }
1071  } else { // alpha != 0
1072  const BMV* X_colMap = NULL;
1073  if (import.is_null ()) {
1074  try {
1075  X_colMap = &X;
1076  } catch (std::exception& e) {
1077  TEUCHOS_TEST_FOR_EXCEPTION(
1078  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1079  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1080  "operator= threw an exception: " << std::endl << e.what ());
1081  }
1082  } else {
1083  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1084  // Y_rowMap_ below. This lets us do lazy initialization
1085  // correctly with view semantics of BlockCrsMatrix. All views
1086  // of this BlockCrsMatrix have the same outer pointer. That
1087  // way, we can set the inner pointer in one view, and all
1088  // other views will see it.
1089  if ((*X_colMap_).is_null () ||
1090  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1091  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1092  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1093  static_cast<LO> (X.getNumVectors ())));
1094  }
1095  (**X_colMap_).doImport (X, *import, Tpetra::REPLACE);
1096  try {
1097  X_colMap = &(**X_colMap_);
1098  } catch (std::exception& e) {
1099  TEUCHOS_TEST_FOR_EXCEPTION(
1100  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1101  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1102  "operator= threw an exception: " << std::endl << e.what ());
1103  }
1104  }
1105 
1106  BMV* Y_rowMap = NULL;
1107  if (theExport.is_null ()) {
1108  Y_rowMap = &Y;
1109  } else if ((*Y_rowMap_).is_null () ||
1110  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1111  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1112  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1113  static_cast<LO> (X.getNumVectors ())));
1114  try {
1115  Y_rowMap = &(**Y_rowMap_);
1116  } catch (std::exception& e) {
1117  TEUCHOS_TEST_FOR_EXCEPTION(
1118  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1119  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1120  "operator= threw an exception: " << std::endl << e.what ());
1121  }
1122  }
1123 
1124  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1125 
1126  if (! theExport.is_null ()) {
1127  Y.doExport (*Y_rowMap, *theExport, Tpetra::REPLACE);
1128  }
1129  }
1130  }
1131 
1132  template<class Scalar, class LO, class GO, class Node>
1133  void
1137  const Scalar alpha,
1138  const Scalar beta)
1139  {
1140  // If using (new) Kokkos, prefer Kokkos::ArithTraits to
1141  // Teuchos::ScalarTraits.
1142  const Scalar zero = STS::zero ();
1143  const Scalar one = STS::one ();
1144  const LO numLocalMeshRows =
1145  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1146  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1147 
1148  // If using (new) Kokkos, replace localMem with thread-local
1149  // memory. Note that for larger block sizes, this will affect the
1150  // two-level parallelization. Look to Stokhos for best practice
1151  // on making this fast for GPUs.
1152  const LO blockSize = getBlockSize ();
1153  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1154  little_vec_type Y_lcl (localMem.getRawPtr (), blockSize, 1);
1155 
1156  if (numVecs == 1) {
1157  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
1158  little_vec_type Y_cur = Y.getLocalBlock (lclRow, 0);
1159 
1160  if (beta == zero) {
1161  Y_lcl.fill (zero);
1162  } else if (beta == one) {
1163  Y_lcl.assign (Y_cur);
1164  } else {
1165  Y_lcl.assign (Y_cur);
1166  Y_lcl.scale (beta);
1167  }
1168 
1169  const size_t meshBeg = ptr_[lclRow];
1170  const size_t meshEnd = ptr_[lclRow+1];
1171  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1172  const LO meshCol = ind_[absBlkOff];
1173  const_little_block_type A_cur =
1174  getConstLocalBlockFromAbsOffset (absBlkOff);
1175  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1176  // Y_lcl += alpha*A_cur*X_cur
1177  Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
1178  } // for each entry in the current local row of the matrx
1179 
1180  Y_cur.assign (Y_lcl);
1181  } // for each local row of the matrix
1182  }
1183  else {
1184  for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
1185  for (LO j = 0; j < numVecs; ++j) {
1186  little_vec_type Y_cur = Y.getLocalBlock (lclRow, j);
1187 
1188  if (beta == zero) {
1189  Y_lcl.fill (zero);
1190  } else if (beta == one) {
1191  Y_lcl.assign (Y_cur);
1192  } else {
1193  Y_lcl.assign (Y_cur);
1194  Y_lcl.scale (beta);
1195  }
1196 
1197  const size_t meshBeg = ptr_[lclRow];
1198  const size_t meshEnd = ptr_[lclRow+1];
1199  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1200  const LO meshCol = ind_[absBlkOff];
1201  const_little_block_type A_cur =
1202  getConstLocalBlockFromAbsOffset (absBlkOff);
1203  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1204  // Y_lcl += alpha*A_cur*X_cur
1205  Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
1206  } // for each entry in the current local row of the matrix
1207 
1208  Y_cur.assign (Y_lcl);
1209  } // for each entry in the current row of Y
1210  } // for each local row of the matrix
1211  }
1212  }
1213 
1214  template<class Scalar, class LO, class GO, class Node>
1215  size_t
1217  findRelOffsetOfColumnIndex (const LO localRowIndex,
1218  const LO colIndexToFind,
1219  const size_t hint) const
1220  {
1221  const size_t absStartOffset = ptr_[localRowIndex];
1222  const size_t absEndOffset = ptr_[localRowIndex+1];
1223  const size_t numEntriesInRow = absEndOffset - absStartOffset;
1224 
1225  // If the hint was correct, then the hint is the offset to return.
1226  if (hint < numEntriesInRow && ind_[absStartOffset+hint] == colIndexToFind) {
1227  // Always return the offset relative to the current row.
1228  return hint;
1229  }
1230 
1231  // The hint was wrong, so we must search for the given column
1232  // index in the column indices for the given row.
1233  size_t relOffset = Teuchos::OrdinalTraits<size_t>::invalid ();
1234 
1235  // We require that the graph have sorted rows. However, binary
1236  // search only pays if the current row is longer than a certain
1237  // amount. We set this to 32, but you might want to tune this.
1238  const size_t maxNumEntriesForLinearSearch = 32;
1239  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1240  // Use binary search. It would probably be better for us to
1241  // roll this loop by hand. If we wrote it right, a smart
1242  // compiler could perhaps use conditional loads and avoid
1243  // branches (according to Jed Brown on May 2014).
1244  const LO* beg = ind_ + absStartOffset;
1245  const LO* end = ind_ + absEndOffset;
1246  std::pair<const LO*, const LO*> p =
1247  std::equal_range (beg, end, colIndexToFind);
1248  if (p.first != p.second) {
1249  // offset is relative to the current row
1250  relOffset = static_cast<size_t> (p.first - beg);
1251  }
1252  }
1253  else { // use linear search
1254  for (size_t k = 0; k < numEntriesInRow; ++k) {
1255  if (colIndexToFind == ind_[absStartOffset + k]) {
1256  relOffset = k; // offset is relative to the current row
1257  break;
1258  }
1259  }
1260  }
1261 
1262  return relOffset;
1263  }
1264 
1265  template<class Scalar, class LO, class GO, class Node>
1266  LO
1268  offsetPerBlock () const
1269  {
1270  const LO numRows = blockSize_;
1271 
1272  LO numCols = blockSize_;
1273  if (columnPadding_ > 0) { // Column padding == 0 means no padding.
1274  const LO numColsRoundedDown = (blockSize_ / columnPadding_) * columnPadding_;
1275  numCols = (numColsRoundedDown < numCols) ?
1276  (numColsRoundedDown + columnPadding_) :
1277  numColsRoundedDown;
1278  }
1279  return numRows * numCols;
1280  }
1281 
1282  template<class Scalar, class LO, class GO, class Node>
1286  const size_t pointOffset) const
1287  {
1288  if (rowMajor_) {
1289  const size_t rowStride = (columnPadding_ == 0) ?
1290  static_cast<size_t> (blockSize_) : static_cast<size_t> (columnPadding_);
1291  return const_little_block_type (val + pointOffset, blockSize_, rowStride, 1);
1292  } else {
1293  return const_little_block_type (val + pointOffset, blockSize_, 1, blockSize_);
1294  }
1295  }
1296 
1297  template<class Scalar, class LO, class GO, class Node>
1301  const size_t pointOffset) const
1302  {
1303  if (rowMajor_) {
1304  const size_t rowStride = (columnPadding_ == 0) ?
1305  static_cast<size_t> (blockSize_) : static_cast<size_t> (columnPadding_);
1306  return little_block_type (val + pointOffset, blockSize_, rowStride, 1);
1307  } else {
1308  return little_block_type (val + pointOffset, blockSize_, 1, blockSize_);
1309  }
1310  }
1311 
1312  template<class Scalar, class LO, class GO, class Node>
1315  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1316  {
1317  if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
1318  // An empty block signifies an error. We don't expect to see
1319  // this error in correct code, but it's helpful for avoiding
1320  // memory corruption in case there is a bug.
1321  return const_little_block_type (NULL, 0, 0, 0);
1322  } else {
1323  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1324  return getConstLocalBlockFromInput (val_, absPointOffset);
1325  }
1326  }
1327 
1328  template<class Scalar, class LO, class GO, class Node>
1331  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1332  {
1333  if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
1334  // An empty block signifies an error. We don't expect to see
1335  // this error in correct code, but it's helpful for avoiding
1336  // memory corruption in case there is a bug.
1337  return little_block_type (NULL, 0, 0, 0);
1338  } else {
1339  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1340  return getNonConstLocalBlockFromInput (const_cast<impl_scalar_type*> (val_),
1341  absPointOffset);
1342  }
1343  }
1344 
1345  template<class Scalar, class LO, class GO, class Node>
1348  getLocalBlock (const LO localRowInd, const LO localColInd) const
1349  {
1350  const size_t absRowBlockOffset = ptr_[localRowInd];
1351 
1352  size_t hint = 0;
1353  const size_t relBlockOffset =
1354  findRelOffsetOfColumnIndex (localRowInd, localColInd, hint);
1355 
1356  if (relBlockOffset != Teuchos::OrdinalTraits<size_t>::invalid ()) {
1357  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1358  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1359  }
1360  else {
1361  return little_block_type (NULL, 0, 0, 0);
1362  }
1363  }
1364 
1365  // template<class Scalar, class LO, class GO, class Node>
1366  // void
1367  // BlockCrsMatrix<Scalar, LO, GO, Node>::
1368  // clearLocalErrorStateAndStream ()
1369  // {
1370  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
1371  // * (const_cast<this_type*> (this)->localError_) = false;
1372  // *errs_ = Teuchos::null;
1373  // }
1374 
1375  template<class Scalar, class LO, class GO, class Node>
1376  bool
1379  {
1380  using std::endl;
1382  const this_type* src = dynamic_cast<const this_type* > (&source);
1383 
1384  if (src == NULL) {
1385  std::ostream& err = markLocalErrorAndGetStream ();
1386  err << "checkSizes: The source object of the Import or Export "
1387  "must be a BlockCrsMatrix with the same template parameters as the "
1388  "target object." << endl;
1389  }
1390  else {
1391  // Use a string of ifs, not if-elseifs, because we want to know
1392  // all the errors.
1393  if (src->getBlockSize () != this->getBlockSize ()) {
1394  std::ostream& err = markLocalErrorAndGetStream ();
1395  err << "checkSizes: The source and target objects of the Import or "
1396  << "Export must have the same block sizes. The source's block "
1397  << "size = " << src->getBlockSize () << " != the target's block "
1398  << "size = " << this->getBlockSize () << "." << endl;
1399  }
1400  if (! src->graph_.isFillComplete ()) {
1401  std::ostream& err = markLocalErrorAndGetStream ();
1402  err << "checkSizes: The source object of the Import or Export is "
1403  "not fill complete. Both source and target objects must be fill "
1404  "complete." << endl;
1405  }
1406  if (! this->graph_.isFillComplete ()) {
1407  std::ostream& err = markLocalErrorAndGetStream ();
1408  err << "checkSizes: The target object of the Import or Export is "
1409  "not fill complete. Both source and target objects must be fill "
1410  "complete." << endl;
1411  }
1412  if (src->graph_.getColMap ().is_null ()) {
1413  std::ostream& err = markLocalErrorAndGetStream ();
1414  err << "checkSizes: The source object of the Import or Export does "
1415  "not have a column Map. Both source and target objects must have "
1416  "column Maps." << endl;
1417  }
1418  if (this->graph_.getColMap ().is_null ()) {
1419  std::ostream& err = markLocalErrorAndGetStream ();
1420  err << "checkSizes: The target object of the Import or Export does "
1421  "not have a column Map. Both source and target objects must have "
1422  "column Maps." << endl;
1423  }
1424  }
1425 
1426  return ! (* (this->localError_));
1427  }
1428 
1429  template<class Scalar, class LO, class GO, class Node>
1430  void
1433  size_t numSameIDs,
1434  const Teuchos::ArrayView<const LO>& permuteToLIDs,
1435  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
1436  {
1437  using std::endl;
1439  const bool debug = false;
1440 
1441  if (debug) {
1442  std::ostringstream os;
1443  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1444  os << "Proc " << myRank << ": copyAndPermute: "
1445  << "numSameIDs: " << numSameIDs
1446  << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
1447  << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
1448  << endl;
1449  std::cerr << os.str ();
1450  }
1451 
1452  // There's no communication in this method, so it's safe just to
1453  // return on error.
1454 
1455  if (* (this->localError_)) {
1456  std::ostream& err = this->markLocalErrorAndGetStream ();
1457  err << "copyAndPermute: The target object of the Import or Export is "
1458  "already in an error state." << endl;
1459  return;
1460  }
1461  if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
1462  std::ostream& err = this->markLocalErrorAndGetStream ();
1463  err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
1464  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
1465  << endl;
1466  return;
1467  }
1468 
1469  const this_type* src = dynamic_cast<const this_type* > (&source);
1470  if (src == NULL) {
1471  std::ostream& err = this->markLocalErrorAndGetStream ();
1472  err << "copyAndPermute: The source object of the Import or Export is "
1473  "either not a BlockCrsMatrix, or does not have the right template "
1474  "parameters. checkSizes() should have caught this. "
1475  "Please report this bug to the Tpetra developers." << endl;
1476  return;
1477  }
1478  if (* (src->localError_)) {
1479  std::ostream& err = this->markLocalErrorAndGetStream ();
1480  err << "copyAndPermute: The source object of the Import or Export is "
1481  "already in an error state." << endl;
1482  return;
1483  }
1484 
1485  bool lclErr = false;
1486 #ifdef HAVE_TPETRA_DEBUG
1487  std::set<LO> invalidSrcCopyRows;
1488  std::set<LO> invalidDstCopyRows;
1489  std::set<LO> invalidDstCopyCols;
1490  std::set<LO> invalidDstPermuteCols;
1491  std::set<LO> invalidPermuteFromRows;
1492 #endif // HAVE_TPETRA_DEBUG
1493 
1494  // Copy the initial sequence of rows that are the same.
1495  //
1496  // The two graphs might have different column Maps, so we need to
1497  // do this using global column indices. This is purely local, so
1498  // we only need to check for local sameness of the two column
1499  // Maps.
1500 
1501 #ifdef HAVE_TPETRA_DEBUG
1502  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1503 #endif // HAVE_TPETRA_DEBUG
1504  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1505  const map_type& srcColMap = * (src->graph_.getColMap ());
1506  const map_type& dstColMap = * (this->graph_.getColMap ());
1507  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1508  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());
1509 
1510  if (debug) {
1511  std::ostringstream os;
1512  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1513  os << "Proc " << myRank << ": copyAndPermute: "
1514  << "canUseLocalColumnIndices: "
1515  << (canUseLocalColumnIndices ? "true" : "false")
1516  << ", numPermute: " << numPermute
1517  << endl;
1518  std::cerr << os.str ();
1519  }
1520 
1521  if (canUseLocalColumnIndices) {
1522  // Copy local rows that are the "same" in both source and target.
1523  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1524 #ifdef HAVE_TPETRA_DEBUG
1525  if (! srcRowMap.isNodeLocalElement (localRow)) {
1526  lclErr = true;
1527  invalidSrcCopyRows.insert (localRow);
1528  continue; // skip invalid rows
1529  }
1530 #endif // HAVE_TPETRA_DEBUG
1531 
1532  const LO* lclSrcCols;
1533  Scalar* vals;
1534  LO numEntries;
1535  // If this call fails, that means the mesh row local index is
1536  // invalid. That means the Import or Export is invalid somehow.
1537  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
1538  if (err != 0) {
1539  lclErr = true;
1540 #ifdef HAVE_TPETRA_DEBUG
1541  (void) invalidSrcCopyRows.insert (localRow);
1542 #endif // HAVE_TPETRA_DEBUG
1543  }
1544  else {
1545  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
1546  if (err != numEntries) {
1547  lclErr = true;
1548  if (! dstRowMap.isNodeLocalElement (localRow)) {
1549 #ifdef HAVE_TPETRA_DEBUG
1550  invalidDstCopyRows.insert (localRow);
1551 #endif // HAVE_TPETRA_DEBUG
1552  }
1553  else {
1554  // Once there's an error, there's no sense in saving
1555  // time, so we check whether the column indices were
1556  // invalid. However, only remember which ones were
1557  // invalid in a debug build, because that might take a
1558  // lot of space.
1559  for (LO k = 0; k < numEntries; ++k) {
1560  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1561  lclErr = true;
1562 #ifdef HAVE_TPETRA_DEBUG
1563  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1564 #endif // HAVE_TPETRA_DEBUG
1565  }
1566  }
1567  }
1568  }
1569  }
1570  } // for each "same" local row
1571 
1572  // Copy the "permute" local rows.
1573  for (size_t k = 0; k < numPermute; ++k) {
1574  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
1575  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
1576 
1577  const LO* lclSrcCols;
1578  Scalar* vals;
1579  LO numEntries;
1580  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
1581  if (err != 0) {
1582  lclErr = true;
1583 #ifdef HAVE_TPETRA_DEBUG
1584  invalidPermuteFromRows.insert (srcLclRow);
1585 #endif // HAVE_TPETRA_DEBUG
1586  }
1587  else {
1588  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
1589  if (err != numEntries) {
1590  lclErr = true;
1591 #ifdef HAVE_TPETRA_DEBUG
1592  for (LO c = 0; c < numEntries; ++c) {
1593  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1594  invalidDstPermuteCols.insert (lclSrcCols[c]);
1595  }
1596  }
1597 #endif // HAVE_TPETRA_DEBUG
1598  }
1599  }
1600  }
1601  }
1602  else { // must convert column indices to global
1603  // Reserve space to store the destination matrix's local column indices.
1604  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
1605  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1606 
1607  // Copy local rows that are the "same" in both source and target.
1608  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1609  const LO* lclSrcCols;
1610  Scalar* vals;
1611  LO numEntries;
1612  // If this call fails, that means the mesh row local index is
1613  // invalid. That means the Import or Export is invalid somehow.
1614  LO err = 0;
1615  try {
1616  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
1617  } catch (std::exception& e) {
1618  if (debug) {
1619  std::ostringstream os;
1620  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1621  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1622  << localRow << ", src->getLocalRowView() threw an exception: "
1623  << e.what ();
1624  std::cerr << os.str ();
1625  }
1626  throw e;
1627  }
1628 
1629  if (err != 0) {
1630  lclErr = true;
1631 #ifdef HAVE_TPETRA_DEBUG
1632  invalidSrcCopyRows.insert (localRow);
1633 #endif // HAVE_TPETRA_DEBUG
1634  }
1635  else {
1636  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1637  lclErr = true;
1638  if (debug) {
1639  std::ostringstream os;
1640  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1641  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1642  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1643  << maxNumEnt << endl;
1644  std::cerr << os.str ();
1645  }
1646  }
1647  else {
1648  // Convert the source matrix's local column indices to the
1649  // destination matrix's local column indices.
1650  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1651  for (LO j = 0; j < numEntries; ++j) {
1652  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1653  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1654  lclErr = true;
1655 #ifdef HAVE_TPETRA_DEBUG
1656  invalidDstCopyCols.insert (lclDstColsView[j]);
1657 #endif // HAVE_TPETRA_DEBUG
1658  }
1659  }
1660  try {
1661  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
1662  } catch (std::exception& e) {
1663  if (debug) {
1664  std::ostringstream os;
1665  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1666  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1667  << localRow << ", this->replaceLocalValues() threw an exception: "
1668  << e.what ();
1669  std::cerr << os.str ();
1670  }
1671  throw e;
1672  }
1673  if (err != numEntries) {
1674  lclErr = true;
1675  if (debug) {
1676  std::ostringstream os;
1677  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1678  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1679  "localRow " << localRow << ", this->replaceLocalValues "
1680  "returned " << err << " instead of numEntries = "
1681  << numEntries << endl;
1682  std::cerr << os.str ();
1683  }
1684  }
1685  }
1686  }
1687  }
1688 
1689  // Copy the "permute" local rows.
1690  for (size_t k = 0; k < numPermute; ++k) {
1691  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
1692  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
1693 
1694  const LO* lclSrcCols;
1695  Scalar* vals;
1696  LO numEntries;
1697  LO err = 0;
1698  try {
1699  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
1700  } catch (std::exception& e) {
1701  if (debug) {
1702  std::ostringstream os;
1703  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1704  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1705  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1706  << ", src->getLocalRowView() threw an exception: " << e.what ();
1707  std::cerr << os.str ();
1708  }
1709  throw e;
1710  }
1711 
1712  if (err != 0) {
1713  lclErr = true;
1714 #ifdef HAVE_TPETRA_DEBUG
1715  invalidPermuteFromRows.insert (srcLclRow);
1716 #endif // HAVE_TPETRA_DEBUG
1717  }
1718  else {
1719  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1720  lclErr = true;
1721  }
1722  else {
1723  // Convert the source matrix's local column indices to the
1724  // destination matrix's local column indices.
1725  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1726  for (LO j = 0; j < numEntries; ++j) {
1727  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1728  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1729  lclErr = true;
1730 #ifdef HAVE_TPETRA_DEBUG
1731  invalidDstPermuteCols.insert (lclDstColsView[j]);
1732 #endif // HAVE_TPETRA_DEBUG
1733  }
1734  }
1735  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
1736  if (err != numEntries) {
1737  lclErr = true;
1738  }
1739  }
1740  }
1741  }
1742  }
1743 
1744  if (lclErr) {
1745  std::ostream& err = this->markLocalErrorAndGetStream ();
1746 #ifdef HAVE_TPETRA_DEBUG
1747  err << "copyAndPermute: The graph structure of the source of the "
1748  "Import or Export must be a subset of the graph structure of the "
1749  "target. ";
1750  err << "invalidSrcCopyRows = [";
1751  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1752  it != invalidSrcCopyRows.end (); ++it) {
1753  err << *it;
1754  typename std::set<LO>::const_iterator itp1 = it;
1755  itp1++;
1756  if (itp1 != invalidSrcCopyRows.end ()) {
1757  err << ",";
1758  }
1759  }
1760  err << "], invalidDstCopyRows = [";
1761  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1762  it != invalidDstCopyRows.end (); ++it) {
1763  err << *it;
1764  typename std::set<LO>::const_iterator itp1 = it;
1765  itp1++;
1766  if (itp1 != invalidDstCopyRows.end ()) {
1767  err << ",";
1768  }
1769  }
1770  err << "], invalidDstCopyCols = [";
1771  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1772  it != invalidDstCopyCols.end (); ++it) {
1773  err << *it;
1774  typename std::set<LO>::const_iterator itp1 = it;
1775  itp1++;
1776  if (itp1 != invalidDstCopyCols.end ()) {
1777  err << ",";
1778  }
1779  }
1780  err << "], invalidDstPermuteCols = [";
1781  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1782  it != invalidDstPermuteCols.end (); ++it) {
1783  err << *it;
1784  typename std::set<LO>::const_iterator itp1 = it;
1785  itp1++;
1786  if (itp1 != invalidDstPermuteCols.end ()) {
1787  err << ",";
1788  }
1789  }
1790  err << "], invalidPermuteFromRows = [";
1791  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1792  it != invalidPermuteFromRows.end (); ++it) {
1793  err << *it;
1794  typename std::set<LO>::const_iterator itp1 = it;
1795  itp1++;
1796  if (itp1 != invalidPermuteFromRows.end ()) {
1797  err << ",";
1798  }
1799  }
1800  err << "]" << std::endl;
1801 #else
1802  err << "copyAndPermute: The graph structure of the source of the "
1803  "Import or Export must be a subset of the graph structure of the "
1804  "target." << std::endl;
1805 #endif // HAVE_TPETRA_DEBUG
1806  }
1807 
1808  if (debug) {
1809  std::ostringstream os;
1810  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1811  const bool lclSuccess = ! (* (this->localError_));
1812  os << "*** Proc " << myRank << ": copyAndPermute "
1813  << (lclSuccess ? "succeeded" : "FAILED");
1814  if (lclSuccess) {
1815  os << endl;
1816  } else {
1817  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1818  }
1819  std::cerr << os.str ();
1820  }
1821  }
1822 
1823  namespace { // (anonymous)
1824 
1843  template<class LO, class GO, class D>
1844  size_t
1845  packRowCount (const size_t numEnt,
1846  const size_t numBytesPerValue,
1847  const size_t blkSize)
1848  {
1850 
1851  if (numEnt == 0) {
1852  // Empty rows always take zero bytes, to ensure sparsity.
1853  return 0;
1854  }
1855  else {
1856  // We store the number of entries as a local index (LO).
1857  LO numEntLO = 0; // packValueCount wants this.
1858  GO gid;
1859  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
1860  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
1861  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1862  return numEntLen + gidsLen + valsLen;
1863  }
1864  }
1865 
1876  template<class ST, class LO, class GO, class D>
1877  size_t
1878  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
1879  const size_t offset,
1880  const size_t numBytes,
1881  const size_t numBytesPerValue)
1882  {
1883  using Kokkos::subview;
1885  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
1886  typedef typename input_buffer_type::size_type size_type;
1887 
1888  if (numBytes == 0) {
1889  // Empty rows always take zero bytes, to ensure sparsity.
1890  return static_cast<size_t> (0);
1891  }
1892  else {
1893  LO numEntLO = 0;
1894  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
1895 #ifdef HAVE_TPETRA_DEBUG
1896  TEUCHOS_TEST_FOR_EXCEPTION(
1897  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1898  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1899  << ".");
1900 #endif // HAVE_TPETRA_DEBUG
1901  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
1902  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
1903  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
1904 #ifdef HAVE_TPETRA_DEBUG
1905  TEUCHOS_TEST_FOR_EXCEPTION(
1906  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1907  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1908  << ".");
1909 #else
1910  (void) actualNumBytes;
1911 #endif // HAVE_TPETRA_DEBUG
1912  return static_cast<size_t> (numEntLO);
1913  }
1914  }
1915 
1923  template<class ST, class LO, class GO, class D>
1924  size_t
1925  packRowForBlockCrs (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
1926  const size_t offset,
1927  const size_t numEnt,
1930  const size_t numBytesPerValue,
1931  const size_t blockSize)
1932  {
1933  using Kokkos::subview;
1935  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
1936  // the same, no matter what type we're packing. It's a reasonable
1937  // assumption, given that we go through the trouble of PackTraits
1938  // just so that we can pack data of different types in the same
1939  // buffer.
1940  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
1941  typedef typename output_buffer_type::size_type size_type;
1942  typedef typename std::pair<size_type, size_type> pair_type;
1943 
1944  if (numEnt == 0) {
1945  // Empty rows always take zero bytes, to ensure sparsity.
1946  return 0;
1947  }
1948  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1949 
1950  const GO gid = 0; // packValueCount wants this
1951  const LO numEntLO = static_cast<size_t> (numEnt);
1952 
1953  const size_t numEntBeg = offset;
1954  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
1955  const size_t gidsBeg = numEntBeg + numEntLen;
1956  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
1957  const size_t valsBeg = gidsBeg + gidsLen;
1958  const size_t valsLen = numScalarEnt * numBytesPerValue;
1959 
1960  output_buffer_type numEntOut =
1961  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
1962  output_buffer_type gidsOut =
1963  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
1964  output_buffer_type valsOut =
1965  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
1966 
1967  size_t numBytesOut = 0;
1968  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
1969  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
1970  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numScalarEnt);
1971 
1972  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1973  TEUCHOS_TEST_FOR_EXCEPTION(
1974  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
1975  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
1976  << expectedNumBytes << ".");
1977  return numBytesOut;
1978  }
1979 
1980  // Return the number of bytes actually read / used.
1981  template<class ST, class LO, class GO, class D>
1982  size_t
1983  unpackRowForBlockCrs (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
1986  const size_t offset,
1987  const size_t numBytes,
1988  const size_t numEnt,
1989  const size_t numBytesPerValue,
1990  const size_t blockSize)
1991  {
1992  using Kokkos::subview;
1994  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
1995  // the same, no matter what type we're unpacking. It's a
1996  // reasonable assumption, given that we go through the trouble of
1997  // PackTraits just so that we can pack data of different types in
1998  // the same buffer.
1999  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2000  typedef typename input_buffer_type::size_type size_type;
2001  typedef typename std::pair<size_type, size_type> pair_type;
2002 
2003  if (numBytes == 0) {
2004  // Rows with zero bytes always have zero entries.
2005  return 0;
2006  }
2007  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2008  TEUCHOS_TEST_FOR_EXCEPTION(
2009  static_cast<size_t> (imports.dimension_0 ()) <= offset,
2010  std::logic_error, "unpackRow: imports.dimension_0() = "
2011  << imports.dimension_0 () << " <= offset = " << offset << ".");
2012  TEUCHOS_TEST_FOR_EXCEPTION(
2013  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
2014  std::logic_error, "unpackRow: imports.dimension_0() = "
2015  << imports.dimension_0 () << " < offset + numBytes = "
2016  << (offset + numBytes) << ".");
2017 
2018  const GO gid = 0; // packValueCount wants this
2019  const LO lid = 0; // packValueCount wants this
2020 
2021  const size_t numEntBeg = offset;
2022  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2023  const size_t gidsBeg = numEntBeg + numEntLen;
2024  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2025  const size_t valsBeg = gidsBeg + gidsLen;
2026  const size_t valsLen = numScalarEnt * numBytesPerValue;
2027 
2028  input_buffer_type numEntIn =
2029  subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
2030  input_buffer_type gidsIn =
2031  subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
2032  input_buffer_type valsIn =
2033  subview (imports, pair_type (valsBeg, valsBeg + valsLen));
2034 
2035  size_t numBytesOut = 0;
2036  LO numEntOut;
2037  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2038  TEUCHOS_TEST_FOR_EXCEPTION(
2039  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2040  "unpackRow: Expected number of entries " << numEnt
2041  << " != actual number of entries " << numEntOut << ".");
2042 
2043  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
2044  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numScalarEnt);
2045 
2046  TEUCHOS_TEST_FOR_EXCEPTION(
2047  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2048  << numBytesOut << " != numBytes = " << numBytes << ".");
2049  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2050  TEUCHOS_TEST_FOR_EXCEPTION(
2051  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2052  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2053  << expectedNumBytes << ".");
2054  return numBytesOut;
2055  }
2056  } // namespace (anonymous)
2057 
2058  template<class Scalar, class LO, class GO, class Node>
2059  void
2061  packAndPrepare (const Tpetra::SrcDistObject& source,
2062  const Teuchos::ArrayView<const LO>& exportLIDs,
2063  Teuchos::Array<packet_type>& exports,
2064  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2065  size_t& constantNumPackets,
2066  Tpetra::Distributor& /* distor */)
2067  {
2068  using std::endl;
2070  using Kokkos::MemoryUnmanaged;
2071  using Kokkos::subview;
2072  using Kokkos::View;
2074  typedef typename Node::execution_space execution_space;
2075  typedef typename View<int*, execution_space>::HostMirror::execution_space HES;
2077  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2078  const bool debug = false;
2079 
2080  if (debug) {
2081  std::ostringstream os;
2082  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2083  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2084  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2085  << numPacketsPerLID.size () << endl;
2086  std::cerr << os.str ();
2087  }
2088 
2089  if (* (this->localError_)) {
2090  std::ostream& err = this->markLocalErrorAndGetStream ();
2091  err << "packAndPrepare: The target object of the Import or Export is "
2092  "already in an error state." << endl;
2093  return;
2094  }
2095 
2096  const this_type* src = dynamic_cast<const this_type* > (&source);
2097  // Should have checked for these cases in checkSizes().
2098  if (src == NULL) {
2099  std::ostream& err = this->markLocalErrorAndGetStream ();
2100  err << "packAndPrepare: The source (input) object of the Import or "
2101  "Export is either not a BlockCrsMatrix, or does not have the right "
2102  "template parameters. checkSizes() should have caught this. "
2103  "Please report this bug to the Tpetra developers." << endl;
2104  return;
2105  }
2106 
2107  const crs_graph_type& srcGraph = src->graph_;
2108  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2109  const size_type numExportLIDs = exportLIDs.size ();
2110 
2111  if (numExportLIDs != numPacketsPerLID.size ()) {
2112  std::ostream& err = this->markLocalErrorAndGetStream ();
2113  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2114  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2115  << "." << endl;
2116  return;
2117  }
2118 
2119  // Graphs and matrices are allowed to have a variable number of
2120  // entries per row. We could test whether all rows have the same
2121  // number of entries, but DistObject can only use this
2122  // optimization if all rows on _all_ processes have the same
2123  // number of entries. Rather than do the all-reduce necessary to
2124  // test for this unlikely case, we tell DistObject (by setting
2125  // constantNumPackets to zero) to assume that different rows may
2126  // have different numbers of entries.
2127  constantNumPackets = 0;
2128 
2129  // Compute the number of bytes ("packets") per row to pack. While
2130  // we're at it, compute the total # of block entries to send, and
2131  // the max # of block entries in any of the rows we're sending.
2132  size_t totalNumBytes = 0;
2133  size_t totalNumEntries = 0;
2134  size_t maxRowLength = 0;
2135  for (size_type i = 0; i < numExportLIDs; ++i) {
2136  const LO lclRow = exportLIDs[i];
2137  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2138  // If any given LIDs are invalid, the above might return either
2139  // zero or the invalid size_t value. If the former, we have no
2140  // way to tell, but that's OK; it just means the calling process
2141  // won't pack anything (it has nothing to pack anyway). If the
2142  // latter, we replace it with zero (that row is not owned by the
2143  // calling process, so it has no entries to pack).
2144  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2145  numEnt = 0;
2146  }
2147  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2148 
2149  // The 'if' branch implicitly assumes that packRowCount() returns
2150  // zero if numEnt == 0.
2151  size_t numBytesPerValue = 0;
2152  if (numEnt > 0) {
2153  // Get a locally indexed view of the current row's data. If
2154  // the current row has > 0 entries, we need an entry in order
2155  // to figure out the byte count of the packed row. (We really
2156  // only need it if ST's size is determined at run time.)
2157  Scalar* valsRaw = NULL;
2158  const LO* lidsRaw = NULL;
2159  LO actualNumEnt = 0;
2160  const LO errCode =
2161  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2162 
2163  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2164  std::ostream& err = this->markLocalErrorAndGetStream ();
2165  err << "packAndPrepare: Local row " << i << " claims to have " <<
2166  numEnt << "entry/ies, but the View returned by getLocalRowView() "
2167  "has " << actualNumEnt << " entry/ies. This should never happen. "
2168  "Please report this bug to the Tpetra developers." << endl;
2169  return;
2170  }
2171  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2172  std::ostream& err = this->markLocalErrorAndGetStream ();
2173  err << "packAndPrepare: Local row " << i << " is not in the row Map "
2174  "of the source object on the calling process." << endl;
2175  return;
2176  }
2177 
2178  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2179  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2180 
2181  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2182  // space here for now, this doesn't assume UVM. That may change
2183  // in the future, if we ever start packing on the device.
2184  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2185  }
2186 
2187  const size_t numBytes =
2188  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2189  numPacketsPerLID[i] = numBytes;
2190  totalNumBytes += numBytes;
2191  totalNumEntries += numEnt;
2192  maxRowLength = std::max (maxRowLength, numEnt);
2193  }
2194 
2195  if (debug) {
2196  const int myRank = graph_.getComm ()->getRank ();
2197  std::ostringstream os;
2198  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2199  << totalNumBytes << endl;
2200  std::cerr << os.str ();
2201  }
2202 
2203  // We use a "struct of arrays" approach to packing each row's
2204  // entries. All the column indices (as global indices) go first,
2205  // then all their owning process ranks, and then the values.
2206  exports.resize (totalNumBytes);
2207  if (totalNumEntries > 0) {
2208  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2209  totalNumBytes);
2210 
2211  // Current position (in bytes) in the 'exports' output array.
2212  size_t offset = 0;
2213 
2214  // For each block row of the matrix owned by the calling
2215  // process, pack that block row's column indices and values into
2216  // the exports array.
2217 
2218  // Source matrix's column Map. We verified in checkSizes() that
2219  // the column Map exists (is not null).
2220  const map_type& srcColMap = * (srcGraph.getColMap ());
2221 
2222  // Temporary buffer for global column indices.
2223  View<GO*, HES> gblColInds;
2224  {
2225  GO gid = 0;
2226  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2227  }
2228 
2229  // Pack the data for each row to send, into the 'exports' buffer.
2230  for (size_type i = 0; i < numExportLIDs; ++i) {
2231  const LO lclRowInd = exportLIDs[i];
2232  const LO* lclColIndsRaw;
2233  Scalar* valsRaw;
2234  LO numEntLO;
2235  // It's OK to ignore the return value, since if the calling
2236  // process doesn't own that local row, then the number of
2237  // entries in that row on the calling process is zero.
2238  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2239  const size_t numEnt = static_cast<size_t> (numEntLO);
2240  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2241  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2242  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2243  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2244 
2245  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2246  // space here for now, this doesn't assume UVM. That may
2247  // change in the future, if we ever start packing on device.
2248  const size_t numBytesPerValue = numEnt == 0 ?
2249  static_cast<size_t> (0) :
2250  PackTraits<ST, HES>::packValueCount (vals(0));
2251 
2252  // Convert column indices from local to global.
2253  for (size_t j = 0; j < numEnt; ++j) {
2254  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2255  }
2256 
2257  // Copy the row's data into the current spot in the exports array.
2258  const size_t numBytes =
2259  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2260  vals, numBytesPerValue, blockSize);
2261  // Keep track of how many bytes we packed.
2262  offset += numBytes;
2263  } // for each LID (of a row) to send
2264 
2265  if (offset != totalNumBytes) {
2266  std::ostream& err = this->markLocalErrorAndGetStream ();
2267  err << "packAndPreapre: At end of method, the final offset (in bytes) "
2268  << offset << " does not equal the total number of bytes packed "
2269  << totalNumBytes << ". "
2270  << "Please report this bug to the Tpetra developers." << endl;
2271  return;
2272  }
2273  } // if totalNumEntries > 0
2274 
2275  if (debug) {
2276  std::ostringstream os;
2277  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2278  const bool lclSuccess = ! (* (this->localError_));
2279  os << "*** Proc " << myRank << ": packAndPrepare "
2280  << (lclSuccess ? "succeeded" : "FAILED")
2281  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
2282  std::cerr << os.str ();
2283  }
2284  }
2285 
2286 
2287  template<class Scalar, class LO, class GO, class Node>
2288  void
2290  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
2291  const Teuchos::ArrayView<const packet_type>& imports,
2292  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2293  size_t /* constantNumPackets */, // not worthwhile to use this
2294  Tpetra::Distributor& /* distor */,
2296  {
2297  using std::endl;
2299  using Kokkos::MemoryUnmanaged;
2300  using Kokkos::subview;
2301  using Kokkos::View;
2303  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2304  typedef typename Node::execution_space execution_space;
2305  typedef typename View<int*, execution_space>::HostMirror::execution_space HES;
2306  typedef std::pair<typename View<int*, HES>::size_type,
2307  typename View<int*, HES>::size_type> pair_type;
2308  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
2309  typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
2310  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
2311  typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
2312  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
2313  const bool debug = false;
2314 
2315  if (debug) {
2316  std::ostringstream os;
2317  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2318  os << "Proc " << myRank << ": unpackAndCombine" << endl;
2319  std::cerr << os.str ();
2320  }
2321 
2322  // It should not cause deadlock to return on error in this method,
2323  // since this method does not communicate.
2324 
2325  if (* (this->localError_)) {
2326  std::ostream& err = this->markLocalErrorAndGetStream ();
2327  err << prefix << "The target object of the Import or Export is "
2328  "already in an error state." << endl;
2329  return;
2330  }
2331  if (importLIDs.size () != numPacketsPerLID.size ()) {
2332  std::ostream& err = this->markLocalErrorAndGetStream ();
2333  err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
2334  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
2335  return;
2336  }
2337  if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
2338  std::ostream& err = this->markLocalErrorAndGetStream ();
2339  err << prefix << "Invalid CombineMode value " << CM << ". Valid "
2340  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2341  << endl;
2342  return;
2343  }
2344 
2345  // Target matrix's column Map. Use to convert the global column
2346  // indices in the receive buffer to local indices. We verified in
2347  // checkSizes() that the column Map exists (is not null).
2348  const map_type& tgtColMap = * (this->graph_.getColMap ());
2349 
2350  const size_type numImportLIDs = importLIDs.size ();
2351  if (CM == ZERO || numImportLIDs == 0) {
2352  if (debug) {
2353  std::ostringstream os;
2354  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2355  os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
2356  std::cerr << os.str ();
2357  }
2358  return; // nothing to do; no need to combine entries
2359  }
2360 
2361  if (debug) {
2362  std::ostringstream os;
2363  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2364  os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
2365  std::cerr << os.str ();
2366  }
2367 
2368  input_buffer_type importsK (imports.getRawPtr (), imports.size ());
2369  const size_t blockSize = this->getBlockSize ();
2370  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
2371  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2372 
2373  size_t numBytesPerValue;
2374  {
2375  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
2376  // run-time size? We already assume that all entries in both
2377  // the source and target matrices have the same size. If the
2378  // calling process owns at least one entry in either matrix, we
2379  // can use that entry to set the size. However, it is possible
2380  // that the calling process owns no entries. In that case,
2381  // we're in trouble. One way to fix this would be for each
2382  // row's data to contain the run-time size. This is only
2383  // necessary if the size is not a compile-time constant.
2384  Scalar val;
2385  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
2386  }
2387 
2388  // Temporary space to cache incoming global column indices and
2389  // values. Column indices come in as global indices, in case the
2390  // source object's column Map differs from the target object's
2391  // (this's) column Map.
2392  View<GO*, HES> gblColInds;
2393  View<LO*, HES> lclColInds;
2394  View<ST*, HES> vals;
2395  {
2396  GO gid = 0;
2397  LO lid = 0;
2398  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
2399  // run-time size? We already assume that all entries in both
2400  // the source and target matrices have the same size. If the
2401  // calling process owns at least one entry in either matrix, we
2402  // can use that entry to set the size. However, it is possible
2403  // that the calling process owns no entries. In that case,
2404  // we're in trouble. One way to fix this would be for each
2405  // row's data to contain the run-time size. This is only
2406  // necessary if the size is not a compile-time constant.
2407  Scalar val;
2408  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
2409  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
2410  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
2411  }
2412 
2413  size_t offset = 0;
2414  for (size_type i = 0; i < numImportLIDs; ++i) {
2415  const size_t numBytes = numPacketsPerLID[i];
2416  if (numBytes == 0) {
2417  // Empty buffer for that row means that the row is empty.
2418  continue;
2419  }
2420  const size_t numEnt =
2421  unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes, numBytesPerValue);
2422  if (numEnt > maxRowNumEnt) {
2423  std::ostream& err = this->markLocalErrorAndGetStream ();
2424  err << prefix << "At i = " << i << ", numEnt = " << numEnt
2425  << " > maxRowNumEnt = " << maxRowNumEnt << endl;
2426  continue;
2427  }
2428 
2429  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2430  const LO lclRow = importLIDs[i];
2431 
2432  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
2433  vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));
2434 
2435  const size_t numBytesOut =
2436  unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK, offset, numBytes,
2437  numEnt, numBytesPerValue, blockSize);
2438  if (numBytes != numBytesOut) {
2439  std::ostream& err = this->markLocalErrorAndGetStream ();
2440  err << prefix << "At i = " << i << ", numBytes = " << numBytes
2441  << " != numBytesOut = " << numBytesOut << ".";
2442  continue;
2443  }
2444 
2445  // Convert incoming global indices to local indices.
2446  lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
2447  for (size_t k = 0; k < numEnt; ++k) {
2448  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2449 #ifdef HAVE_TPETRA_DEBUG
2450  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2451  std::ostream& err = this->markLocalErrorAndGetStream ();
2452  err << prefix << "At i = " << i << ", GID " << gidsOut(k)
2453  << " is not owned by the calling process.";
2454  continue;
2455  }
2456 #endif // HAVE_TPETRA_DEBUG
2457  }
2458 
2459  // Combine the incoming data with the matrix's current data.
2460  LO numCombd = 0;
2461  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.ptr_on_device ());
2462  const Scalar* const valsRaw =
2463  reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.ptr_on_device ()));
2464  if (CM == ADD) {
2465  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2466  } else if (CM == INSERT || CM == REPLACE) {
2467  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2468  } else if (CM == ABSMAX) {
2469  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2470  }
2471 #ifdef HAVE_TPETRA_DEBUG
2472  if (static_cast<LO> (numEnt) != numCombd) {
2473  std::ostream& err = this->markLocalErrorAndGetStream ();
2474  err << prefix << "At i = " << i << ", numEnt = " << numEnt
2475  << " != numCombd = " << numCombd << ".";
2476  continue;
2477  }
2478 #else
2479  (void) numCombd; // ignore, just for now
2480 #endif // HAVE_TPETRA_DEBUG
2481 
2482  // Don't update offset until current LID has succeeded.
2483  offset += numBytes;
2484  } // for each import LID i
2485 
2486  if (debug) {
2487  std::ostringstream os;
2488  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2489  const bool lclSuccess = ! (* (this->localError_));
2490  os << "*** Proc " << myRank << ": unpackAndCombine "
2491  << (lclSuccess ? "succeeded" : "FAILED")
2492  << " ***" << endl;
2493  std::cerr << os.str ();
2494  }
2495  }
2496 
2497 
2498  template<class Scalar, class LO, class GO, class Node>
2499  std::string
2501  {
2502  using Teuchos::TypeNameTraits;
2503  std::ostringstream os;
2504  os << "\"Tpetra::BlockCrsMatrix\": { "
2505  << "Template parameters: { "
2506  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2507  << "LO: " << TypeNameTraits<LO>::name ()
2508  << "GO: " << TypeNameTraits<GO>::name ()
2509  << "Node: " << TypeNameTraits<Node>::name ()
2510  << " }"
2511  << ", Label: \"" << this->getObjectLabel () << "\""
2512  << ", Global dimensions: ["
2513  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2514  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2515  << ", Block size: " << getBlockSize ()
2516  << " }";
2517  return os.str ();
2518  }
2519 
2520 
2521  template<class Scalar, class LO, class GO, class Node>
2522  void
2524  describe (Teuchos::FancyOStream& out,
2525  const Teuchos::EVerbosityLevel verbLevel) const
2526  {
2527  using Teuchos::ArrayRCP;
2528  using Teuchos::CommRequest;
2529  using Teuchos::FancyOStream;
2530  using Teuchos::getFancyOStream;
2531  using Teuchos::ireceive;
2532  using Teuchos::isend;
2533  using Teuchos::outArg;
2534  using Teuchos::TypeNameTraits;
2535  using Teuchos::VERB_DEFAULT;
2536  using Teuchos::VERB_NONE;
2537  using Teuchos::VERB_LOW;
2538  using Teuchos::VERB_MEDIUM;
2539  // using Teuchos::VERB_HIGH;
2540  using Teuchos::VERB_EXTREME;
2541  using Teuchos::RCP;
2542  using Teuchos::wait;
2543  using std::endl;
2544 
2545  // Set default verbosity if applicable.
2546  const Teuchos::EVerbosityLevel vl =
2547  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2548 
2549  if (vl == VERB_NONE) {
2550  return; // print nothing
2551  }
2552 
2553  // describe() always starts with a tab before it prints anything.
2554  Teuchos::OSTab tab0 (out);
2555 
2556  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2557  Teuchos::OSTab tab1 (out);
2558 
2559  out << "Template parameters:" << endl;
2560  {
2561  Teuchos::OSTab tab2 (out);
2562  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2563  << "LO: " << TypeNameTraits<LO>::name () << endl
2564  << "GO: " << TypeNameTraits<GO>::name () << endl
2565  << "Node: " << TypeNameTraits<Node>::name () << endl;
2566  }
2567  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2568  << "Global dimensions: ["
2569  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2570  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2571 
2572  const LO blockSize = getBlockSize ();
2573  out << "Block size: " << blockSize << endl;
2574 
2575  // constituent objects
2576  if (vl >= VERB_MEDIUM) {
2577  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2578  const int myRank = comm.getRank ();
2579  if (myRank == 0) {
2580  out << "Row Map:" << endl;
2581  }
2582  getRowMap()->describe (out, vl);
2583 
2584  if (! getColMap ().is_null ()) {
2585  if (getColMap() == getRowMap()) {
2586  if (myRank == 0) {
2587  out << "Column Map: same as row Map" << endl;
2588  }
2589  }
2590  else {
2591  if (myRank == 0) {
2592  out << "Column Map:" << endl;
2593  }
2594  getColMap ()->describe (out, vl);
2595  }
2596  }
2597  if (! getDomainMap ().is_null ()) {
2598  if (getDomainMap () == getRowMap ()) {
2599  if (myRank == 0) {
2600  out << "Domain Map: same as row Map" << endl;
2601  }
2602  }
2603  else if (getDomainMap () == getColMap ()) {
2604  if (myRank == 0) {
2605  out << "Domain Map: same as column Map" << endl;
2606  }
2607  }
2608  else {
2609  if (myRank == 0) {
2610  out << "Domain Map:" << endl;
2611  }
2612  getDomainMap ()->describe (out, vl);
2613  }
2614  }
2615  if (! getRangeMap ().is_null ()) {
2616  if (getRangeMap () == getDomainMap ()) {
2617  if (myRank == 0) {
2618  out << "Range Map: same as domain Map" << endl;
2619  }
2620  }
2621  else if (getRangeMap () == getRowMap ()) {
2622  if (myRank == 0) {
2623  out << "Range Map: same as row Map" << endl;
2624  }
2625  }
2626  else {
2627  if (myRank == 0) {
2628  out << "Range Map: " << endl;
2629  }
2630  getRangeMap ()->describe (out, vl);
2631  }
2632  }
2633  }
2634 
2635  if (vl >= VERB_EXTREME) {
2636  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2637  const int myRank = comm.getRank ();
2638  const int numProcs = comm.getSize ();
2639 
2640  // Print the calling process' data to the given output stream.
2641  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2642  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2643  FancyOStream& os = *osPtr;
2644  os << "Process " << myRank << ":" << endl;
2645  Teuchos::OSTab tab2 (os);
2646 
2647  const map_type& meshRowMap = * (graph_.getRowMap ());
2648  const map_type& meshColMap = * (graph_.getColMap ());
2649  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2650  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2651  ++meshLclRow) {
2652  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2653  os << "Row " << meshGblRow << ": {";
2654 
2655  const LO* lclColInds = NULL;
2656  Scalar* vals = NULL;
2657  LO numInds = 0;
2658  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
2659 
2660  for (LO k = 0; k < numInds; ++k) {
2661  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2662 
2663  os << "Col " << gblCol << ": [";
2664  for (LO i = 0; i < blockSize; ++i) {
2665  for (LO j = 0; j < blockSize; ++j) {
2666  os << vals[blockSize*blockSize*k + i*blockSize + j];
2667  if (j + 1 < blockSize) {
2668  os << ", ";
2669  }
2670  }
2671  if (i + 1 < blockSize) {
2672  os << "; ";
2673  }
2674  }
2675  os << "]";
2676  if (k + 1 < numInds) {
2677  os << ", ";
2678  }
2679  }
2680  os << "}" << endl;
2681  }
2682 
2683  // Print data on Process 0. This will automatically respect the
2684  // current indentation level.
2685  if (myRank == 0) {
2686  out << lclOutStrPtr->str ();
2687  lclOutStrPtr = Teuchos::null; // clear it to save space
2688  }
2689 
2690  const int sizeTag = 1337;
2691  const int dataTag = 1338;
2692 
2693  ArrayRCP<char> recvDataBuf; // only used on Process 0
2694 
2695  // Send string sizes and data from each process in turn to
2696  // Process 0, and print on that process.
2697  for (int p = 1; p < numProcs; ++p) {
2698  if (myRank == 0) {
2699  // Receive the incoming string's length.
2700  ArrayRCP<size_t> recvSize (1);
2701  recvSize[0] = 0;
2702  RCP<CommRequest<int> > recvSizeReq =
2703  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2704  wait<int> (comm, outArg (recvSizeReq));
2705  const size_t numCharsToRecv = recvSize[0];
2706 
2707  // Allocate space for the string to receive. Reuse receive
2708  // buffer space if possible. We can do this because in the
2709  // current implementation, we only have one receive in
2710  // flight at a time. Leave space for the '\0' at the end,
2711  // in case the sender doesn't send it.
2712  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2713  recvDataBuf.resize (numCharsToRecv + 1);
2714  }
2715  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2716  // Post the receive of the actual string data.
2717  RCP<CommRequest<int> > recvDataReq =
2718  ireceive<int, char> (recvData, p, dataTag, comm);
2719  wait<int> (comm, outArg (recvDataReq));
2720 
2721  // Print the received data. This will respect the current
2722  // indentation level. Make sure that the string is
2723  // null-terminated.
2724  recvDataBuf[numCharsToRecv] = '\0';
2725  out << recvDataBuf.getRawPtr ();
2726  }
2727  else if (myRank == p) { // if I am not Process 0, and my rank is p
2728  // This deep-copies the string at most twice, depending on
2729  // whether std::string reference counts internally (it
2730  // generally does, so this won't deep-copy at all).
2731  const std::string stringToSend = lclOutStrPtr->str ();
2732  lclOutStrPtr = Teuchos::null; // clear original to save space
2733 
2734  // Send the string's length to Process 0.
2735  const size_t numCharsToSend = stringToSend.size ();
2736  ArrayRCP<size_t> sendSize (1);
2737  sendSize[0] = numCharsToSend;
2738  RCP<CommRequest<int> > sendSizeReq =
2739  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2740  wait<int> (comm, outArg (sendSizeReq));
2741 
2742  // Send the actual string to Process 0. We know that the
2743  // string has length > 0, so it's save to take the address
2744  // of the first entry. Make a nonowning ArrayRCP to hold
2745  // the string. Process 0 will add a null termination
2746  // character at the end of the string, after it receives the
2747  // message.
2748  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2749  RCP<CommRequest<int> > sendDataReq =
2750  isend<int, char> (sendData, 0, dataTag, comm);
2751  wait<int> (comm, outArg (sendDataReq));
2752  }
2753  } // for each process rank p other than 0
2754  } // extreme verbosity level (print the whole matrix)
2755  }
2756 
2757  template<class Scalar, class LO, class GO, class Node>
2758  Teuchos::RCP<const Teuchos::Comm<int> >
2760  getComm() const
2761  {
2762  return graph_.getComm();
2763  }
2764 
2765  template<class Scalar, class LO, class GO, class Node>
2766  Teuchos::RCP<Node>
2768  getNode() const
2769  {
2770  return graph_.getNode();
2771 
2772  }
2773 
2774  template<class Scalar, class LO, class GO, class Node>
2778  {
2779  return graph_.getGlobalNumCols();
2780  }
2781 
2782  template<class Scalar, class LO, class GO, class Node>
2783  size_t
2786  {
2787  return graph_.getNodeNumCols();
2788  }
2789 
2790  template<class Scalar, class LO, class GO, class Node>
2791  GO
2794  {
2795  return graph_.getIndexBase();
2796  }
2797 
2798  template<class Scalar, class LO, class GO, class Node>
2802  {
2803  return graph_.getGlobalNumEntries();
2804  }
2805 
2806  template<class Scalar, class LO, class GO, class Node>
2807  size_t
2810  {
2811  return graph_.getNodeNumEntries();
2812  }
2813 
2814  template<class Scalar, class LO, class GO, class Node>
2815  size_t
2817  getNumEntriesInGlobalRow (GO globalRow) const
2818  {
2819  return graph_.getNumEntriesInGlobalRow(globalRow);
2820  }
2821 
2822  template<class Scalar, class LO, class GO, class Node>
2826  {
2827  return getGlobalNumDiags();
2828  }
2829 
2830  template<class Scalar, class LO, class GO, class Node>
2831  size_t
2834  {
2835  return getNodeNumDiags();
2836  }
2837 
2838  template<class Scalar, class LO, class GO, class Node>
2839  size_t
2842  {
2843  return graph_.getGlobalMaxNumRowEntries();
2844  }
2845 
2846  template<class Scalar, class LO, class GO, class Node>
2847  bool
2849  hasColMap() const
2850  {
2851  return graph_.hasColMap();
2852  }
2853 
2854  template<class Scalar, class LO, class GO, class Node>
2855  bool
2858  {
2859  return graph_.isLowerTriangular();
2860  }
2861 
2862  template<class Scalar, class LO, class GO, class Node>
2863  bool
2866  {
2867  return graph_.isUpperTriangular();
2868  }
2869 
2870  template<class Scalar, class LO, class GO, class Node>
2871  bool
2874  {
2875  return graph_.isLocallyIndexed();
2876  }
2877 
2878  template<class Scalar, class LO, class GO, class Node>
2879  bool
2882  {
2883  return graph_.isGloballyIndexed();
2884  }
2885 
2886  template<class Scalar, class LO, class GO, class Node>
2887  bool
2890  {
2891  return true;
2892  }
2893 
2894  template<class Scalar, class LO, class GO, class Node>
2895  bool
2898  {
2899  return true;
2900  }
2901 
2902 
2903  template<class Scalar, class LO, class GO, class Node>
2904  void
2906  getGlobalRowCopy (GO GlobalRow,
2907  const Teuchos::ArrayView<GO> &Indices,
2908  const Teuchos::ArrayView<Scalar> &Values,
2909  size_t &NumEntries) const
2910  {
2911  TEUCHOS_TEST_FOR_EXCEPTION(
2912  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
2913  "This class doesn't support global matrix indexing.");
2914 
2915  }
2916 
2917  template<class Scalar, class LO, class GO, class Node>
2918  void
2920  getGlobalRowView (GO GlobalRow,
2921  ArrayView<const GO> &indices,
2922  ArrayView<const Scalar> &values) const
2923  {
2924  TEUCHOS_TEST_FOR_EXCEPTION(
2925  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
2926  "This class doesn't support global matrix indexing.");
2927 
2928  }
2929 
2930  template<class Scalar, class LO, class GO, class Node>
2931  void
2933  getLocalRowView (LO LocalRow,
2934  ArrayView<const LO> &indices,
2935  ArrayView<const Scalar> &values) const
2936  {
2937  TEUCHOS_TEST_FOR_EXCEPTION(
2938  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
2939  "This class doesn't support global matrix indexing.");
2940 
2941  }
2942 
2943 
2944  template<class Scalar, class LO, class GO, class Node>
2945  void
2948  {
2949  TEUCHOS_TEST_FOR_EXCEPTION(
2950  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: "
2951  "not implemented.");
2952 
2953  }
2954 
2955  template<class Scalar, class LO, class GO, class Node>
2956  void
2959  {
2960  TEUCHOS_TEST_FOR_EXCEPTION(
2961  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
2962  "not implemented.");
2963 
2964  }
2965 
2966  template<class Scalar, class LO, class GO, class Node>
2967  void
2970  {
2971  TEUCHOS_TEST_FOR_EXCEPTION(
2972  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
2973  "not implemented.");
2974 
2975  }
2976 
2977  template<class Scalar, class LO, class GO, class Node>
2978  Teuchos::RCP<const Tpetra::RowGraph<LO, GO, Node> >
2980  getGraph() const
2981  {
2982  return graphRCP_;
2983  }
2984 
2985  template<class Scalar, class LO, class GO, class Node>
2989  {
2990  TEUCHOS_TEST_FOR_EXCEPTION(
2991  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
2992  "not implemented.");
2993  }
2994 
2995 } // namespace Experimental
2996 } // namespace Tpetra
2997 
2998 //
2999 // Explicit instantiation macro
3000 //
3001 // Must be expanded from within the Tpetra namespace!
3002 //
3003 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3004  template class Experimental::BlockCrsMatrix< S, LO, GO, NODE >;
3005 
3006 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
Teuchos::ArrayRCP< const LocalOrdinal > getNodePackedIndices() const
Get an Teuchos::ArrayRCP of the packed column-indices.
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
bool isGloballyIndexed() const
If graph indices are in the global range, this function returns true. Otherwise, this function return...
Teuchos::RCP< const Comm< int > > getComm() const
Returns the communicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void copyAndPermute(const Tpetra::SrcDistObject &source, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Perform copies and permutations that are local to this process.
virtual bool isLowerTriangular() const
Whether this matrix is lower triangular.
device_type::execution_space execution_space
The Kokkos execution space.
Teuchos::RCP< const map_type > getRowMap() const
Returns the Map that describes the row distribution in this graph.
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
LO getBlockSize() const
The number of degrees of freedom per mesh point.
void fill(const Scalar &alpha) const
(*this)(i,j) := alpha for all (i,j).
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Teuchos::RCP< const map_type > getRangeMap() const
Returns the Map associated with the domain of this graph.
mv_type mv_
The Tpetra::MultiVector used to represent the data.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given column indices, in the given row.
size_t getNodeNumCols() const
Returns the number of columns connected to the locally owned rows of this graph.
local_graph_type getLocalGraph() const
Get the local graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
bool hasColMap() const
Whether the graph has a column Map.
void putScalar(const Scalar &val)
Fill all entries with the given value val.
void matvecUpdate(const Scalar &alpha, const LittleBlockType &A, const LittleVectorType &X) const
(*this) := (*this) + alpha * A * X (matrix-vector multiply).
size_t getGlobalMaxNumRowEntries() const
Maximum number of entries in all rows over all processes.
global_size_t getGlobalNumRows() const
Returns the number of global rows in the graph.
size_t getNodeNumRows() const
get the local number of block rows
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
One or more distributed dense vectors.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
MultiVector for multiple degrees of freedom per mesh point.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
global_size_t getGlobalNumRows() const
get the global number of block rows
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Teuchos::RCP< const import_type > getImporter() const
Returns the importer associated with this graph.
void absmax(const LittleBlockType &X) const
(*this)(i,j) := max(abs((*this)(i,j)), abs(X(i,j))) for all (i,j).
Nonowning view of a square dense block in a block matrix.
LocalOrdinal getMinLocalIndex() const
The minimum local index.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
virtual bool isUpperTriangular() const
Whether this matrix is upper triangular.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
bool locallySameAs(const Map< LocalOrdinal, GlobalOrdinal, node_type > &map) const
Is the given Map locally the same as the input Map?
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in all rows owned by the calling process.
void assign(const LittleVectorType &X) const
*this := X.
void scale(const Scalar &val)
Multiply all entries in place by the given value val.
virtual bool checkSizes(const Tpetra::SrcDistObject &source)
Compare the source and target (this) objects for compatibility.
little_vec_type getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a view of the degrees of freedom at the given mesh point.
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
bool isNodeLocalElement(LocalOrdinal localIndex) const
Whether the given local index is valid for this Map on the calling process.
virtual bool hasColMap() const
Whether this matrix has a well-defined column map.
bool isFillComplete() const
Returns true if fillComplete() has been called and the graph is in compute mode.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
virtual size_t getNodeNumDiags() const
The number of local diagonal entries, based on global row/column index comparisons.
global_size_t getGlobalNumEntries() const
Returns the global number of entries in the graph.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual GO getIndexBase() const
The index base for global indices in this matrix.
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
size_t global_size_t
Global size_t object.
Insert new values that don&#39;t currently exist.
bool isLocallyIndexed() const
If graph indices are in the local range, this function returns true. Otherwise, this function returns...
std::string description() const
One-line description of this object.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
Teuchos::RCP< const map_type > getDomainMap() const
Returns the Map associated with the domain of this graph.
Teuchos::RCP< node_type > getNode() const
Returns the underlying node.
GlobalOrdinal getIndexBase() const
Returns the index base for global indices for this graph.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
virtual bool isFillComplete() const
Whether fillComplete() has been called.
void update(const Scalar &alpha, const LittleBlockType &X) const
*this := *this + alpha * X.
Nonowning view of a set of degrees of freedom corresponding to a mesh point in a block vector or mult...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
bool isUpperTriangular() const
Whether the graph is locally upper triangular.
Sets up and executes a communication plan for a Tpetra DistObject.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void reorderedGaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
Tpetra::MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node >::mag_type mag_type
Type of a norm result.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
virtual void leftScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
void getLocalDiagCopy(BlockCrsMatrix< Scalar, LO, GO, Node > &diag, const Teuchos::ArrayView< const size_t > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, BlockCrsMatrix< Scalar, LO, GO, Node > &factorizedDiagonal, const int *factorizationPivots, const Scalar omega, const ESweepDirection direction) const
Local Gauss-Seidel solve given a factorized diagonal.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
Replace old value with maximum of magnitudes of old and new values.
global_size_t getGlobalNumCols() const
Returns the number of global columns in the graph.
Abstract base class for objects that can be the source of an Import or Export operation.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the row, using local indices.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
Replace existing values with new values.
Teuchos::RCP< const map_type > getColMap() const
Returns the Map that describes the column distribution in this graph.
Replace old values with zero.
std::string errorMessages() const
The current stream of error messages.
Teuchos::DataAccess getCopyOrView() const
Get whether this has copy (copyOrView = Teuchos::Copy) or view (copyOrView = Teuchos::View) semantics...
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given column indices, in the given row.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries across all rows/columns on all nodes.
void fill(const Scalar &alpha) const
(*this)(i,j) := alpha for all (i,j).
size_t getNodeNumEntries() const
Returns the local number of entries in the graph.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
size_t findLocalIndex(RowInfo rowinfo, LocalOrdinal ind, size_t hint=0) const
Find the column offset corresponding to the given (local) column index.
LocalOrdinal getLocalElement(GlobalOrdinal globalIndex) const
The local index corresponding to the given global index.
virtual void rightScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
virtual Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
A distributed dense vector.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
size_t getNodeNumRows() const
Returns the number of graph rows owned on the calling node.
void assign(const LittleBlockType &X) const
*this := X.
void doExport(const SrcDistObject &source, const Export< LO, GO, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
virtual global_size_t getGlobalNumDiags() const
The number of global diagonal entries, based on global row/column index comparisons.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
BlockMultiVector< Scalar, LO, GO, Node >::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< Node > getNode() const
The Kokkos Node instance.
Declaration of Tpetra::Experimental::BlockCrsMatrix.
size_t getNodeMaxNumRowEntries() const
The maximum number of entries across all rows/columns on this node.
bool isLowerTriangular() const
Whether the graph is locally lower triangular.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
virtual Teuchos::RCP< const Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
Teuchos::RCP< const export_type > getExporter() const
Returns the exporter associated with this graph.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
RowInfo getRowInfo(const size_t myRow) const
Get information about the locally owned row with local index myRow.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const
Returns the current number of entries on this node in the specified global row.