Tpetra parallel linear algebra  Version of the Day
Tpetra_Import_Util2.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_IMPORT_UTIL2_HPP
43 #define TPETRA_IMPORT_UTIL2_HPP
44 
49 
50 #include "Tpetra_ConfigDefs.hpp"
52 #include "Tpetra_Import.hpp"
53 #include "Tpetra_HashTable.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Util.hpp"
56 #include "Tpetra_Distributor.hpp"
57 #include <Teuchos_Array.hpp>
58 #include <utility>
59 
60 // Tpetra::CrsMatrix uses the functions below in its implementation.
61 // To avoid a circular include issue, only include the declarations
62 // for CrsMatrix. We will include the definition after the functions
63 // here have been defined.
65 
66 
67 namespace { // (anonymous)
68 
69  template<class T, class D>
70  Kokkos::View<T*, D, Kokkos::MemoryUnmanaged>
71  getNonconstView (const Teuchos::ArrayView<T>& x)
72  {
73  typedef Kokkos::View<T*, D, Kokkos::MemoryUnmanaged> view_type;
74  typedef typename view_type::size_type size_type;
75  const size_type numEnt = static_cast<size_type> (x.size ());
76  return view_type (x.getRawPtr (), numEnt);
77  }
78 
79  template<class T, class D>
80  Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged>
81  getConstView (const Teuchos::ArrayView<const T>& x)
82  {
83  typedef Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged> view_type;
84  typedef typename view_type::size_type size_type;
85  const size_type numEnt = static_cast<size_type> (x.size ());
86  return view_type (x.getRawPtr (), numEnt);
87  }
88 
89  // For a given Node type, return the associated execution space.
90  template<class NodeType>
91  struct NodeToExecSpace {
92 #ifdef KOKKOS_HAVE_SERIAL
93  typedef Kokkos::Serial execution_space;
94 #else
95  typedef Kokkos::HostSpace::execution_space execution_space;
96 #endif // KOKKOS_HAVE_SERIAL
97  };
98 
99  // Partial specialization for the new (Kokkos refactor) Node types.
100  template<class ExecSpace>
101  struct NodeToExecSpace<Kokkos::Compat::KokkosDeviceWrapperNode<ExecSpace> > {
102  typedef typename ExecSpace::execution_space execution_space;
103  };
104 
105  // For a given Kokkos (execution or memory) space, return both its
106  // execution space, and the corresponding host execution space.
107  template<class Space>
108  struct GetHostExecSpace {
109  typedef typename Space::execution_space execution_space;
110  typedef typename Kokkos::View<int*, execution_space>::HostMirror::execution_space host_execution_space;
111  };
112 
113 } // namespace (anonymous)
114 
115 namespace Tpetra {
116 namespace Import_Util {
117 
132 template<typename Scalar,
133  typename LocalOrdinal,
134  typename GlobalOrdinal,
135  typename Node>
136 void
137 packAndPrepareWithOwningPIDs (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
138  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
139  Teuchos::Array<char>& exports,
140  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
141  size_t& constantNumPackets,
142  Distributor &distor,
143  const Teuchos::ArrayView<const int>& SourcePids);
144 
160 template<typename Scalar,
161  typename LocalOrdinal,
162  typename GlobalOrdinal,
163  typename Node>
164 size_t
165 unpackAndCombineWithOwningPIDsCount (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
166  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
167  const Teuchos::ArrayView<const char> &imports,
168  const Teuchos::ArrayView<size_t> &numPacketsPerLID,
169  size_t constantNumPackets,
170  Distributor &distor,
171  CombineMode combineMode,
172  size_t numSameIDs,
173  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
174  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs);
175 
190 template<typename Scalar,
191  typename LocalOrdinal,
192  typename GlobalOrdinal,
193  typename Node>
194 void
195 unpackAndCombineIntoCrsArrays (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
196  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
197  const Teuchos::ArrayView<const char>& imports,
198  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
199  size_t constantNumPackets,
200  Distributor &distor,
201  CombineMode combineMode,
202  size_t numSameIDs,
203  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
204  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
205  size_t TargetNumRows,
206  size_t TargetNumNonzeros,
207  int MyTargetPID,
208  const Teuchos::ArrayView<size_t>& rowPointers,
209  const Teuchos::ArrayView<GlobalOrdinal>& columnIndices,
210  const Teuchos::ArrayView<Scalar>& values,
211  const Teuchos::ArrayView<const int>& SourcePids,
212  Teuchos::Array<int>& TargetPids);
213 
216 template<typename Scalar, typename Ordinal>
217 void
218 sortCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
219  const Teuchos::ArrayView<Ordinal>& CRS_colind,
220  const Teuchos::ArrayView<Scalar>&CRS_vals);
221 
226 template<typename Scalar, typename Ordinal>
227 void
228 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
229  const Teuchos::ArrayView<Ordinal>& CRS_colind,
230  const Teuchos::ArrayView<Scalar>& CRS_vals);
231 
247 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
248 void
249 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowPointers,
250  const Teuchos::ArrayView<LocalOrdinal> &columnIndices_LID,
251  const Teuchos::ArrayView<GlobalOrdinal> &columnIndices_GID,
252  const Tpetra::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & domainMap,
253  const Teuchos::ArrayView<const int> &owningPids,
254  Teuchos::Array<int> &remotePids,
255  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap);
256 
257 
258 } // namespace Import_Util
259 } // namespace Tpetra
260 
261 
262 //
263 // Implementations
264 //
265 
266 namespace { // (anonymous)
267 
284  template<class LO, class GO, class D>
285  size_t
286  packRowCount (const size_t numEnt,
287  const size_t numBytesPerValue)
288  {
290 
291  if (numEnt == 0) {
292  // Empty rows always take zero bytes, to ensure sparsity.
293  return 0;
294  }
295  else {
296  // We store the number of entries as a local index (LO).
297  LO numEntLO = 0; // packValueCount wants this.
298  GO gid;
299  int lid;
300  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
301  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
302  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (lid);
303  const size_t valsLen = numEnt * numBytesPerValue;
304  return numEntLen + gidsLen + pidsLen + valsLen;
305  }
306  }
307 
308  template<class LO, class D>
309  size_t
310  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
311  const size_t offset,
312  const size_t numBytes,
313  const size_t numBytesPerValue)
314  {
315  using Kokkos::subview;
317  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
318  typedef typename input_buffer_type::size_type size_type;
319 
320  if (numBytes == 0) {
321  // Empty rows always take zero bytes, to ensure sparsity.
322  return static_cast<size_t> (0);
323  }
324  else {
325  LO numEntLO = 0;
326  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
327 #ifdef HAVE_TPETRA_DEBUG
328  TEUCHOS_TEST_FOR_EXCEPTION(
329  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
330  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
331  << ".");
332 #endif // HAVE_TPETRA_DEBUG
333  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
334  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
335  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
336  (void)actualNumBytes;
337 #ifdef HAVE_TPETRA_DEBUG
338  TEUCHOS_TEST_FOR_EXCEPTION(
339  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
340  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
341  << ".");
342 #endif // HAVE_TPETRA_DEBUG
343  return static_cast<size_t> (numEntLO);
344  }
345  }
346 
347  // Return the number of bytes packed.
348  template<class ST, class LO, class GO, class D>
349  size_t
350  packRow (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
351  const size_t offset,
352  const size_t numEnt,
356  const size_t numBytesPerValue)
357  {
358  using Kokkos::subview;
360  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
361  // the same, no matter what type we're packing. It's a reasonable
362  // assumption, given that we go through the trouble of PackTraits
363  // just so that we can pack data of different types in the same
364  // buffer.
365  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
366  typedef typename output_buffer_type::size_type size_type;
367  typedef typename std::pair<size_type, size_type> pair_type;
368 
369  if (numEnt == 0) {
370  // Empty rows always take zero bytes, to ensure sparsity.
371  return 0;
372  }
373 
374  const GO gid = 0; // packValueCount wants this
375  const LO numEntLO = static_cast<size_t> (numEnt);
376  const int pid = 0; // packValueCount wants this
377 
378  const size_t numEntBeg = offset;
379  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
380  const size_t gidsBeg = numEntBeg + numEntLen;
381  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
382  const size_t pidsBeg = gidsBeg + gidsLen;
383  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
384  const size_t valsBeg = pidsBeg + pidsLen;
385  const size_t valsLen = numEnt * numBytesPerValue;
386 
387  output_buffer_type numEntOut =
388  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
389  output_buffer_type gidsOut =
390  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
391  output_buffer_type pidsOut =
392  subview (exports, pair_type (pidsBeg, pidsBeg + pidsLen));
393  output_buffer_type valsOut =
394  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
395 
396  size_t numBytesOut = 0;
397  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
398  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
399  numBytesOut += PackTraits<int, D>::packArray (pidsOut, pidsIn, numEnt);
400  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numEnt);
401 
402  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
403  TEUCHOS_TEST_FOR_EXCEPTION(
404  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
405  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
406  << expectedNumBytes << ".");
407 
408  return numBytesOut;
409  }
410 
411  // Return the number of bytes actually read / used.
412  template<class ST, class LO, class GO, class D>
413  size_t
414  unpackRow (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
418  const size_t offset,
419  const size_t numBytes,
420  const size_t numEnt,
421  const size_t numBytesPerValue)
422  {
423  using Kokkos::subview;
425  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
426  // the same, no matter what type we're unpacking. It's a
427  // reasonable assumption, given that we go through the trouble of
428  // PackTraits just so that we can pack data of different types in
429  // the same buffer.
430  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
431  typedef typename input_buffer_type::size_type size_type;
432  typedef typename std::pair<size_type, size_type> pair_type;
433 
434  if (numBytes == 0) {
435  // Rows with zero bytes always have zero entries.
436  return 0;
437  }
438  TEUCHOS_TEST_FOR_EXCEPTION(
439  static_cast<size_t> (imports.dimension_0 ()) <= offset, std::logic_error,
440  "unpackRow: imports.dimension_0() = " << imports.dimension_0 () <<
441  " <= offset = " << offset << ".");
442  TEUCHOS_TEST_FOR_EXCEPTION(
443  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
444  std::logic_error, "unpackRow: imports.dimension_0() = "
445  << imports.dimension_0 () << " < offset + numBytes = "
446  << (offset + numBytes) << ".");
447 
448  const GO gid = 0; // packValueCount wants this
449  const LO lid = 0; // packValueCount wants this
450  const int pid = 0; // packValueCount wants this
451 
452  const size_t numEntBeg = offset;
453  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
454  const size_t gidsBeg = numEntBeg + numEntLen;
455  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
456  const size_t pidsBeg = gidsBeg + gidsLen;
457  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
458  const size_t valsBeg = pidsBeg + pidsLen;
459  const size_t valsLen = numEnt * numBytesPerValue;
460 
461  input_buffer_type numEntIn = subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
462  input_buffer_type gidsIn = subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
463  input_buffer_type pidsIn = subview (imports, pair_type (pidsBeg, pidsBeg + pidsLen));
464  input_buffer_type valsIn = subview (imports, pair_type (valsBeg, valsBeg + valsLen));
465 
466  size_t numBytesOut = 0;
467  LO numEntOut;
468  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
469  TEUCHOS_TEST_FOR_EXCEPTION(
470  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
471  "unpackRow: Expected number of entries " << numEnt
472  << " != actual number of entries " << numEntOut << ".");
473 
474  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
475  numBytesOut += PackTraits<int, D>::unpackArray (pidsOut, pidsIn, numEnt);
476  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numEnt);
477 
478  TEUCHOS_TEST_FOR_EXCEPTION(
479  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
480  << numBytesOut << " != numBytes = " << numBytes << ".");
481  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
482  TEUCHOS_TEST_FOR_EXCEPTION(
483  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
484  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
485  << expectedNumBytes << ".");
486  return numBytesOut;
487  }
488 
489 } // namespace (anonymous)
490 
491 
492 namespace Tpetra {
493 namespace Import_Util {
494 
495 template<typename Scalar,
496  typename LocalOrdinal,
497  typename GlobalOrdinal,
498  typename Node>
499 void
501  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
502  Teuchos::Array<char>& exports,
503  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
504  size_t& constantNumPackets,
505  Distributor &distor,
506  const Teuchos::ArrayView<const int>& SourcePids)
507 {
509  using Kokkos::MemoryUnmanaged;
510  using Kokkos::subview;
511  using Kokkos::View;
512  using Teuchos::Array;
513  using Teuchos::ArrayView;
514  using Teuchos::as;
515  using Teuchos::RCP;
516  typedef LocalOrdinal LO;
517  typedef GlobalOrdinal GO;
519  typedef typename matrix_type::impl_scalar_type ST;
520  typedef typename NodeToExecSpace<Node>::execution_space execution_space;
521  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
522  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
523  typedef typename ArrayView<const LO>::size_type size_type;
524  typedef std::pair<typename View<int*, HES>::size_type,
525  typename View<int*, HES>::size_type> pair_type;
526  const char prefix[] = "Tpetra::Import_Util::packAndPrepareWithOwningPIDs: ";
527 
528  // FIXME (mfh 03 Jan 2015) Currently, it might be the case that if a
529  // graph or matrix owns no entries on a process, then it reports
530  // that it is neither locally nor globally indexed, even if the
531  // graph or matrix has a column Map. This should change once we get
532  // rid of lazy initialization in CrsGraph and CrsMatrix.
533  TEUCHOS_TEST_FOR_EXCEPTION(
534  ! SourceMatrix.isLocallyIndexed (), std::invalid_argument,
535  prefix << "SourceMatrix must be locally indexed.");
536  TEUCHOS_TEST_FOR_EXCEPTION(
537  SourceMatrix.getColMap ().is_null (), std::logic_error,
538  prefix << "The source matrix claims to be locally indexed, but its column "
539  "Map is null. This should never happen. Please report this bug to the "
540  "Tpetra developers.");
541  const size_type numExportLIDs = exportLIDs.size ();
542  TEUCHOS_TEST_FOR_EXCEPTION(
543  numExportLIDs != numPacketsPerLID.size (), std::invalid_argument, prefix
544  << "exportLIDs.size() = " << numExportLIDs << "!= numPacketsPerLID.size() "
545  << " = " << numPacketsPerLID.size () << ".");
546  TEUCHOS_TEST_FOR_EXCEPTION(
547  static_cast<size_t> (SourcePids.size ()) != SourceMatrix.getColMap ()->getNodeNumElements (),
548  std::invalid_argument, prefix << "SourcePids.size() = "
549  << SourcePids.size ()
550  << "!= SourceMatrix.getColMap()->getNodeNumElements() = "
551  << SourceMatrix.getColMap ()->getNodeNumElements () << ".");
552 
553  // This tells the caller that different rows may have different
554  // numbers of entries. That is, the number of packets per LID might
555  // not be a constant.
556  constantNumPackets = 0;
557 
558  // Compute the number of bytes ("packets") per row to pack. While
559  // we're at it, compute the total number of matrix entries to send,
560  // and the max number of entries in any of the rows we're sending.
561  size_t totalNumBytes = 0;
562  size_t totalNumEntries = 0;
563  size_t maxRowLength = 0;
564  for (size_type i = 0; i < numExportLIDs; ++i) {
565  const LO lclRow = exportLIDs[i];
566  const size_t numEnt = SourceMatrix.getNumEntriesInLocalRow (lclRow);
567 
568  // The 'if' branch implicitly assumes that packRowCount() returns
569  // zero if numEnt == 0.
570  size_t numBytesPerValue = 0;
571  if (numEnt > 0) {
572  // Get a locally indexed view of the current row's data. If the
573  // current row has > 0 entries, we need an entry in order to
574  // figure out the byte count of the packed row. (We really only
575  // need it if ST's size is determined at run time.)
576  ArrayView<const Scalar> valsView;
577  ArrayView<const LO> lidsView;
578  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
579  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
580  View<const ST*, HES, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
581  TEUCHOS_TEST_FOR_EXCEPTION(
582  static_cast<size_t> (valsViewK.dimension_0 ()) != numEnt,
583  std::logic_error, prefix << "Local row " << i << " claims to have "
584  << numEnt << "entry/ies, but the View returned by getLocalRowView() "
585  "has " << valsViewK.dimension_0 () << " entry/ies. This should never "
586  "happen. Please report this bug to the Tpetra developers.");
587 
588  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
589  // space here for now, this doesn't assume UVM. That may change
590  // in the future, if we ever start packing on the device.
591  numBytesPerValue = PackTraits<ST, HES>::packValueCount (valsViewK(0));
592  }
593 
594  const size_t numBytes =
595  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue);
596  numPacketsPerLID[i] = numBytes;
597  totalNumBytes += numBytes;
598  totalNumEntries += numEnt;
599  maxRowLength = std::max (maxRowLength, numEnt);
600  }
601 
602  // We use a "struct of arrays" approach to packing each row's
603  // entries. All the column indices (as global indices) go first,
604  // then all their owning process ranks, and then the values.
605  if (totalNumEntries > 0) {
606  exports.resize (totalNumBytes);
607  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
608  totalNumBytes);
609  // Current position (in bytes) in the 'exports' output array.
610  size_t offset = 0;
611 
612  // For each row of the matrix owned by the calling process, pack
613  // that row's column indices and values into the exports array.
614 
615  // Locally indexed matrices always have a column Map.
616  const map_type& colMap = * (SourceMatrix.getColMap ());
617 
618  // Temporary buffers for a copy of the column gids/pids
619  View<GO*, HES> gids;
620  View<int*, HES> pids;
621  {
622  GO gid;
623  int pid;
624  gids = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
625  pids = PackTraits<int, HES>::allocateArray (pid, maxRowLength, "pids");
626  }
627 
628  for (size_type i = 0; i < numExportLIDs; i++) {
629  const LO lclRow = exportLIDs[i];
630 
631  // Get a locally indexed view of the current row's data.
632  ArrayView<const Scalar> valsView;
633  ArrayView<const LO> lidsView;
634  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
635  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
636  View<const ST*, HES, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
637  const size_t numEnt = static_cast<size_t> (valsViewK.dimension_0 ());
638 
639  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
640  // space here for now, this doesn't assume UVM. That may change
641  // in the future, if we ever start packing on the device.
642  const size_t numBytesPerValue = numEnt == 0 ?
643  static_cast<size_t> (0) :
644  PackTraits<ST, HES>::packValueCount (valsViewK(0));
645 
646  // Convert column indices as LIDs to column indices as GIDs.
647  View<GO*, HES> gidsView = subview (gids, pair_type (0, numEnt));
648  View<int*, HES> pidsView = subview (pids, pair_type (0, numEnt));
649  for (size_t k = 0; k < numEnt; ++k) {
650  gidsView(k) = colMap.getGlobalElement (lidsView[k]);
651  pidsView(k) = SourcePids[lidsView[k]];
652  }
653 
654  // Copy the row's data into the current spot in the exports array.
655  const size_t numBytes =
656  packRow<ST, LO, GO, HES> (exportsK, offset, numEnt,
657  gidsView, pidsView, valsViewK,
658  numBytesPerValue);
659  // Keep track of how many bytes we packed.
660  offset += numBytes;
661  }
662 
663 #ifdef HAVE_TPETRA_DEBUG
664  TEUCHOS_TEST_FOR_EXCEPTION(
665  offset != totalNumBytes, std::logic_error, prefix << "At end of method, "
666  "the final offset (in bytes) " << offset << " does not equal the total "
667  "number of bytes packed " << totalNumBytes << ". Please report this bug "
668  "to the Tpetra developers.");
669 #endif // HAVE_TPETRA_DEBUG
670  }
671 }
672 
673 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
674 size_t
676  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
677  const Teuchos::ArrayView<const char> &imports,
678  const Teuchos::ArrayView<size_t> &numPacketsPerLID,
679  size_t constantNumPackets,
680  Distributor &distor,
681  CombineMode combineMode,
682  size_t numSameIDs,
683  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
684  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
685 {
686  using Kokkos::MemoryUnmanaged;
687  using Kokkos::View;
688  typedef LocalOrdinal LO;
689  typedef GlobalOrdinal GO;
690  typedef CrsMatrix<Scalar, LO, GO, Node> matrix_type;
691  typedef typename matrix_type::impl_scalar_type ST;
692  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
693  typedef typename NodeToExecSpace<Node>::execution_space execution_space;
694  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
695  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
696 
697  TEUCHOS_TEST_FOR_EXCEPTION(
698  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
699  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
700  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
701  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
702  // process, then the matrix is neither locally nor globally indexed.
703  const bool locallyIndexed = SourceMatrix.isLocallyIndexed ();
704  TEUCHOS_TEST_FOR_EXCEPTION(
705  ! locallyIndexed, std::invalid_argument, prefix << "The input CrsMatrix "
706  "'SourceMatrix' must be locally indexed.");
707  TEUCHOS_TEST_FOR_EXCEPTION(
708  importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
709  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
710  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
711 
712  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
713  imports.size ());
714 
715  // Number of matrix entries to unpack (returned by this function).
716  size_t nnz = 0;
717 
718  // Count entries copied directly from the source matrix without permuting.
719  for (size_t sourceLID = 0; sourceLID < numSameIDs; ++sourceLID) {
720  const LO srcLID = static_cast<LO> (sourceLID);
721  nnz += SourceMatrix.getNumEntriesInLocalRow (srcLID);
722  }
723 
724  // Count entries copied directly from the source matrix with permuting.
725  const size_type numPermuteToLIDs = permuteToLIDs.size ();
726  for (size_type p = 0; p < numPermuteToLIDs; ++p) {
727  nnz += SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[p]);
728  }
729 
730  // Count entries received from other MPI processes.
731  size_t offset = 0;
732  const size_type numImportLIDs = importLIDs.size ();
733  for (size_type i = 0; i < numImportLIDs; ++i) {
734  const size_t numBytes = numPacketsPerLID[i];
735  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
736  // values, if it has one) for the (possibly run-time-depenendent)
737  // number of bytes of one of its entries.
738  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
739  numBytes, sizeof (ST));
740  nnz += numEnt;
741  offset += numBytes;
742  }
743  return nnz;
744 }
745 
746 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
747 void
749  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
750  const Teuchos::ArrayView<const char>& imports,
751  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
752  const size_t constantNumPackets,
753  Distributor& distor,
754  const CombineMode combineMode,
755  const size_t numSameIDs,
756  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
757  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
758  size_t TargetNumRows,
759  size_t TargetNumNonzeros,
760  const int MyTargetPID,
761  const Teuchos::ArrayView<size_t>& CSR_rowptr,
762  const Teuchos::ArrayView<GlobalOrdinal>& CSR_colind,
763  const Teuchos::ArrayView<Scalar>& CSR_vals,
764  const Teuchos::ArrayView<const int>& SourcePids,
765  Teuchos::Array<int>& TargetPids)
766 {
768  using Kokkos::MemoryUnmanaged;
769  using Kokkos::subview;
770  using Kokkos::View;
771  using Teuchos::ArrayView;
772  using Teuchos::as;
773  using Teuchos::av_reinterpret_cast;
774  using Teuchos::outArg;
775  using Teuchos::REDUCE_MAX;
776  using Teuchos::reduceAll;
777  typedef LocalOrdinal LO;
778  typedef GlobalOrdinal GO;
779  typedef typename NodeToExecSpace<Node>::execution_space execution_space;
780  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
782  typedef typename matrix_type::impl_scalar_type ST;
783  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
784  typedef typename ArrayView<const LO>::size_type size_type;
785  //typedef std::pair<typename Kokkos::View<int*, HES>::size_type,
786  // typename Kokkos::View<int*, HES>::size_type> pair_type;
787  const char prefix[] = "Tpetra::Import_Util::unpackAndCombineIntoCrsArrays: ";
788 
789  const size_t N = TargetNumRows;
790  const size_t mynnz = TargetNumNonzeros;
791  // In the case of reduced communicators, the SourceMatrix won't have
792  // the right "MyPID", so thus we have to supply it.
793  const int MyPID = MyTargetPID;
794 
795  TEUCHOS_TEST_FOR_EXCEPTION(
796  TargetNumRows + 1 != static_cast<size_t> (CSR_rowptr.size ()),
797  std::invalid_argument, prefix << "CSR_rowptr.size() = " <<
798  CSR_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
799  TEUCHOS_TEST_FOR_EXCEPTION(
800  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
801  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
802  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
803  const size_type numImportLIDs = importLIDs.size ();
804  TEUCHOS_TEST_FOR_EXCEPTION(
805  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
806  prefix << "importLIDs.size() = " << numImportLIDs << " != "
807  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
808 
809  // Kokkos::View of the input buffer of bytes to unpack.
810  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
811  imports.size ());
812  // Zero the rowptr
813  for (size_t i = 0; i< N+1; ++i) {
814  CSR_rowptr[i] = 0;
815  }
816 
817  // SameIDs: Always first, always in the same place
818  for (size_t i = 0; i < numSameIDs; ++i) {
819  CSR_rowptr[i] = SourceMatrix.getNumEntriesInLocalRow (static_cast<LO> (i));
820  }
821 
822  // PermuteIDs: Still local, but reordered
823  size_t numPermuteIDs = permuteToLIDs.size ();
824  for (size_t i = 0; i < numPermuteIDs; ++i) {
825  CSR_rowptr[permuteToLIDs[i]] =
826  SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[i]);
827  }
828 
829  // Setup CSR_rowptr for remotes
830  {
831  size_t offset = 0;
832  for (size_type k = 0; k < numImportLIDs; ++k) {
833  const size_t numBytes = numPacketsPerLID[k];
834  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
835  // values, if it has one) for the (possibly run-time -
836  // depenendent) number of bytes of one of its entries.
837  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
838  numBytes, sizeof (ST));
839  CSR_rowptr[importLIDs[k]] += numEnt;
840  offset += numBytes;
841  }
842  }
843 
844  // If multiple processes contribute to the same row, we may need to
845  // update row offsets. This tracks that.
846  Teuchos::Array<size_t> NewStartRow (N + 1);
847 
848  // Turn row length into a real CSR_rowptr
849  size_t last_len = CSR_rowptr[0];
850  CSR_rowptr[0] = 0;
851  for (size_t i = 1; i < N+1; ++i) {
852  size_t new_len = CSR_rowptr[i];
853  CSR_rowptr[i] = last_len + CSR_rowptr[i-1];
854  NewStartRow[i] = CSR_rowptr[i];
855  last_len = new_len;
856  }
857 
858  TEUCHOS_TEST_FOR_EXCEPTION(
859  CSR_rowptr[N] != mynnz, std::invalid_argument, prefix << "CSR_rowptr[last]"
860  " = " << CSR_rowptr[N] << "!= mynnz = " << mynnz << ".");
861 
862  // Preseed TargetPids with -1 for local
863  if (static_cast<size_t> (TargetPids.size ()) != mynnz) {
864  TargetPids.resize (mynnz);
865  }
866  TargetPids.assign (mynnz, -1);
867 
868  // Grab pointers for SourceMatrix
869  ArrayRCP<const size_t> Source_rowptr_RCP;
870  ArrayRCP<const LO> Source_colind_RCP;
871  ArrayRCP<const Scalar> Source_vals_RCP;
872  SourceMatrix.getAllValues (Source_rowptr_RCP, Source_colind_RCP, Source_vals_RCP);
873  ArrayView<const size_t> Source_rowptr = Source_rowptr_RCP ();
874  ArrayView<const LO> Source_colind = Source_colind_RCP ();
875  ArrayView<const Scalar> Source_vals = Source_vals_RCP ();
876 
877  const map_type& sourceColMap = * (SourceMatrix.getColMap());
878  ArrayView<const GO> globalColElts = sourceColMap.getNodeElementList();
879 
880  // SameIDs: Copy the data over
881  for (size_t i = 0; i < numSameIDs; ++i) {
882  size_t FromRow = Source_rowptr[i];
883  size_t ToRow = CSR_rowptr[i];
884  NewStartRow[i] += Source_rowptr[i+1] - Source_rowptr[i];
885 
886  for (size_t j = Source_rowptr[i]; j < Source_rowptr[i+1]; ++j) {
887  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
888  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
889  TargetPids[ToRow + j - FromRow] =
890  (SourcePids[Source_colind[j]] != MyPID) ?
891  SourcePids[Source_colind[j]] : -1;
892  }
893  }
894 
895  size_t numBytesPerValue = 0;
896  if (PackTraits<ST, HES>::compileTimeSize) {
897  ST val; // assume that ST is default constructible
898  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
899  }
900  else {
901  // Since the packed data come from the source matrix, we can use
902  // the source matrix to get the number of bytes per Scalar value
903  // stored in the matrix. This assumes that all Scalar values in
904  // the source matrix require the same number of bytes. If the
905  // source matrix has no entries on the calling process, then we
906  // have to ask the target matrix (via the output CSR arrays). If
907  // the target matrix has no entries on input on the calling
908  // process, then hope that some process does have some idea how
909  // big a Scalar value is. Of course, if no processes have any
910  // entries, then no values should be packed (though this does
911  // assume that in our packing scheme, rows with zero entries take
912  // zero bytes).
913  if (Source_rowptr.size () == 0 || Source_rowptr[Source_rowptr.size () - 1] == 0) {
914  numBytesPerValue = PackTraits<ST, HES>::packValueCount (CSR_vals[0]);
915  }
916  else {
917  numBytesPerValue = PackTraits<ST, HES>::packValueCount (Source_vals[0]);
918  }
919  size_t lclNumBytesPerValue = numBytesPerValue;
920  Teuchos::RCP<const Teuchos::Comm<int> > comm = SourceMatrix.getComm ();
921  reduceAll<int, size_t> (*comm, REDUCE_MAX, lclNumBytesPerValue,
922  outArg (numBytesPerValue));
923  }
924 
925  // PermuteIDs: Copy the data over
926  for (size_t i = 0; i < numPermuteIDs; ++i) {
927  LO FromLID = permuteFromLIDs[i];
928  size_t FromRow = Source_rowptr[FromLID];
929  size_t ToRow = CSR_rowptr[permuteToLIDs[i]];
930 
931  NewStartRow[permuteToLIDs[i]] += Source_rowptr[FromLID+1]-Source_rowptr[FromLID];
932 
933  for (size_t j = Source_rowptr[FromLID]; j < Source_rowptr[FromLID+1]; ++j) {
934  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
935  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
936  TargetPids[ToRow + j - FromRow] =
937  (SourcePids[Source_colind[j]] != MyPID) ?
938  SourcePids[Source_colind[j]] : -1;
939  }
940  }
941 
942  // RemoteIDs: Loop structure following UnpackAndCombine
943  if (imports.size () > 0) {
944  size_t offset = 0;
945 #ifdef HAVE_TPETRA_DEBUG
946  int lclErr = 0;
947 #endif
948 
949  for (size_t i = 0; i < static_cast<size_t> (numImportLIDs); ++i) {
950  const size_t numBytes = numPacketsPerLID[i];
951  if (numBytes == 0) {
952  // Empty buffer for that row means that the row is empty.
953  continue;
954  }
955  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
956  // values, if it has one) for the (possibly run-time -
957  // depenendent) number of bytes of one of its entries.
958  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset, numBytes,
959  numBytesPerValue);
960  const LO lclRow = importLIDs[i];
961  const size_t StartRow = NewStartRow[lclRow];
962  NewStartRow[lclRow] += numEnt;
963 
964  View<GO*, HES, MemoryUnmanaged> gidsOut =
965  getNonconstView<GO, HES> (CSR_colind (StartRow, numEnt));
966  View<int*, HES, MemoryUnmanaged> pidsOut =
967  getNonconstView<int, HES> (TargetPids (StartRow, numEnt));
968  ArrayView<Scalar> valsOutS = CSR_vals (StartRow, numEnt);
969  View<ST*, HES, MemoryUnmanaged> valsOut =
970  getNonconstView<ST, HES> (av_reinterpret_cast<ST> (valsOutS));
971 
972  const size_t numBytesOut =
973  unpackRow<ST, LO, GO, HES> (gidsOut, pidsOut, valsOut, importsK,
974  offset, numBytes, numEnt, numBytesPerValue);
975  if (numBytesOut != numBytes) {
976 #ifdef HAVE_TPETRA_DEBUG
977  lclErr = 1;
978 #endif
979  break;
980  }
981 
982  // Correct target PIDs.
983  for (size_t j = 0; j < numEnt; ++j) {
984  const int pid = pidsOut[j];
985  pidsOut[j] = (pid != MyPID) ? pid : -1;
986  }
987  offset += numBytes;
988  }
989 #ifdef HAVE_TPETRA_DEBUG
990  TEUCHOS_TEST_FOR_EXCEPTION(
991  offset != static_cast<size_t> (imports.size ()), std::logic_error, prefix
992  << "After unpacking and counting all the import packets, the final offset"
993  " in bytes " << offset << " != imports.size() = " << imports.size () <<
994  ". Please report this bug to the Tpetra developers.");
995  TEUCHOS_TEST_FOR_EXCEPTION(
996  lclErr != 0, std::logic_error, prefix << "numBytes != numBytesOut "
997  "somewhere in unpack loop. This should never happen. "
998  "Please report this bug to the Tpetra developers.");
999 #endif // HAVE_TPETRA_DEBUG
1000  }
1001 }
1002 
1003 // Note: This should get merged with the other Tpetra sort routines eventually.
1004 template<typename Scalar, typename Ordinal>
1005 void
1006 sortCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1007  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1008  const Teuchos::ArrayView<Scalar> &CRS_vals)
1009 {
1010  // For each row, sort column entries from smallest to largest.
1011  // Use shell sort. Stable sort so it is fast if indices are already sorted.
1012  // Code copied from Epetra_CrsMatrix::SortEntries()
1013  size_t NumRows = CRS_rowptr.size()-1;
1014  size_t nnz = CRS_colind.size();
1015 
1016  for(size_t i = 0; i < NumRows; i++){
1017  size_t start=CRS_rowptr[i];
1018  if(start >= nnz) continue;
1019 
1020  Scalar* locValues = &CRS_vals[start];
1021  size_t NumEntries = CRS_rowptr[i+1] - start;
1022  Ordinal* locIndices = &CRS_colind[start];
1023 
1024  Ordinal n = NumEntries;
1025  Ordinal m = n/2;
1026 
1027  while(m > 0) {
1028  Ordinal max = n - m;
1029  for(Ordinal j = 0; j < max; j++) {
1030  for(Ordinal k = j; k >= 0; k-=m) {
1031  if(locIndices[k+m] >= locIndices[k])
1032  break;
1033  Scalar dtemp = locValues[k+m];
1034  locValues[k+m] = locValues[k];
1035  locValues[k] = dtemp;
1036  Ordinal itemp = locIndices[k+m];
1037  locIndices[k+m] = locIndices[k];
1038  locIndices[k] = itemp;
1039  }
1040  }
1041  m = m/2;
1042  }
1043  }
1044 }
1045 
1046 // Note: This should get merged with the other Tpetra sort routines eventually.
1047 template<typename Scalar, typename Ordinal>
1048 void
1049 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1050  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1051  const Teuchos::ArrayView<Scalar> &CRS_vals)
1052 {
1053  // For each row, sort column entries from smallest to largest,
1054  // merging column ids that are identify by adding values. Use shell
1055  // sort. Stable sort so it is fast if indices are already sorted.
1056  // Code copied from Epetra_CrsMatrix::SortEntries()
1057 
1058  size_t NumRows = CRS_rowptr.size()-1;
1059  size_t nnz = CRS_colind.size();
1060  size_t new_curr=CRS_rowptr[0], old_curr=CRS_rowptr[0];
1061 
1062  for(size_t i = 0; i < NumRows; i++){
1063  size_t start=CRS_rowptr[i];
1064  if(start >= nnz) continue;
1065 
1066  Scalar* locValues = &CRS_vals[start];
1067  size_t NumEntries = CRS_rowptr[i+1] - start;
1068  Ordinal* locIndices = &CRS_colind[start];
1069 
1070  // Sort phase
1071  Ordinal n = NumEntries;
1072  Ordinal m = n/2;
1073 
1074  while(m > 0) {
1075  Ordinal max = n - m;
1076  for(Ordinal j = 0; j < max; j++) {
1077  for(Ordinal k = j; k >= 0; k-=m) {
1078  if(locIndices[k+m] >= locIndices[k])
1079  break;
1080  Scalar dtemp = locValues[k+m];
1081  locValues[k+m] = locValues[k];
1082  locValues[k] = dtemp;
1083  Ordinal itemp = locIndices[k+m];
1084  locIndices[k+m] = locIndices[k];
1085  locIndices[k] = itemp;
1086  }
1087  }
1088  m = m/2;
1089  }
1090 
1091  // Merge & shrink
1092  for(size_t j=CRS_rowptr[i]; j < CRS_rowptr[i+1]; j++) {
1093  if(j > CRS_rowptr[i] && CRS_colind[j]==CRS_colind[new_curr-1]) {
1094  CRS_vals[new_curr-1] += CRS_vals[j];
1095  }
1096  else if(new_curr==j) {
1097  new_curr++;
1098  }
1099  else {
1100  CRS_colind[new_curr] = CRS_colind[j];
1101  CRS_vals[new_curr] = CRS_vals[j];
1102  new_curr++;
1103  }
1104  }
1105 
1106  CRS_rowptr[i] = old_curr;
1107  old_curr=new_curr;
1108  }
1109 
1110  CRS_rowptr[NumRows] = new_curr;
1111 }
1112 
1113 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1114 void
1115 lowCommunicationMakeColMapAndReindex (const ArrayView<const size_t> &rowptr,
1116  const ArrayView<LocalOrdinal> &colind_LID,
1117  const ArrayView<GlobalOrdinal> &colind_GID,
1118  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMapRCP,
1119  const Teuchos::ArrayView<const int> &owningPIDs,
1120  Teuchos::Array<int> &remotePIDs,
1121  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap)
1122 {
1123  using Teuchos::rcp;
1124  typedef LocalOrdinal LO;
1125  typedef GlobalOrdinal GO;
1126  typedef Tpetra::global_size_t GST;
1127  typedef Tpetra::Map<LO, GO, Node> map_type;
1128  const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
1129 
1130  // The domainMap is an RCP because there is a shortcut for a
1131  // (common) special case to return the columnMap = domainMap.
1132  const map_type& domainMap = *domainMapRCP;
1133 
1134  // Scan all column indices and sort into two groups:
1135  // Local: those whose GID matches a GID of the domain map on this processor and
1136  // Remote: All others.
1137  const size_t numDomainElements = domainMap.getNodeNumElements ();
1138  Teuchos::Array<bool> LocalGIDs;
1139  if (numDomainElements > 0) {
1140  LocalGIDs.resize (numDomainElements, false); // Assume domain GIDs are not local
1141  }
1142 
1143  // In principle it is good to have RemoteGIDs and RemotGIDList be as
1144  // long as the number of remote GIDs on this processor, but this
1145  // would require two passes through the column IDs, so we make it
1146  // the max of 100 and the number of block rows.
1147  //
1148  // FIXME (mfh 11 Feb 2015) Tpetra::Details::HashTable can hold at
1149  // most INT_MAX entries, but it's possible to have more rows than
1150  // that (if size_t is 64 bits and int is 32 bits).
1151  const size_t numMyRows = rowptr.size () - 1;
1152  const int hashsize = std::max (static_cast<int> (numMyRows), 100);
1153 
1154  Tpetra::Details::HashTable<GO, LO> RemoteGIDs (hashsize);
1155  Teuchos::Array<GO> RemoteGIDList;
1156  RemoteGIDList.reserve (hashsize);
1157  Teuchos::Array<int> PIDList;
1158  PIDList.reserve (hashsize);
1159 
1160  // Here we start using the *LocalOrdinal* colind_LID array. This is
1161  // safe even if both columnIndices arrays are actually the same
1162  // (because LocalOrdinal==GO). For *local* GID's set
1163  // colind_LID with with their LID in the domainMap. For *remote*
1164  // GIDs, we set colind_LID with (numDomainElements+NumRemoteColGIDs)
1165  // before the increment of the remote count. These numberings will
1166  // be separate because no local LID is greater than
1167  // numDomainElements.
1168 
1169  size_t NumLocalColGIDs = 0;
1170  LO NumRemoteColGIDs = 0;
1171  for (size_t i = 0; i < numMyRows; ++i) {
1172  for(size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1173  const GO GID = colind_GID[j];
1174  // Check if GID matches a row GID
1175  const LO LID = domainMap.getLocalElement (GID);
1176  if(LID != -1) {
1177  const bool alreadyFound = LocalGIDs[LID];
1178  if (! alreadyFound) {
1179  LocalGIDs[LID] = true; // There is a column in the graph associated with this domain map GID
1180  NumLocalColGIDs++;
1181  }
1182  colind_LID[j] = LID;
1183  }
1184  else {
1185  const LO hash_value = RemoteGIDs.get (GID);
1186  if (hash_value == -1) { // This means its a new remote GID
1187  const int PID = owningPIDs[j];
1188  TEUCHOS_TEST_FOR_EXCEPTION(
1189  PID == -1, std::invalid_argument, prefix << "Cannot figure out if "
1190  "PID is owned.");
1191  colind_LID[j] = static_cast<LO> (numDomainElements + NumRemoteColGIDs);
1192  RemoteGIDs.add (GID, NumRemoteColGIDs);
1193  RemoteGIDList.push_back (GID);
1194  PIDList.push_back (PID);
1195  NumRemoteColGIDs++;
1196  }
1197  else {
1198  colind_LID[j] = static_cast<LO> (numDomainElements + hash_value);
1199  }
1200  }
1201  }
1202  }
1203 
1204  // Possible short-circuit: If all domain map GIDs are present as
1205  // column indices, then set ColMap=domainMap and quit.
1206  if (domainMap.getComm ()->getSize () == 1) {
1207  // Sanity check: When there is only one process, there can be no
1208  // remoteGIDs.
1209  TEUCHOS_TEST_FOR_EXCEPTION(
1210  NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1211  "process in the domain Map's communicator, which means that there are no "
1212  "\"remote\" indices. Nevertheless, some column indices are not in the "
1213  "domain Map.");
1214  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1215  // In this case, we just use the domainMap's indices, which is,
1216  // not coincidently, what we clobbered colind with up above
1217  // anyway. No further reindexing is needed.
1218  colMap = domainMapRCP;
1219  return;
1220  }
1221  }
1222 
1223  // Now build the array containing column GIDs
1224  // Build back end, containing remote GIDs, first
1225  const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1226  Teuchos::Array<GO> ColIndices;
1227  GO* RemoteColIndices = NULL;
1228  if (numMyCols > 0) {
1229  ColIndices.resize (numMyCols);
1230  if (NumLocalColGIDs != static_cast<size_t> (numMyCols)) {
1231  RemoteColIndices = &ColIndices[NumLocalColGIDs]; // Points to back half of ColIndices
1232  }
1233  }
1234 
1235  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1236  RemoteColIndices[i] = RemoteGIDList[i];
1237  }
1238 
1239  // Build permute array for *remote* reindexing.
1240  Teuchos::Array<LO> RemotePermuteIDs (NumRemoteColGIDs);
1241  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1242  RemotePermuteIDs[i]=i;
1243  }
1244 
1245  // Sort External column indices so that all columns coming from a
1246  // given remote processor are contiguous. This is a sort with two
1247  // auxillary arrays: RemoteColIndices and RemotePermuteIDs.
1248  Tpetra::sort3 (PIDList.begin (), PIDList.end (),
1249  ColIndices.begin () + NumLocalColGIDs,
1250  RemotePermuteIDs.begin ());
1251 
1252  // Stash the RemotePIDs.
1253  //
1254  // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
1255  // we'd call it here.
1256  remotePIDs = PIDList;
1257 
1258  // Sort external column indices so that columns from a given remote
1259  // processor are not only contiguous but also in ascending
1260  // order. NOTE: I don't know if the number of externals associated
1261  // with a given remote processor is known at this point ... so I
1262  // count them here.
1263 
1264  // NTS: Only sort the RemoteColIndices this time...
1265  LO StartCurrent = 0, StartNext = 1;
1266  while (StartNext < NumRemoteColGIDs) {
1267  if (PIDList[StartNext]==PIDList[StartNext-1]) {
1268  StartNext++;
1269  }
1270  else {
1271  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1272  ColIndices.begin () + NumLocalColGIDs + StartNext,
1273  RemotePermuteIDs.begin () + StartCurrent);
1274  StartCurrent = StartNext;
1275  StartNext++;
1276  }
1277  }
1278  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1279  ColIndices.begin () + NumLocalColGIDs + StartNext,
1280  RemotePermuteIDs.begin () + StartCurrent);
1281 
1282  // Reverse the permutation to get the information we actually care about
1283  Teuchos::Array<LO> ReverseRemotePermuteIDs (NumRemoteColGIDs);
1284  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1285  ReverseRemotePermuteIDs[RemotePermuteIDs[i]] = i;
1286  }
1287 
1288  // Build permute array for *local* reindexing.
1289  bool use_local_permute = false;
1290  Teuchos::Array<LO> LocalPermuteIDs (numDomainElements);
1291 
1292  // Now fill front end. Two cases:
1293  //
1294  // (1) If the number of Local column GIDs is the same as the number
1295  // of Local domain GIDs, we can simply read the domain GIDs into
1296  // the front part of ColIndices, otherwise
1297  //
1298  // (2) We step through the GIDs of the domainMap, checking to see if
1299  // each domain GID is a column GID. we want to do this to
1300  // maintain a consistent ordering of GIDs between the columns
1301  // and the domain.
1302  Teuchos::ArrayView<const GO> domainGlobalElements = domainMap.getNodeElementList();
1303  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1304  if (NumLocalColGIDs > 0) {
1305  // Load Global Indices into first numMyCols elements column GID list
1306  std::copy (domainGlobalElements.begin (), domainGlobalElements.end (),
1307  ColIndices.begin ());
1308  }
1309  }
1310  else {
1311  LO NumLocalAgain = 0;
1312  use_local_permute = true;
1313  for (size_t i = 0; i < numDomainElements; ++i) {
1314  if (LocalGIDs[i]) {
1315  LocalPermuteIDs[i] = NumLocalAgain;
1316  ColIndices[NumLocalAgain++] = domainGlobalElements[i];
1317  }
1318  }
1319  TEUCHOS_TEST_FOR_EXCEPTION(
1320  static_cast<size_t> (NumLocalAgain) != NumLocalColGIDs,
1321  std::runtime_error, prefix << "Local ID count test failed.");
1322  }
1323 
1324  // Make column Map
1325  const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid ();
1326  colMap = rcp (new map_type (minus_one, ColIndices, domainMap.getIndexBase (),
1327  domainMap.getComm (), domainMap.getNode ()));
1328 
1329  // Low-cost reindex of the matrix
1330  for (size_t i = 0; i < numMyRows; ++i) {
1331  for (size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1332  const LO ID = colind_LID[j];
1333  if (static_cast<size_t> (ID) < numDomainElements) {
1334  if (use_local_permute) {
1335  colind_LID[j] = LocalPermuteIDs[colind_LID[j]];
1336  }
1337  // In the case where use_local_permute==false, we just copy
1338  // the DomainMap's ordering, which it so happens is what we
1339  // put in colind_LID to begin with.
1340  }
1341  else {
1342  colind_LID[j] = NumLocalColGIDs + ReverseRemotePermuteIDs[colind_LID[j]-numDomainElements];
1343  }
1344  }
1345  }
1346 }
1347 
1348 } // namespace Import_Util
1349 } // namespace Tpetra
1350 
1351 // We can include the definitions for Tpetra::CrsMatrix now that the above
1352 // functions have been defined. For ETI, this isn't necessary, so we just
1353 // including the generated hpp
1354 #include "Tpetra_CrsMatrix.hpp"
1355 
1356 #endif // TPETRA_IMPORT_UTIL_HPP
void sortAndMergeCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort and merge the entries of the (raw CSR) matrix by column index within each row.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void sort3(const IT1 &first1, const IT1 &last1, const IT2 &first2, const IT3 &first3)
Sort the first array, and apply the same permutation to the second and third arrays.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Teuchos::RCP< const map_type > getColMap() const
The Map that describes the column distribution in this matrix.
Declaration of the Tpetra::CrsMatrix class.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
void sortCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort the entries of the (raw CSR) matrix by column index within each row.
size_t global_size_t
Global size_t object.
size_t unpackAndCombineWithOwningPIDsCount(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::CrsMatrix::unpackAndCombine that also unpacks owning process ranks...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
void lowCommunicationMakeColMapAndReindex(const Teuchos::ArrayView< const size_t > &rowPointers, const Teuchos::ArrayView< LocalOrdinal > &columnIndices_LID, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices_GID, const Tpetra::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::ArrayView< const int > &owningPids, Teuchos::Array< int > &remotePids, Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap)
lowCommunicationMakeColMapAndReindex
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
void unpackAndCombineIntoCrsArrays(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, int MyTargetPID, const Teuchos::ArrayView< size_t > &rowPointers, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices, const Teuchos::ArrayView< Scalar > &values, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
ValueType get(const KeyType key)
Get the value corresponding to the given key.
Describes a parallel distribution of objects over processes.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
Stand-alone utility functions and macros.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
void packAndPrepareWithOwningPIDs(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &exportLIDs, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t &constantNumPackets, Distributor &distor, const Teuchos::ArrayView< const int > &SourcePids)
Special version of Tpetra::CrsMatrix::packAndPrepare that also packs owning process ranks...