Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
56 #include "Kokkos_Core.hpp"
57 #include <memory>
58 #include <string>
59 
78 
79 namespace Tpetra {
80 
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 // Forward declaration of Distributor
83 class Distributor;
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
91 namespace UnpackAndCombineCrsGraphImpl {
92 
102 template<class Packet, class GO, class Device, class BufferDevice>
103 KOKKOS_FUNCTION int
104 unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
105  const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
106  const Kokkos::View<const Packet*,BufferDevice>& imports,
107  const size_t offset,
108  const size_t num_ent)
109 {
110  using size_type = typename Kokkos::View<GO*,Device>::size_type;
111 
112  if (num_ent == 0) {
113  // Empty rows always take zero bytes, to ensure sparsity.
114  return 0;
115  }
116 
117  // Unpack GIDs
118  for (size_type k=0; k<num_ent; k++)
119  gids_out(k) = imports(offset+k);
120 
121  // Unpack PIDs
122  if (pids_out.size() > 0) {
123  for (size_type k=0; k<num_ent; k++) {
124  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
125  }
126  }
127 
128  return 0;
129 }
130 
141 template<class LocalOrdinal,
142  class Packet,
143  class RowView,
144  class IndicesView,
145  class BufferDevice>
147 
148  using LO = LocalOrdinal;
149  using GO = typename IndicesView::value_type;
150  using packet_type = Packet;
151  using row_ptrs_type = RowView;
152  using indices_type = IndicesView;
153  using buffer_device_type = BufferDevice;
154 
155  using device_type = typename IndicesView::device_type;
156  using execution_space = typename device_type::execution_space;
157 
158  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
159  using offsets_type = Kokkos::View<const size_t*, device_type>;
160  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
161  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
162 
163  using gids_scratch_type = Kokkos::View<GO*, device_type>;
164  using pids_scratch_type = Kokkos::View<int*,device_type>;
165 
166  row_ptrs_type row_ptrs_beg;
167  row_ptrs_type row_ptrs_end;
168  indices_type indices;
169  input_buffer_type imports;
170  num_packets_per_lid_type num_packets_per_lid;
171  import_lids_type import_lids;
172  offsets_type offsets;
173  size_t max_num_ent;
174  bool unpack_pids;
175  Kokkos::Experimental::UniqueToken<execution_space,
176  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
177  gids_scratch_type gids_scratch;
178  pids_scratch_type pids_scratch;
179 
180  public:
181  using value_type = Kokkos::pair<int, LO>;
182 
184  const row_ptrs_type& row_ptrs_beg_in,
185  const row_ptrs_type& row_ptrs_end_in,
186  const indices_type& indices_in,
187  const input_buffer_type& imports_in,
188  const num_packets_per_lid_type& num_packets_per_lid_in,
189  const import_lids_type& import_lids_in,
190  const offsets_type& offsets_in,
191  const size_t max_num_ent_in,
192  const bool unpack_pids_in) :
193  row_ptrs_beg(row_ptrs_beg_in),
194  row_ptrs_end(row_ptrs_end_in),
195  indices(indices_in),
196  imports(imports_in),
197  num_packets_per_lid(num_packets_per_lid_in),
198  import_lids(import_lids_in),
199  offsets(offsets_in),
200  max_num_ent(max_num_ent_in),
201  unpack_pids(unpack_pids_in),
202  tokens(execution_space()),
203  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
204  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
205  {}
206 
207  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
208  {
209  using Tpetra::Details::OrdinalTraits;
210  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
211  }
212 
213  KOKKOS_INLINE_FUNCTION void
214  join(volatile value_type& dst, const volatile value_type& src) const
215  {
216  // `dst` should reflect the first (least) bad index and
217  // all other associated error codes and data. Thus, we need only
218  // check if the `src` object shows an error and if its associated
219  // bad index is less than `dst`'s bad index.
220  using Tpetra::Details::OrdinalTraits;
221  if (src.second != OrdinalTraits<LO>::invalid()) {
222  // An error in the src; check if
223  // 1. `dst` shows errors
224  // 2. If `dst` does show errors, if src's bad index is less than
225  // *this' bad index
226  if (dst.second == OrdinalTraits<LO>::invalid() ||
227  src.second < dst.second) {
228  dst = src;
229  }
230  }
231  }
232 
233  KOKKOS_INLINE_FUNCTION
234  void operator()(const LO i, value_type& dst) const
235  {
236  using Kokkos::View;
237  using Kokkos::subview;
238  using Kokkos::MemoryUnmanaged;
239  using size_type = typename execution_space::size_type;
240  using slice = typename Kokkos::pair<size_type, size_type>;
241 
242  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
243  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
244 
245  const size_t num_packets_this_lid = num_packets_per_lid(i);
246  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
247  : num_packets_this_lid;
248  if (unpack_pids && num_packets_this_lid%2 != 0) {
249  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
250  // should never happen.
251  dst = Kokkos::make_pair(1, i);
252  return;
253  }
254 
255  // Only unpack data if there is a nonzero number to unpack
256  if (num_ent == 0) {
257  return;
258  }
259 
260  // there is actually something in the row
261  const size_t buf_size = imports.size();
262  const size_t offset = offsets(i);
263 
264  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
265  dst = Kokkos::make_pair(2, i); // out of bounds
266  return;
267  }
268 
269  // Get subviews in to the scratch arrays. The token returned from acquire
270  // is an integer in [0, tokens.size()). It is used to grab a unique (to
271  // this thread) subview of the scratch arrays.
272  const size_type token = tokens.acquire();
273  const size_t a = static_cast<size_t>(token) * max_num_ent;
274  const size_t b = a + num_ent;
275  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
276  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
277 
278  const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
279 
280  if (err != 0) {
281  dst = Kokkos::make_pair(3, i);
282  tokens.release(token);
283  return;
284  }
285 
286  auto import_lid = import_lids(i);
287  for (size_t k = 0; k < num_ent; ++k) {
288  indices(row_ptrs_end(import_lid)) = gids_out(k);
289  // this is OK; don't need atomic, since LIDs to pack don't have repeats.
290  row_ptrs_end(import_lid) += 1;
291  }
292 
293  tokens.release(token);
294  }
295 
296 };
297 
298 template<class NumPackets, class ImportLids, class Device>
299 Kokkos::UnorderedMap<typename ImportLids::non_const_value_type,
300  typename NumPackets::non_const_value_type,
301  Device>
302 computeCrsPadding(const NumPackets& num_packets_per_lid,
303  const ImportLids& import_lids,
304  const bool unpack_pids)
305 {
306  // Create a mapping of {LID: extra space needed} to rapidly look up which LIDs
307  // need additional padding.
308  using key_type = typename ImportLids::non_const_value_type;
309  using val_type = typename NumPackets::non_const_value_type;
310  Kokkos::UnorderedMap<key_type, val_type, Device> padding(import_lids.size());
311  auto policy = Kokkos::RangePolicy<typename Device::execution_space>(0, import_lids.size());
312  Kokkos::parallel_for("Fill padding", policy,
313  KOKKOS_LAMBDA(typename ImportLids::size_type i) {
314  auto how_much_padding = (unpack_pids) ? num_packets_per_lid(i)/2
315  : num_packets_per_lid(i);
316  padding.insert(import_lids(i), how_much_padding);
317  }
318  );
319  TEUCHOS_TEST_FOR_EXCEPTION(padding.failed_insert(), std::runtime_error,
320  "computeCrsPadding: failed to insert one or more indices in to padding map");
321  return padding;
322 }
323 
330 template<class LocalOrdinal, class Packet, class RowView,
331  class IndicesView, class BufferDevice>
332 void
334 (const RowView& row_ptrs_beg,
335  const RowView& row_ptrs_end,
336  IndicesView& indices,
337  const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
338  const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
339  const Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>& import_lids,
340  const bool unpack_pids)
341 {
342 
343  using ImportLidsView =
344  Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>;
345  using NumPacketsView =
346  Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>;
347  using LO = LocalOrdinal;
348  using execution_space = typename BufferDevice::execution_space;
349  using range_policy =
350  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
351  using unpack_functor_type =
353 
354  const char prefix[] =
355  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
356 
357  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
358  if (num_import_lids == 0) {
359  // Nothing to unpack
360  return;
361  }
362 
363  using device_type = typename IndicesView::device_type;
364 
365  // Resize row pointers and indices to accommodate incoming data
366  auto padding =
367  computeCrsPadding<NumPacketsView, ImportLidsView, device_type>
368  (num_packets_per_lid, import_lids, unpack_pids);
369  padCrsArrays<RowView, IndicesView, decltype (padding) > (row_ptrs_beg, row_ptrs_end, indices, padding);
370 
371  // Get the offsets
372  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
373  computeOffsetsFromCounts(offsets, num_packets_per_lid);
374 
375  // Determine the maximum number of entries in any row in the graph. The
376  // maximum number of entries is needed to allocate unpack buffers on the
377  // device.
378  size_t max_num_ent;
379  Kokkos::parallel_reduce
380  ("MaxReduce",
381  range_policy (0, LO (num_packets_per_lid.size ())),
382  KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
383  const size_t num_packets_this_lid = num_packets_per_lid(i);
384  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
385  num_packets_this_lid;
386  if (num_ent > running_max_num_ent) {
387  running_max_num_ent = num_ent;
388  }
389  }, Kokkos::Max<size_t> (max_num_ent));
390 
391  // Now do the actual unpack!
392  unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
393  num_packets_per_lid, import_lids, offsets,
394  max_num_ent, unpack_pids);
395 
396  typename unpack_functor_type::value_type x;
397  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
398  auto x_h = x.to_std_pair();
399  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
400  prefix << "UnpackAndCombineFunctor reported error code "
401  << x_h.first << " for the first bad row " << x_h.second);
402 }
403 
404 template<class Packet, class LocalGraph, class BufferDevice>
405 size_t
407  const LocalGraph& local_graph,
408  const Kokkos::View<const typename LocalGraph::data_type*,
409  typename LocalGraph::device_type,
410  Kokkos::MemoryUnmanaged> permute_from_lids,
411  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
412  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
413  const size_t num_same_ids)
414 {
415  using Kokkos::parallel_reduce;
416  using local_graph_type = LocalGraph;
417  using LO = typename local_graph_type::data_type;
418  using device_type = typename local_graph_type::device_type;
419  using execution_space = typename device_type::execution_space;
420  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
421 
422  size_t count = 0;
423  LO num_items;
424 
425  // Number of graph entries to unpack (returned by this function).
426  num_items = static_cast<LO>(num_same_ids);
427  if (num_items) {
428  size_t kcnt = 0;
429  parallel_reduce(
430  range_policy(0, num_items),
431  KOKKOS_LAMBDA(const LO lid, size_t& update) {
432  update += static_cast<size_t>(local_graph.row_map[lid+1]
433  -local_graph.row_map[lid]);
434  }, kcnt);
435  count += kcnt;
436  }
437 
438  // Count entries copied directly from the source graph with permuting.
439  num_items = static_cast<LO>(permute_from_lids.extent(0));
440  if (num_items) {
441  size_t kcnt = 0;
442  parallel_reduce(
443  range_policy(0, num_items),
444  KOKKOS_LAMBDA(const LO i, size_t& update) {
445  const LO lid = permute_from_lids(i);
446  update += static_cast<size_t>(local_graph.row_map[lid+1]
447  - local_graph.row_map[lid]);
448  }, kcnt);
449  count += kcnt;
450  }
451 
452  {
453  // Count entries received from other MPI processes.
454  size_t tot_num_ent = 0;
455  parallel_reduce("SumReduce",
456  num_packets_per_lid.size(),
457  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
458  lsum += num_packets_per_lid(i) / 2;
459  }, Kokkos::Sum<size_t>(tot_num_ent));
460  count += tot_num_ent;
461  }
462 
463  return count;
464 }
465 
467 template<class Packet, class LO, class Device, class BufferDevice>
468 void
470  const Kokkos::View<size_t*, Device>& tgt_rowptr,
471  const Kokkos::View<const LO*, BufferDevice>& import_lids,
472  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
473  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
474 {
475  using Kokkos::parallel_reduce;
476  using device_type = Device;
477  using execution_space = typename device_type::execution_space;
478  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
479  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
480 
481  const size_type N = num_packets_per_lid.extent(0);
482  parallel_for("Setup row pointers for remotes",
483  range_policy(0, N),
484  KOKKOS_LAMBDA(const size_t i){
485  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
486  const size_t num_packets_this_lid = num_packets_per_lid(i);
487  const size_t num_ent = num_packets_this_lid / 2;
488  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
489  });
490 }
491 
492 // Convert array of row lengths to a CRS pointer array
493 template<class Device>
494 void
495 makeCrsRowPtrFromLengths(
496  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
497  const Kokkos::View<size_t*,Device>& new_start_row)
498 {
499  using Kokkos::parallel_scan;
500  using device_type = Device;
501  using execution_space = typename device_type::execution_space;
502  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
503  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
504  const size_type N = new_start_row.extent(0);
505  parallel_scan(
506  range_policy(0, N),
507  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
508  auto cur_val = tgt_rowptr(i);
509  if (final) {
510  tgt_rowptr(i) = update;
511  new_start_row(i) = tgt_rowptr(i);
512  }
513  update += cur_val;
514  }
515  );
516 }
517 
518 template<class LocalGraph, class LocalMap>
519 void
520 copyDataFromSameIDs(
521  const Kokkos::View<typename LocalMap::global_ordinal_type*,
522  typename LocalMap::device_type>& tgt_colind,
523  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
524  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
525  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
526  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
527  const LocalGraph& local_graph,
528  const LocalMap& local_col_map,
529  const size_t num_same_ids,
530  const int my_pid)
531 {
532  using Kokkos::parallel_for;
533  using device_type = typename LocalMap::device_type;
534  using LO = typename LocalMap::local_ordinal_type;
535  using execution_space = typename device_type::execution_space;
536  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
537 
538  parallel_for(
539  range_policy(0, num_same_ids),
540  KOKKOS_LAMBDA(const size_t i) {
541  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
542 
543  const LO src_lid = static_cast<LO>(i);
544  size_t src_row = local_graph.row_map(src_lid);
545 
546  const LO tgt_lid = static_cast<LO>(i);
547  const size_t tgt_row = tgt_rowptr(tgt_lid);
548 
549  const size_t nsr = local_graph.row_map(src_lid+1)
550  - local_graph.row_map(src_lid);
551  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
552 
553  for (size_t j=local_graph.row_map(src_lid);
554  j<local_graph.row_map(src_lid+1); ++j) {
555  LO src_col = local_graph.entries(j);
556  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
557  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
558  }
559  }
560  );
561 }
562 
563 template<class LocalGraph, class LocalMap, class BufferDevice>
564 void
565 copyDataFromPermuteIDs(
566  const Kokkos::View<typename LocalMap::global_ordinal_type*,
567  typename LocalMap::device_type>& tgt_colind,
568  const Kokkos::View<int*,
569  typename LocalMap::device_type>& tgt_pids,
570  const Kokkos::View<size_t*,
571  typename LocalMap::device_type>& new_start_row,
572  const Kokkos::View<size_t*,
573  typename LocalMap::device_type>& tgt_rowptr,
574  const Kokkos::View<const int*,
575  typename LocalMap::device_type>& src_pids,
576  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
577  BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
578  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
579  BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
580  const LocalGraph& local_graph,
581  const LocalMap& local_col_map,
582  const int my_pid)
583 {
584  using Kokkos::parallel_for;
585  using device_type = typename LocalMap::device_type;
586  using LO = typename LocalMap::local_ordinal_type;
587  using execution_space = typename device_type::execution_space;
588  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
589  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
590 
591  const size_type num_permute_to_lids = permute_to_lids.extent(0);
592 
593  parallel_for(
594  range_policy(0, num_permute_to_lids),
595  KOKKOS_LAMBDA(const size_t i) {
596  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
597 
598  const LO src_lid = permute_from_lids(i);
599  const size_t src_row = local_graph.row_map(src_lid);
600 
601  const LO tgt_lid = permute_to_lids(i);
602  const size_t tgt_row = tgt_rowptr(tgt_lid);
603 
604  size_t nsr = local_graph.row_map(src_lid+1)
605  - local_graph.row_map(src_lid);
606  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
607 
608  for (size_t j=local_graph.row_map(src_lid);
609  j<local_graph.row_map(src_lid+1); ++j) {
610  LO src_col = local_graph.entries(j);
611  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
612  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
613  }
614  }
615  );
616 }
617 
618 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
619 void
620 unpackAndCombineIntoCrsArrays2(
621  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
622  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
623  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
624  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
625  const Kokkos::View<
626  const typename LocalMap::local_ordinal_type*,
627  BufferDevice,
628  Kokkos::MemoryUnmanaged>& import_lids,
629  const Kokkos::View<const Packet*, BufferDevice>& imports,
630  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
631  const LocalGraph& /* local_graph */,
632  const LocalMap /*& local_col_map*/,
633  const int my_pid)
634 {
635  using Kokkos::View;
636  using Kokkos::subview;
637  using Kokkos::MemoryUnmanaged;
638  using Kokkos::parallel_reduce;
639  using Kokkos::atomic_fetch_add;
640 
641  using device_type = typename LocalMap::device_type;
642  using LO = typename LocalMap::local_ordinal_type;
643  using GO = typename LocalMap::global_ordinal_type;
644  using execution_space = typename device_type::execution_space;
645  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
646  using slice = typename Kokkos::pair<size_type, size_type>;
647  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
648 
649  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
650  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
651 
652  const size_type num_import_lids = import_lids.size();
653  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
654 
655  // RemoteIDs: Loop structure following UnpackAndCombine
656  int gbl_err_count;
657  parallel_reduce("Unpack and combine into CRS",
658  range_policy(0, num_import_lids),
659  KOKKOS_LAMBDA(const size_t i, int& err) {
660  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
661  const size_t num_packets_this_lid = num_packets_per_lid(i);
662  const size_t num_ent = num_packets_this_lid / 2;
663  const size_t offset = offsets(i);
664  const LO lcl_row = import_lids(i);
665  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
666  const size_t end_row = start_row + num_ent;
667 
668  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
669  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
670 
671  err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
672 
673  // Correct target PIDs.
674  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
675  const int pid = pids_out(j);
676  pids_out(j) = (pid != my_pid) ? pid : -1;
677  }
678  }, gbl_err_count);
679 
680  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
681  std::invalid_argument, prefix <<
682  "Attempting to unpack PIDs, but num_ent is not even; this should never "
683  "happen! Please report this bug to the Tpetra developers.");
684 
685  return;
686 }
687 
688 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
689 void
691  const LocalGraph & local_graph,
692  const LocalMap & local_col_map,
693  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
694  BufferDevice,
695  Kokkos::MemoryUnmanaged>& import_lids,
696  const Kokkos::View<const Packet*, BufferDevice>& imports,
697  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
698  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
699  BufferDevice,
700  Kokkos::MemoryUnmanaged>& permute_to_lids,
701  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
702  BufferDevice,
703  Kokkos::MemoryUnmanaged>& permute_from_lids,
704  const Kokkos::View<size_t*,
705  typename LocalMap::device_type,
706  Kokkos::MemoryUnmanaged>& tgt_rowptr,
707  const Kokkos::View<typename LocalMap::global_ordinal_type*,
708  typename LocalMap::device_type,
709  Kokkos::MemoryUnmanaged>& tgt_colind,
710  const Kokkos::View<const int*,
711  typename LocalMap::device_type,
712  Kokkos::MemoryUnmanaged>& src_pids,
713  const Kokkos::View<int*,
714  typename LocalMap::device_type,
715  Kokkos::MemoryUnmanaged>& tgt_pids,
716  const size_t num_same_ids,
717  const size_t tgt_num_rows,
718  const size_t tgt_num_nonzeros,
719  const int my_tgt_pid)
720 {
721  using Kokkos::View;
722  using Kokkos::subview;
723  using Kokkos::parallel_for;
724  using Kokkos::MemoryUnmanaged;
725  using packet_type = Packet;
726  using local_map_type = LocalMap;
727  using local_graph_type = LocalGraph;
728  using buffer_device_type = BufferDevice;
729  using device_type = typename LocalMap::device_type;
730  using LO = typename LocalMap::local_ordinal_type;
731  using execution_space = typename device_type::execution_space;
732  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
733  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
734 
735  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
736 
737  const size_t N = tgt_num_rows;
738  const size_t mynnz = tgt_num_nonzeros;
739 
740  // In the case of reduced communicators, the sourceGraph won't have
741  // the right "my_pid", so thus we have to supply it.
742  const int my_pid = my_tgt_pid;
743 
744  // FIXME (mfh 24 Jun 2019)
745  //
746  // 1. Only zero the entries of tgt_rowptr that actually need it.
747  // 2. Consider merging these three kernels into one.
748 
749  // Zero the rowptr
750  parallel_for(
751  range_policy(0, N+1),
752  KOKKOS_LAMBDA(const size_t i) {
753  tgt_rowptr(i) = 0;
754  }
755  );
756 
757  // same IDs: Always first, always in the same place
758  parallel_for(
759  range_policy(0, num_same_ids),
760  KOKKOS_LAMBDA(const size_t i) {
761  const LO tgt_lid = static_cast<LO>(i);
762  const LO src_lid = static_cast<LO>(i);
763  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
764  - local_graph.row_map(src_lid);
765  }
766  );
767 
768  // Permute IDs: Still local, but reordered
769  const size_type num_permute_to_lids = permute_to_lids.extent(0);
770  parallel_for(
771  range_policy(0, num_permute_to_lids),
772  KOKKOS_LAMBDA(const size_t i) {
773  const LO tgt_lid = permute_to_lids(i);
774  const LO src_lid = permute_from_lids(i);
775  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
776  - local_graph.row_map(src_lid);
777  }
778  );
779 
780  // Get the offsets from the number of packets per LID
781  const size_type num_import_lids = import_lids.extent(0);
782  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
783  computeOffsetsFromCounts(offsets, num_packets_per_lid);
784 
785 #ifdef HAVE_TPETRA_DEBUG
786  {
787  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
788  const bool condition =
789  nth_offset_h != static_cast<size_t>(imports.extent(0));
790  TEUCHOS_TEST_FOR_EXCEPTION
791  (condition, std::logic_error, prefix
792  << "The final offset in bytes " << nth_offset_h
793  << " != imports.size() = " << imports.extent(0)
794  << ". Please report this bug to the Tpetra developers.");
795  }
796 #endif // HAVE_TPETRA_DEBUG
797 
798  // Setup row pointers for remotes
799  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
800  tgt_rowptr, import_lids, imports, num_packets_per_lid);
801 
802  // If multiple processes contribute to the same row, we may need to
803  // update row offsets. This tracks that.
804  View<size_t*, device_type> new_start_row("new_start_row", N+1);
805 
806  // Turn row length into a real CRS row pointer
807  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
808  {
809  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
810  bool condition = nth_tgt_rowptr_h != mynnz;
811  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
812  prefix << "CRS_rowptr[last] = " <<
813  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
814  }
815 
816  // SameIDs: Copy the data over
817  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
818  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
819 
820  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
821  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
822  local_graph, local_col_map, my_pid);
823 
824  if (imports.extent(0) <= 0) {
825  return;
826  }
827 
828  unpackAndCombineIntoCrsArrays2<
829  packet_type,local_graph_type,local_map_type,buffer_device_type>(
830  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
831  num_packets_per_lid, local_graph, local_col_map, my_pid);
832 
833  return;
834 }
835 
836 } // namespace UnpackAndCombineCrsGraphImpl
837 
871 template<class LO, class GO, class Node>
872 void
874  CrsGraph<LO, GO, Node>& graph,
875  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,Node>::packet_type>& imports,
876  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
877  const Teuchos::ArrayView<const LO>& importLIDs,
878  size_t /* constantNumPackets */,
879  Distributor & /* distor */,
880  CombineMode /* combineMode */)
881 {
882 
883  TEUCHOS_TEST_FOR_EXCEPTION(!graph.isGloballyIndexed(), std::invalid_argument,
884  "Graph must be globally indexed!");
885 
886 
887  using Kokkos::View;
888  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
889  using graph_type = CrsGraph<LO,GO,Node>;
890  using device_type = typename Node::device_type;
891  using buffer_device_type = typename graph_type::buffer_device_type;
892  using execution_space = typename device_type::execution_space;
893  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
894  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
895  using indices_type = typename graph_type::t_GlobalOrdinal_1D;
896 
897  // Convert all Teuchos::Array to Kokkos::View.
898 
899  buffer_device_type bufferOutputDevice;
900 
901  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
902  // them to device. Since unpacking is done directly in to the local graph
903  // (lclGraph), no copying needs to be performed after unpacking.
904  auto imports_d =
905  create_mirror_view_from_raw_host_array(bufferOutputDevice,
906  imports.getRawPtr(), imports.size(),
907  true, "imports");
908 
909  auto num_packets_per_lid_d =
910  create_mirror_view_from_raw_host_array(bufferOutputDevice,
911  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
912  true, "num_packets_per_lid");
913 
914  auto import_lids_d =
915  create_mirror_view_from_raw_host_array(bufferOutputDevice,
916  importLIDs.getRawPtr(), importLIDs.size(),
917  true, "import_lids");
918 
919  // We are OK using the protected data directly (k_*) because this function is
920  // a friend of CrsGraph
921  indices_type indices("indices", graph.k_gblInds1D_.extent(0));
922  Kokkos::deep_copy(indices, graph.k_gblInds1D_);
923 
924  row_ptrs_type row_ptrs_beg("row_ptrs_beg", graph.k_rowPtrs_.extent(0));
925  Kokkos::deep_copy(row_ptrs_beg, graph.k_rowPtrs_);
926 
927  const size_t N = (row_ptrs_beg.extent(0) == 0 ? 0 : row_ptrs_beg.extent(0) - 1);
928  row_ptrs_type row_ptrs_end("row_ptrs_end", N);
929 
930  bool refill_num_row_entries = false;
931  if (graph.k_numRowEntries_.extent(0) > 0) {
932  // Case 1: Packed storage
933  refill_num_row_entries = true;
934  auto num_row_entries = graph.k_numRowEntries_;
935  Kokkos::parallel_for("Fill end row pointers", range_policy(0, N),
936  KOKKOS_LAMBDA(const size_t i){
937  row_ptrs_end(i) = row_ptrs_beg(i) + num_row_entries(i);
938  });
939 
940  } else {
941  // mfh If packed storage, don't need row_ptrs_end to be separate allocation;
942  // could just have it alias row_ptrs_beg+1.
943 
944  // Case 2: Packed storage
945  Kokkos::parallel_for("Fill end row pointers",
946  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
947  row_ptrs_end(i) = row_ptrs_beg(i+1);
948  });
949  }
950 
951  // Now do the actual unpack!
952  unpackAndCombine<LO, GO, row_ptrs_type, indices_type, buffer_device_type>
953  (row_ptrs_beg, row_ptrs_end, indices, imports_d,
954  num_packets_per_lid_d, import_lids_d, false);
955 
956  // mfh Later, permit graph to be locally indexed, and check whether
957  // incoming column indices are in the column Map. If not, error.
958  if (refill_num_row_entries) {
959  Kokkos::parallel_for("Fill num entries",
960  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
961  graph.k_numRowEntries_(i) = row_ptrs_end(i) - row_ptrs_beg(i);
962  });
963  }
964  graph.k_rowPtrs_ = row_ptrs_beg;
965  graph.k_gblInds1D_ = indices;
966 
967  return;
968 }
969 
970 template<class LO, class GO, class Node>
971 void
972 unpackCrsGraphAndCombineNew(
973  CrsGraph<LO, GO, Node>& /* sourceGraph */,
974  const Kokkos::DualView<const typename CrsGraph<LO,GO,Node>::packet_type*,
975  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* imports */,
976  const Kokkos::DualView<const size_t*,
977  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* numPacketsPerLID */,
978  const Kokkos::DualView<const LO*,
979  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* importLIDs */,
980  const size_t /* constantNumPackets */,
981  Distributor& /* distor */,
982  const CombineMode /* combineMode */)
983 {
984  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "METHOD NOT COMPLETE");
985 #if 0
986  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
988  using Kokkos::View;
989  using device_type = typename Node::device_type;
990  using graph_type = CrsGraph<LO, GO, Node>;
991  using packet_type = typename graph_type::packet_type;
992  using local_graph_type = typename graph_type::local_graph_type;
993  using buffer_device_type = typename graph_type::buffer_device_type;
994  using buffer_memory_space = typename buffer_device_type::memory_space;
995  using memory_space = typename device_type::memory_space;
996 
997  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
998  using execution_space = typename device_type::execution_space;
999  using indices_type = Kokkos::View<GO*, execution_space>;
1000 
1001  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
1002  "Node::device_type and LocalGraph::device_type must be "
1003  "the same.");
1004 
1005  {
1006  auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID);
1007  numPacketsPerLID_nc.sync_device ();
1008  }
1009  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1010 
1011  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1012  auto import_lids_d = importLIDs.view_device ();
1013 
1014  {
1015  auto imports_nc = castAwayConstDualView(imports);
1016  imports_nc.sync_device ();
1017  }
1018  auto imports_d = imports.view_device ();
1019 
1020  // Now do the actual unpack!
1021  // TJF: Should be grabbed from the Graph
1022  indices_type indices;
1023  row_ptrs_type row_ptrs_beg;
1024  row_ptrs_type row_ptrs_end;
1025  unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
1026  row_ptrs_beg, row_ptrs_end, indices, imports_d,
1027  num_packets_per_lid_d, import_lids_d, false);
1028 #endif // 0
1029 }
1030 
1080 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1081 size_t
1083  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1084  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1085  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
1086  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1087  size_t /* constantNumPackets */,
1088  Distributor &/* distor */,
1089  CombineMode /* combineMode */,
1090  size_t numSameIDs,
1091  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1092  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1093 {
1094  using Kokkos::MemoryUnmanaged;
1095  using Kokkos::View;
1096  using device_type = typename Node::device_type;
1097  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
1098  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
1099  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
1100  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1101 
1102  TEUCHOS_TEST_FOR_EXCEPTION
1103  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1104  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1105  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1106  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1107  // process, then the graph is neither locally nor globally indexed.
1108  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
1109  TEUCHOS_TEST_FOR_EXCEPTION
1110  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1111  "CrsGraph 'sourceGraph' must be locally indexed.");
1112  TEUCHOS_TEST_FOR_EXCEPTION
1113  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1114  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1115  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1116 
1117  auto local_graph = sourceGraph.getLocalGraph();
1118  auto permute_from_lids_d =
1120  permuteFromLIDs.getRawPtr(),
1121  permuteFromLIDs.size(), true,
1122  "permute_from_lids");
1123  auto imports_d =
1124  create_mirror_view_from_raw_host_array(buffer_device_type(),
1125  imports.getRawPtr(),
1126  imports.size(), true,
1127  "imports");
1128  auto num_packets_per_lid_d =
1129  create_mirror_view_from_raw_host_array(buffer_device_type(),
1130  numPacketsPerLID.getRawPtr(),
1131  numPacketsPerLID.size(), true,
1132  "num_packets_per_lid");
1133 
1135  packet_type,local_graph_type,buffer_device_type>(
1136  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
1137 }
1138 
1152 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1153 void
1155  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1156  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1157  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
1158  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1159  const size_t /* constantNumPackets */,
1160  Distributor& /* distor */,
1161  const CombineMode /* combineMode */,
1162  const size_t numSameIDs,
1163  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1164  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1165  size_t TargetNumRows,
1166  size_t TargetNumNonzeros,
1167  const int MyTargetPID,
1168  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1169  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1170  const Teuchos::ArrayView<const int>& SourcePids,
1171  Teuchos::Array<int>& TargetPids)
1172 {
1173  using Kokkos::View;
1174  using Kokkos::deep_copy;
1175  using Teuchos::outArg;
1176  using Teuchos::REDUCE_MAX;
1177  using Teuchos::reduceAll;
1178  using LO = LocalOrdinal;
1179  using GO = GlobalOrdinal;
1180  using crs_graph_type = CrsGraph<LO, GO, Node>;
1181  using packet_type = typename crs_graph_type::packet_type;
1182  using local_graph_type = typename crs_graph_type::local_graph_type;
1183  using buffer_device_type = typename crs_graph_type::buffer_device_type;
1184  using device_type = typename Node::device_type;
1185  using size_type = typename Teuchos::ArrayView<const LO>::size_type;
1186 
1187  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1188 
1189  TEUCHOS_TEST_FOR_EXCEPTION(
1190  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1191  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1192  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1193 
1194  TEUCHOS_TEST_FOR_EXCEPTION(
1195  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1196  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
1197  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1198  const size_type numImportLIDs = importLIDs.size();
1199 
1200  TEUCHOS_TEST_FOR_EXCEPTION(
1201  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1202  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1203  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1204 
1205  // Preseed TargetPids with -1 for local
1206  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1207  TargetPids.resize(TargetNumNonzeros);
1208  }
1209  TargetPids.assign(TargetNumNonzeros, -1);
1210 
1211  // Grab pointers for sourceGraph
1212  auto local_graph = sourceGraph.getLocalGraph();
1213  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
1214 
1215  // Convert input arrays to Kokkos::View
1216  device_type outputDevice;
1217  buffer_device_type bufferOutputDevice;
1218 
1219  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
1221  (bufferOutputDevice, importLIDs.getRawPtr(),
1222  importLIDs.size(), true, "import_lids");
1223 
1224  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
1226  (bufferOutputDevice, imports.getRawPtr(),
1227  imports.size(), true, "imports");
1228 
1229  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1230  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1231  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1232  true, "num_packets_per_lid");
1233 
1234  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1235  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1236  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1237  true, "permute_to_lids");
1238 
1239  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1240  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1241  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1242  true, "permute_from_lids");
1243 
1244  Kokkos::View<size_t*, device_type> crs_rowptr_d =
1246  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1247  true, "crs_rowptr");
1248 
1249  Kokkos::View<GO*, device_type> crs_colind_d =
1251  CRS_colind.getRawPtr(), CRS_colind.size(),
1252  true, "crs_colidx");
1253 
1254  Kokkos::View<const int*, device_type> src_pids_d =
1256  SourcePids.getRawPtr(), SourcePids.size(),
1257  true, "src_pids");
1258 
1259  Kokkos::View<int*, device_type> tgt_pids_d =
1261  TargetPids.getRawPtr(), TargetPids.size(),
1262  true, "tgt_pids");
1263 
1264  using local_map_type = decltype(local_col_map);
1266  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1267  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1268  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1269  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1270 
1271  // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1272 
1273  // Copy outputs back to host
1274  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1275  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1276  deep_copy(crs_rowptr_h, crs_rowptr_d);
1277 
1278  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1279  CRS_colind.getRawPtr(), CRS_colind.size());
1280  deep_copy(crs_colind_h, crs_colind_d);
1281 
1282  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1283  TargetPids.getRawPtr(), TargetPids.size());
1284  deep_copy(tgt_pids_h, tgt_pids_d);
1285 
1286 }
1287 
1288 } // namespace Details
1289 } // namespace Tpetra
1290 
// Explicit instantiation macro for the four public functions defined in this
// file, for one (LO, GO, NT) combination.  The expansion uses the unqualified
// names CrsGraph, Distributor and CombineMode and the qualified name
// Details::..., so it must be invoked where those names resolve (presumably
// inside namespace Tpetra -- confirm against the instantiation sources).
#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
  template void \
  Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
    CrsGraph<LO, GO, NT>&, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
    const Teuchos::ArrayView<const size_t>&, \
    const Teuchos::ArrayView<const LO>&, \
    size_t, \
    Distributor&, \
    CombineMode); \
  template void \
  Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
    CrsGraph<LO, GO, NT>&, \
    const Kokkos::DualView<const typename CrsGraph<LO, GO, NT>::packet_type*, \
                           typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
    const Kokkos::DualView<const size_t*, \
                           typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
    const Kokkos::DualView<const LO*, \
                           typename CrsGraph<LO, GO, NT>::buffer_device_type>&, \
    const size_t, \
    Distributor&, \
    const CombineMode); \
  template void \
  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT> &, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
    const Teuchos::ArrayView<const size_t>&, \
    const size_t, \
    Distributor&, \
    const CombineMode, \
    const size_t, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const LO>&, \
    size_t, \
    size_t, \
    const int, \
    const Teuchos::ArrayView<size_t>&, \
    const Teuchos::ArrayView<GO>&, \
    const Teuchos::ArrayView<const int>&, \
    Teuchos::Array<int>&); \
  template size_t \
  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
    const CrsGraph<LO, GO, NT> &, \
    const Teuchos::ArrayView<const LO> &, \
    const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
    const Teuchos::ArrayView<const size_t>&, \
    size_t, \
    Distributor &, \
    CombineMode, \
    size_t, \
    const Teuchos::ArrayView<const LO>&, \
    const Teuchos::ArrayView<const LO>&);
1344 
1345 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsGraph class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsetsFromConstantCount.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void unpackAndCombine(const RowView &row_ptrs_beg, const RowView &row_ptrs_end, IndicesView &indices, const Kokkos::View< const Packet *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const LocalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &import_lids, const bool unpack_pids)
Perform the unpack operation for the graph.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, BufferDevice > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Setup row pointers for remotes.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
local_graph_type::row_map_type::const_type k_rowPtrs_
Row offsets for "1-D" storage.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
t_GlobalOrdinal_1D k_gblInds1D_
Global column indices for all rows.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
local_graph_type getLocalGraph() const
Get the local graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
Sets up and executes a communication plan for a Tpetra DistObject.
Implementation details of Tpetra.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t, Distributor &, CombineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void unpackCrsGraphAndCombine(CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and combine into graph.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t, Distributor &, const CombineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GlobalOrdinal > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.