42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
56 #include "Kokkos_Core.hpp"
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
91 namespace UnpackAndCombineCrsGraphImpl {
// Unpack the packed entries of a single row from `imports`.
//
// Global indices are read from imports(offset + k) for k in [0, num_ent).
// When `pids_out` is non-empty, process ids are read from the slots that
// follow, imports(offset + num_ent + k), and narrowed to int.
//
// Per the Doxygen signature summary at the end of this listing, the function
// is `KOKKOS_FUNCTION int unpackRow(..., const size_t offset,
// const size_t num_ent)`.
//
// NOTE(review): the embedded listing numbers skip lines (102 -> 104,
// 106 -> 108, 110 -> 118, 119 -> 122), so the return-type line, the `offset`
// parameter line, the braces and the return statements are not visible here.
102 template<
class Packet,
class GO,
class Device,
class BufferDevice>
104 unpackRow (
const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
105 const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
106 const Kokkos::View<const Packet*,BufferDevice>& imports,
108 const size_t num_ent)
110 using size_type =
typename Kokkos::View<GO*,Device>::size_type;
// Copy the row's global indices out of the packed buffer.
118 for (size_type k=0; k<num_ent; k++)
119 gids_out(k) = imports(offset+k);
// An empty pids_out view means the caller does not want process ids.
122 if (pids_out.size() > 0) {
123 for (size_type k=0; k<num_ent; k++) {
124 pids_out(k) =
static_cast<int>(imports(offset+num_ent+k));
// Reduction functor that unpacks imported rows and appends their global
// indices to `indices`. The reduction value is a Kokkos::pair<int, LO>; the
// caller's error message treats it as (error code, first bad row).
//
// NOTE(review): the embedded listing numbers skip lines (141 -> 148,
// 172 -> 175, 181 -> 184, 194 -> 197, 198 -> 200, ...). The struct header,
// the remaining template parameters, the declarations of `max_num_ent` and
// `unpack_pids` (both initialized in the constructor below), the initializers
// for `indices`, `imports` and `offsets`, and most braces/returns are not
// visible in this excerpt.
141 template<
class LocalOrdinal,
148 using LO = LocalOrdinal;
149 using GO =
typename IndicesView::value_type;
150 using packet_type = Packet;
151 using row_ptrs_type = RowView;
152 using indices_type = IndicesView;
153 using buffer_device_type = BufferDevice;
// The execution space is taken from the indices view's device.
155 using device_type =
typename IndicesView::device_type;
156 using execution_space =
typename device_type::execution_space;
// Communication-buffer inputs use buffer_device_type; offsets and the scratch
// arrays use device_type.
158 using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
159 using offsets_type = Kokkos::View<const size_t*, device_type>;
160 using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
161 using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
163 using gids_scratch_type = Kokkos::View<GO*, device_type>;
164 using pids_scratch_type = Kokkos::View<int*,device_type>;
166 row_ptrs_type row_ptrs_beg;
167 row_ptrs_type row_ptrs_end;
168 indices_type indices;
169 input_buffer_type imports;
170 num_packets_per_lid_type num_packets_per_lid;
171 import_lids_type import_lids;
172 offsets_type offsets;
// Each call to operator() acquires a token and uses it to pick a private
// slice of the scratch arrays below.
175 Kokkos::Experimental::UniqueToken<execution_space,
176 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
177 gids_scratch_type gids_scratch;
178 pids_scratch_type pids_scratch;
181 using value_type = Kokkos::pair<int, LO>;
// Constructor: stores the views and allocates scratch with max_num_ent
// entries per token (tokens.size() * max_num_ent in total).
184 const row_ptrs_type& row_ptrs_beg_in,
185 const row_ptrs_type& row_ptrs_end_in,
186 const indices_type& indices_in,
187 const input_buffer_type& imports_in,
188 const num_packets_per_lid_type& num_packets_per_lid_in,
189 const import_lids_type& import_lids_in,
190 const offsets_type& offsets_in,
191 const size_t max_num_ent_in,
192 const bool unpack_pids_in) :
193 row_ptrs_beg(row_ptrs_beg_in),
194 row_ptrs_end(row_ptrs_end_in),
197 num_packets_per_lid(num_packets_per_lid_in),
198 import_lids(import_lids_in),
200 max_num_ent(max_num_ent_in),
201 unpack_pids(unpack_pids_in),
202 tokens(execution_space()),
203 gids_scratch(
"gids_scratch", tokens.size() * max_num_ent),
204 pids_scratch(
"pids_scratch", tokens.size() * max_num_ent)
// Reduction identity: error code 0 paired with the invalid local ordinal.
207 KOKKOS_INLINE_FUNCTION
void init(value_type& dst)
const
209 using Tpetra::Details::OrdinalTraits;
210 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
// Combine two partial results. Only the guards are visible: the inner branch
// is entered when `src` carries a valid row and `dst` either has no row yet
// or has a larger one. Presumably `dst` takes `src` there, keeping the
// smallest bad row — the assignment is on lines not shown; TODO confirm.
213 KOKKOS_INLINE_FUNCTION
void
214 join(
volatile value_type& dst,
const volatile value_type& src)
const
220 using Tpetra::Details::OrdinalTraits;
221 if (src.second != OrdinalTraits<LO>::invalid()) {
226 if (dst.second == OrdinalTraits<LO>::invalid() ||
227 src.second < dst.second) {
// Unpack import entry `i` and append its global indices to the target row.
// Error codes written to `dst` (paired with `i`):
//   1 - unpack_pids is set but the packet count is odd
//   2 - the row's packets would run past the end of `imports`
//   3 - set after the unpackRow call (the guarding condition is on a line
//       not shown; presumably `err != 0`)
233 KOKKOS_INLINE_FUNCTION
234 void operator()(
const LO i, value_type& dst)
const
237 using Kokkos::subview;
238 using Kokkos::MemoryUnmanaged;
239 using size_type =
typename execution_space::size_type;
240 using slice =
typename Kokkos::pair<size_type, size_type>;
242 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
243 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
// With PIDs packed alongside GIDs, half of the packets are entries.
245 const size_t num_packets_this_lid = num_packets_per_lid(i);
246 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
247 : num_packets_this_lid;
248 if (unpack_pids && num_packets_this_lid%2 != 0) {
251 dst = Kokkos::make_pair(1, i);
// Bounds-check this row's packet range against the import buffer.
261 const size_t buf_size = imports.size();
262 const size_t offset = offsets(i);
264 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
265 dst = Kokkos::make_pair(2, i);
// Scratch slice [a, b) belongs to the acquired token. The pids slice is
// empty ([a, a)) when PIDs are not unpacked, which makes unpackRow skip them.
272 const size_type token = tokens.acquire();
273 const size_t a =
static_cast<size_t>(token) * max_num_ent;
274 const size_t b = a + num_ent;
275 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
276 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
278 const int err =
unpackRow (gids_out, pids_out, imports, offset, num_ent);
// Error path: record the failure and give the token back.
281 dst = Kokkos::make_pair(3, i);
282 tokens.release(token);
// Append each unpacked index at the row's current end and advance that end.
// NOTE(review): the write and the `+= 1` are not atomic; presumably each
// import_lid is handled by a single iteration — confirm against callers.
286 auto import_lid = import_lids(i);
287 for (
size_t k = 0; k < num_ent; ++k) {
288 indices(row_ptrs_end(import_lid)) = gids_out(k);
290 row_ptrs_end(import_lid) += 1;
// Normal path: give the token back.
293 tokens.release(token);
// Build a Kokkos::UnorderedMap from import LID to the number of entries
// being imported for that LID. The value is num_packets_per_lid(i)/2 when
// unpack_pids is set, otherwise num_packets_per_lid(i).
//
// Throws std::runtime_error if any insertion into the map failed.
//
// NOTE(review): the listing breaks the template parameter `ImportLids`
// across two lines below, and skips lines (300 -> 302, 304 -> 308,
// 316 -> 319), so the map's third template argument, the braces and the
// return statement are not visible here.
298 template<
class NumPackets,
class ImportL
ids,
class Device>
299 Kokkos::UnorderedMap<
typename ImportLids::non_const_value_type,
300 typename NumPackets::non_const_value_type,
302 computeCrsPadding(
const NumPackets& num_packets_per_lid,
303 const ImportLids& import_lids,
304 const bool unpack_pids)
308 using key_type =
typename ImportLids::non_const_value_type;
309 using val_type =
typename NumPackets::non_const_value_type;
// The map is created with import_lids.size() as its capacity argument.
310 Kokkos::UnorderedMap<key_type, val_type, Device> padding(import_lids.size());
311 auto policy = Kokkos::RangePolicy<typename Device::execution_space>(0, import_lids.size());
312 Kokkos::parallel_for(
"Fill padding", policy,
313 KOKKOS_LAMBDA(
typename ImportLids::size_type i) {
314 auto how_much_padding = (unpack_pids) ? num_packets_per_lid(i)/2
315 : num_packets_per_lid(i);
316 padding.insert(import_lids(i), how_much_padding);
// Surface any failed insert as an exception.
319 TEUCHOS_TEST_FOR_EXCEPTION(padding.failed_insert(), std::runtime_error,
320 "computeCrsPadding: failed to insert one or more indices in to padding map");
// Unpack the import buffer into the graph's CRS arrays.
//
// Visible steps: compute per-row padding and pad the CRS arrays, allocate
// the per-import offsets, find the largest row in the imports, then run the
// unpack functor over all import LIDs. Throws std::runtime_error if the
// functor reports a non-zero error code.
//
// Per the Doxygen summary at the end of this listing, the signature is
// `void unpackAndCombine(...)`.
//
// NOTE(review): the embedded listing numbers skip lines (331 -> 334,
// 348 -> 350, 351 -> 354, 358 -> 363, 363 -> 367, 372 -> 379, ...). The
// function-name line, the `range_policy` alias head, the functor type, the
// declarations of `padding` and `max_num_ent`, the code that fills
// `offsets`, and the braces are not visible in this excerpt.
330 template<
class LocalOrdinal,
class Packet,
class RowView,
331 class IndicesView,
class BufferDevice>
334 (
const RowView& row_ptrs_beg,
335 const RowView& row_ptrs_end,
336 IndicesView& indices,
337 const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
338 const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
339 const Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>& import_lids,
340 const bool unpack_pids)
343 using ImportLidsView =
344 Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>;
345 using NumPacketsView =
346 Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>;
347 using LO = LocalOrdinal;
// Note: this execution space comes from BufferDevice, while device_type
// further down comes from IndicesView.
348 using execution_space =
typename BufferDevice::execution_space;
350 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
351 using unpack_functor_type =
354 const char prefix[] =
355 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
// Empty-import check; its body is not shown (presumably an early return).
357 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
358 if (num_import_lids == 0) {
363 using device_type =
typename IndicesView::device_type;
// Compute how many entries each imported row adds, then pad the CRS arrays.
367 computeCrsPadding<NumPacketsView, ImportLidsView, device_type>
368 (num_packets_per_lid, import_lids, unpack_pids);
369 padCrsArrays<RowView, IndicesView, decltype (padding) > (row_ptrs_beg, row_ptrs_end, indices, padding);
// One offset per import plus one extra slot. The code that fills the view is
// not visible in this excerpt.
372 Kokkos::View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Max-reduce the entry count over all imports; the result sizes the
// functor's per-token scratch space.
379 Kokkos::parallel_reduce
381 range_policy (0, LO (num_packets_per_lid.size ())),
382 KOKKOS_LAMBDA (
const LO i,
size_t& running_max_num_ent) {
383 const size_t num_packets_this_lid = num_packets_per_lid(i);
384 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
385 num_packets_this_lid;
386 if (num_ent > running_max_num_ent) {
387 running_max_num_ent = num_ent;
389 }, Kokkos::Max<size_t> (max_num_ent));
// Run the unpack functor; `x` receives the reduced (error code, row) pair.
392 unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
393 num_packets_per_lid, import_lids, offsets,
394 max_num_ent, unpack_pids);
396 typename unpack_functor_type::value_type x;
397 Kokkos::parallel_reduce(range_policy(0,
static_cast<LO
>(num_import_lids)), f, x);
398 auto x_h = x.to_std_pair();
399 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
400 prefix <<
"UnpackAndCombineFunctor reported error code "
401 << x_h.first <<
" for the first bad row " << x_h.second);
// Count entries from three sources and accumulate them into `count`:
//   1. row lengths of the first num_same_ids rows of local_graph,
//   2. row lengths of the rows listed in permute_from_lids,
//   3. num_packets_per_lid(i) / 2 summed over all imports.
//
// NOTE(review): the embedded listing numbers skip lines (404 -> 407,
// 420 -> 426, 426 -> 430, 433 -> 439, ...). The return type, the function
// name (presumably the impl-level unpackAndCombineWithOwningPIDsCount —
// confirm), the declarations of `count` and `num_items`, the parallel_reduce
// call heads for the first two loops, and the return statement are not
// visible in this excerpt.
404 template<
class Packet,
class LocalGraph,
class BufferDevice>
407 const LocalGraph& local_graph,
408 const Kokkos::View<
const typename LocalGraph::data_type*,
409 typename LocalGraph::device_type,
410 Kokkos::MemoryUnmanaged> permute_from_lids,
411 const Kokkos::View<const Packet*, BufferDevice>& ,
412 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
413 const size_t num_same_ids)
415 using Kokkos::parallel_reduce;
416 using local_graph_type = LocalGraph;
417 using LO =
typename local_graph_type::data_type;
418 using device_type =
typename local_graph_type::device_type;
419 using execution_space =
typename device_type::execution_space;
420 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
// Source 1: rows [0, num_same_ids) — add row_map[lid+1] - row_map[lid].
426 num_items =
static_cast<LO
>(num_same_ids);
430 range_policy(0, num_items),
431 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
432 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
433 -local_graph.row_map[lid]);
// Source 2: the rows named by permute_from_lids.
439 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
443 range_policy(0, num_items),
444 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
445 const LO lid = permute_from_lids(i);
446 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
447 - local_graph.row_map[lid]);
// Source 3: imports, counting half of each import's packets.
// NOTE(review): unlike the two reductions above, this one is given a bare
// iteration count instead of `range_policy` and indexes with `int` — confirm
// that the execution space it ends up on can access num_packets_per_lid.
454 size_t tot_num_ent = 0;
455 parallel_reduce(
"SumReduce",
456 num_packets_per_lid.size(),
457 KOKKOS_LAMBDA(
const int& i,
size_t& lsum) {
458 lsum += num_packets_per_lid(i) / 2;
459 }, Kokkos::Sum<size_t>(tot_num_ent));
460 count += tot_num_ent;
// For every import i, atomically add num_packets_per_lid(i) / 2 to
// tgt_rowptr(import_lids(i)). The atomic add allows several imports to
// target the same row.
//
// Per the Doxygen summary at the end of this listing, the signature is
// `void setupRowPointersForRemotes(...)`; the unnamed Packet-view parameter
// is unused.
//
// NOTE(review): the embedded listing numbers skip lines (467 -> 470,
// 482 -> 484, 488 -> ...), so the name line, the policy argument of the
// parallel_for (presumably range_policy(0, N)) and the braces are not
// visible. The `using` below names parallel_reduce while the call is
// parallel_for.
467 template<
class Packet,
class LO,
class Device,
class BufferDevice>
470 const Kokkos::View<size_t*, Device>& tgt_rowptr,
471 const Kokkos::View<const LO*, BufferDevice>& import_lids,
472 const Kokkos::View<const Packet*, BufferDevice>& ,
473 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
475 using Kokkos::parallel_reduce;
476 using device_type = Device;
477 using execution_space =
typename device_type::execution_space;
478 using size_type =
typename Kokkos::View<size_t*,device_type>::size_type;
479 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
481 const size_type N = num_packets_per_lid.extent(0);
482 parallel_for(
"Setup row pointers for remotes",
484 KOKKOS_LAMBDA(
const size_t i){
// The increment is cast to tgt_rowptr's element type so both operands of
// the atomic add have the same type.
485 using atomic_incr_type =
typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
486 const size_t num_packets_this_lid = num_packets_per_lid(i);
487 const size_t num_ent = num_packets_this_lid / 2;
488 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
// Scan over tgt_rowptr using Kokkos::parallel_scan. In the visible body,
// each entry is overwritten with the running `update` value and the same
// value is copied into new_start_row.
//
// NOTE(review): the embedded listing numbers skip lines (493 -> 495,
// 504 -> 507, 508 -> 510, 511 -> ...). The return type, the parallel_scan
// call head, the `final` guard and the statement that advances `update`
// (presumably `update += cur_val`) are not visible — confirm before relying
// on the exact scan semantics.
493 template<
class Device>
495 makeCrsRowPtrFromLengths(
496 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
497 const Kokkos::View<size_t*,Device>& new_start_row)
499 using Kokkos::parallel_scan;
500 using device_type = Device;
501 using execution_space =
typename device_type::execution_space;
502 using size_type =
typename Kokkos::View<size_t*,device_type>::size_type;
503 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
// The scan length is taken from new_start_row, not tgt_rowptr.
504 const size_type N = new_start_row.extent(0);
507 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Read the current value before it is overwritten.
508 auto cur_val = tgt_rowptr(i);
510 tgt_rowptr(i) = update;
511 new_start_row(i) = tgt_rowptr(i);
// Copy the first num_same_ids rows of local_graph into the target arrays.
// For these rows the source and target local row index are the same (both
// are `i`). Column indices are converted to global ids through
// local_col_map, and a source PID equal to my_pid is stored as -1.
// new_start_row(tgt_lid) is advanced by the row length.
//
// NOTE(review): the embedded listing numbers skip lines (518 -> 521,
// 529 -> 532, 536 -> 539, ...). The return type, the function name (taken
// from the call site later in this file), the `my_pid` parameter line, the
// parallel_for call head and the braces are not visible in this excerpt.
518 template<
class LocalGraph,
class LocalMap>
521 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
522 typename LocalMap::device_type>& tgt_colind,
523 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
524 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
525 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
526 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
527 const LocalGraph& local_graph,
528 const LocalMap& local_col_map,
529 const size_t num_same_ids,
532 using Kokkos::parallel_for;
533 using device_type =
typename LocalMap::device_type;
534 using LO =
typename LocalMap::local_ordinal_type;
535 using execution_space =
typename device_type::execution_space;
536 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
539 range_policy(0, num_same_ids),
540 KOKKOS_LAMBDA(
const size_t i) {
541 using atomic_incr_type =
typename std::remove_reference<decltype(new_start_row(0))>::type;
// Start of row i in the source graph and in the target arrays.
543 const LO src_lid =
static_cast<LO
>(i);
544 size_t src_row = local_graph.row_map(src_lid);
546 const LO tgt_lid =
static_cast<LO
>(i);
547 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the row's insertion point by the number of source entries.
549 const size_t nsr = local_graph.row_map(src_lid+1)
550 - local_graph.row_map(src_lid);
551 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row).
553 for (
size_t j=local_graph.row_map(src_lid);
554 j<local_graph.row_map(src_lid+1); ++j) {
555 LO src_col = local_graph.entries(j);
556 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
557 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copy permuted rows of local_graph into the target arrays: for each i, the
// source row permute_from_lids(i) is written into target row
// permute_to_lids(i). Column indices are converted to global ids through
// local_col_map, and a source PID equal to my_pid is stored as -1.
// new_start_row(tgt_lid) is advanced by the row length.
//
// NOTE(review): the embedded listing numbers skip lines (563 -> 565,
// 581 -> 584, 591 -> 594, ...). The return type, the `my_pid` parameter
// line, the parallel_for call head and the braces are not visible in this
// excerpt.
563 template<
class LocalGraph,
class LocalMap,
class BufferDevice>
565 copyDataFromPermuteIDs(
566 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
567 typename LocalMap::device_type>& tgt_colind,
568 const Kokkos::View<
int*,
569 typename LocalMap::device_type>& tgt_pids,
570 const Kokkos::View<
size_t*,
571 typename LocalMap::device_type>& new_start_row,
572 const Kokkos::View<
size_t*,
573 typename LocalMap::device_type>& tgt_rowptr,
574 const Kokkos::View<
const int*,
575 typename LocalMap::device_type>& src_pids,
576 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
577 BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
578 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
579 BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
580 const LocalGraph& local_graph,
581 const LocalMap& local_col_map,
584 using Kokkos::parallel_for;
585 using device_type =
typename LocalMap::device_type;
586 using LO =
typename LocalMap::local_ordinal_type;
587 using execution_space =
typename device_type::execution_space;
588 using size_type =
typename Kokkos::View<LO*,device_type>::size_type;
589 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
// The iteration count comes from permute_to_lids; permute_from_lids is
// indexed with the same i, so it must be at least as long.
591 const size_type num_permute_to_lids = permute_to_lids.extent(0);
594 range_policy(0, num_permute_to_lids),
595 KOKKOS_LAMBDA(
const size_t i) {
596 using atomic_incr_type =
typename std::remove_reference<decltype(new_start_row(0))>::type;
// Start of the source row and of the target row.
598 const LO src_lid = permute_from_lids(i);
599 const size_t src_row = local_graph.row_map(src_lid);
601 const LO tgt_lid = permute_to_lids(i);
602 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the target row's insertion point by the source row length.
604 size_t nsr = local_graph.row_map(src_lid+1)
605 - local_graph.row_map(src_lid);
606 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row).
608 for (
size_t j=local_graph.row_map(src_lid);
609 j<local_graph.row_map(src_lid+1); ++j) {
610 LO src_col = local_graph.entries(j);
611 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
612 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpack every imported row directly into tgt_colind / tgt_pids.
//
// For import i, a range of num_packets_per_lid(i) / 2 entries is reserved in
// target row import_lids(i) by atomically advancing new_start_row, and
// unpackRow writes the row's GIDs and PIDs into it. PIDs equal to my_pid are
// then replaced with -1. The unpackRow return values are summed into the
// reduction variable; std::invalid_argument is thrown when gbl_err_count is
// non-zero.
//
// NOTE(review): the embedded listing numbers skip lines (618 -> 620,
// 624 -> 626, 626 -> 628, 630 -> 636, 653 -> 657, 676 -> 680). The return
// type, the trailing parameters (the call site later in this file also
// passes a local graph, a local column map and my_pid), the declaration of
// `gbl_err_count`, the reducer argument and the braces are not visible.
618 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
620 unpackAndCombineIntoCrsArrays2(
621 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
622 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
623 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
624 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
626 const typename LocalMap::local_ordinal_type*,
628 Kokkos::MemoryUnmanaged>& import_lids,
629 const Kokkos::View<const Packet*, BufferDevice>& imports,
630 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
636 using Kokkos::subview;
637 using Kokkos::MemoryUnmanaged;
638 using Kokkos::parallel_reduce;
639 using Kokkos::atomic_fetch_add;
641 using device_type =
typename LocalMap::device_type;
642 using LO =
typename LocalMap::local_ordinal_type;
643 using GO =
typename LocalMap::global_ordinal_type;
644 using execution_space =
typename device_type::execution_space;
645 using size_type =
typename Kokkos::View<LO*, device_type>::size_type;
646 using slice =
typename Kokkos::pair<size_type, size_type>;
647 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
649 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
650 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
652 const size_type num_import_lids = import_lids.size();
653 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
657 parallel_reduce(
"Unpack and combine into CRS",
658 range_policy(0, num_import_lids),
659 KOKKOS_LAMBDA(
const size_t i,
int& err) {
660 using atomic_incr_type =
typename std::remove_reference< decltype( new_start_row(0) )>::type;
// This routine always assumes PIDs are packed: entries = packets / 2.
661 const size_t num_packets_this_lid = num_packets_per_lid(i);
662 const size_t num_ent = num_packets_this_lid / 2;
663 const size_t offset = offsets(i);
664 const LO lcl_row = import_lids(i);
// atomic_fetch_add returns the previous value, which becomes the start of
// the range reserved for this import.
665 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
666 const size_t end_row = start_row + num_ent;
// Unmanaged windows into the target arrays for this row's reserved range.
668 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
669 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
671 err +=
unpackRow (gids_out, pids_out, imports, offset, num_ent);
// Entries owned by this process are stored with PID -1.
674 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
675 const int pid = pids_out(j);
676 pids_out(j) = (pid != my_pid) ? pid : -1;
680 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
681 std::invalid_argument, prefix <<
682 "Attempting to unpack PIDs, but num_ent is not even; this should never "
683 "happen! Please report this bug to the Tpetra developers.");
// Impl-level driver that assembles the target CRS arrays (tgt_rowptr,
// tgt_colind, tgt_pids) from three sources: same-ID rows, permuted rows and
// imported rows.
//
// Visible sequence:
//   1. a loop over [0, N+1) whose body is not shown (presumably zeroing
//      tgt_rowptr — confirm),
//   2. store row lengths for same-ID and permuted rows in tgt_rowptr,
//   3. allocate import offsets (the fill is not shown) and, in debug builds,
//      check that the final offset equals imports.extent(0),
//   4. add import row lengths via setupRowPointersForRemotes,
//   5. turn lengths into row pointers via makeCrsRowPtrFromLengths and check
//      that entry N equals tgt_num_nonzeros,
//   6. copy same-ID rows, copy permuted rows, then unpack imports.
//
// Throws std::logic_error (debug offset check) or std::invalid_argument
// (row-pointer total mismatch).
//
// NOTE(review): the embedded listing numbers skip lines throughout
// (688 -> 691, 693 -> 695, 742 -> 751, 752 -> 759, 782 -> 785, ...). The
// return type, the function-name line, the parallel_for call heads, the
// closing #endif for HAVE_TPETRA_DEBUG and the braces are not visible.
688 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
691 const LocalGraph & local_graph,
692 const LocalMap & local_col_map,
693 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
695 Kokkos::MemoryUnmanaged>& import_lids,
696 const Kokkos::View<const Packet*, BufferDevice>& imports,
697 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
698 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
700 Kokkos::MemoryUnmanaged>& permute_to_lids,
701 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
703 Kokkos::MemoryUnmanaged>& permute_from_lids,
704 const Kokkos::View<
size_t*,
705 typename LocalMap::device_type,
706 Kokkos::MemoryUnmanaged>& tgt_rowptr,
707 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
708 typename LocalMap::device_type,
709 Kokkos::MemoryUnmanaged>& tgt_colind,
710 const Kokkos::View<
const int*,
711 typename LocalMap::device_type,
712 Kokkos::MemoryUnmanaged>& src_pids,
713 const Kokkos::View<
int*,
714 typename LocalMap::device_type,
715 Kokkos::MemoryUnmanaged>& tgt_pids,
716 const size_t num_same_ids,
717 const size_t tgt_num_rows,
718 const size_t tgt_num_nonzeros,
719 const int my_tgt_pid)
722 using Kokkos::subview;
723 using Kokkos::parallel_for;
724 using Kokkos::MemoryUnmanaged;
725 using packet_type = Packet;
726 using local_map_type = LocalMap;
727 using local_graph_type = LocalGraph;
728 using buffer_device_type = BufferDevice;
729 using device_type =
typename LocalMap::device_type;
730 using LO =
typename LocalMap::local_ordinal_type;
731 using execution_space =
typename device_type::execution_space;
732 using size_type =
typename Kokkos::View<LO*, device_type>::size_type;
733 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
735 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
737 const size_t N = tgt_num_rows;
738 const size_t mynnz = tgt_num_nonzeros;
742 const int my_pid = my_tgt_pid;
// Step 1: loop over all N+1 row-pointer slots (body not shown).
751 range_policy(0, N+1),
752 KOKKOS_LAMBDA(
const size_t i) {
// Step 2a: same-ID rows — store each row's length in tgt_rowptr.
759 range_policy(0, num_same_ids),
760 KOKKOS_LAMBDA(
const size_t i) {
761 const LO tgt_lid =
static_cast<LO
>(i);
762 const LO src_lid =
static_cast<LO
>(i);
763 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
764 - local_graph.row_map(src_lid);
// Step 2b: permuted rows — store the source row's length at the target row.
769 const size_type num_permute_to_lids = permute_to_lids.extent(0);
771 range_policy(0, num_permute_to_lids),
772 KOKKOS_LAMBDA(
const size_t i) {
773 const LO tgt_lid = permute_to_lids(i);
774 const LO src_lid = permute_from_lids(i);
775 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
776 - local_graph.row_map(src_lid);
// Step 3: offsets of each import's packets (one extra trailing slot).
781 const size_type num_import_lids = import_lids.extent(0);
782 View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Debug-only consistency check: the last offset must equal the buffer size.
785 #ifdef HAVE_TPETRA_DEBUG
787 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
788 const bool condition =
789 nth_offset_h !=
static_cast<size_t>(imports.extent(0));
790 TEUCHOS_TEST_FOR_EXCEPTION
791 (condition, std::logic_error, prefix
792 <<
"The final offset in bytes " << nth_offset_h
793 <<
" != imports.size() = " << imports.extent(0)
794 <<
". Please report this bug to the Tpetra developers.");
// Step 4: add the lengths contributed by imported rows.
799 setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
800 tgt_rowptr, import_lids, imports, num_packets_per_lid);
// Step 5: convert lengths to row pointers. new_start_row tracks the next
// free slot in each row while the rows are being filled.
804 View<size_t*, device_type> new_start_row(
"new_start_row", N+1);
807 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// The final row pointer must match the caller's expected entry count.
809 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
810 bool condition = nth_tgt_rowptr_h != mynnz;
811 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
812 prefix <<
"CRS_rowptr[last] = " <<
813 nth_tgt_rowptr_h <<
"!= mynnz = " << mynnz <<
".");
// Step 6: fill the target arrays from same-ID rows, then permuted rows.
817 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
818 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
820 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
821 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
822 local_graph, local_col_map, my_pid);
// Empty-import check; its body is not shown (presumably an early return).
// extent(0) is unsigned, so `<= 0` only matches zero.
824 if (imports.extent(0) <= 0) {
// Finally unpack the imported rows.
828 unpackAndCombineIntoCrsArrays2<
829 packet_type,local_graph_type,local_map_type,buffer_device_type>(
830 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
831 num_packets_per_lid, local_graph, local_col_map, my_pid);
// Host-facing entry point that unpacks imported rows into a CrsGraph. The
// visible error string shows the graph is required to be globally indexed.
//
// Visible flow: stage the host arrays (imports, numPacketsPerLID,
// importLIDs) as device views, allocate `indices`, `row_ptrs_beg` and
// `row_ptrs_end`, fill the end pointers, call the impl-level
// unpackAndCombine with unpack_pids = false, and refresh the per-row entry
// counts when they were used to build the end pointers.
//
// NOTE(review): the embedded listing numbers skip lines throughout
// (871 -> 876, 877 -> 884, 899 -> 906, 921 -> 924, 930 -> 933, 937 -> 945,
// 960 -> ...). The return type, the function name (taken from the
// instantiation macro at the end of this file), the graph/imports
// parameters, the helper calls that create the device views, the copies into
// the freshly allocated arrays, the branch condition and the body of the
// last loop are not visible in this excerpt.
871 template<
class LO,
class GO,
class Node>
876 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
877 const Teuchos::ArrayView<const LO>& importLIDs,
884 "Graph must be globally indexed!");
888 using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
890 using device_type =
typename Node::device_type;
891 using buffer_device_type =
typename graph_type::buffer_device_type;
892 using execution_space =
typename device_type::execution_space;
893 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
894 using row_ptrs_type =
typename graph_type::local_graph_type::row_map_type::non_const_type;
895 using indices_type =
typename graph_type::t_GlobalOrdinal_1D;
// Host arrays are wrapped/copied as views on the buffer device. The helper
// names are on lines not shown.
899 buffer_device_type bufferOutputDevice;
906 imports.getRawPtr(), imports.size(),
909 auto num_packets_per_lid_d =
911 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
912 true,
"num_packets_per_lid");
916 importLIDs.getRawPtr(), importLIDs.size(),
917 true,
"import_lids");
// New arrays sized like the graph's current 1-D storage.
921 indices_type indices(
"indices", graph.
k_gblInds1D_.extent(0));
924 row_ptrs_type row_ptrs_beg(
"row_ptrs_beg", graph.
k_rowPtrs_.extent(0));
// Number of rows: one less than the row-pointer length, or 0 if it is empty.
927 const size_t N = (row_ptrs_beg.extent(0) == 0 ? 0 : row_ptrs_beg.extent(0) - 1);
928 row_ptrs_type row_ptrs_end(
"row_ptrs_end", N);
930 bool refill_num_row_entries =
false;
// Branch A (condition not shown): end = begin + per-row entry count. The
// flag records that the counts must be refreshed afterwards.
933 refill_num_row_entries =
true;
935 Kokkos::parallel_for(
"Fill end row pointers", range_policy(0, N),
936 KOKKOS_LAMBDA(
const size_t i){
937 row_ptrs_end(i) = row_ptrs_beg(i) + num_row_entries(i);
// Branch B: each row ends where the next one begins.
945 Kokkos::parallel_for(
"Fill end row pointers",
946 range_policy(0, N), KOKKOS_LAMBDA(
const size_t i){
947 row_ptrs_end(i) = row_ptrs_beg(i+1);
// PIDs are not unpacked on this path (last argument is false).
952 unpackAndCombine<LO, GO, row_ptrs_type, indices_type, buffer_device_type>
953 (row_ptrs_beg, row_ptrs_end, indices, imports_d,
954 num_packets_per_lid_d, import_lids_d,
false);
// Refresh the per-row entry counts (loop body not shown).
958 if (refill_num_row_entries) {
959 Kokkos::parallel_for(
"Fill num entries",
960 range_policy(0, N), KOKKOS_LAMBDA(
const size_t i){
// DualView-based variant of unpackCrsGraphAndCombine.
//
// NOTE(review): the first visible statement throws std::logic_error
// ("METHOD NOT COMPLETE") unconditionally, so everything after it is
// unreachable as written.
//
// NOTE(review): the unpackAndCombine call at the bottom passes six template
// arguments, while the unpackAndCombine template shown earlier in this file
// declares five parameters (LocalOrdinal, Packet, RowView, IndicesView,
// BufferDevice). Reconcile this before enabling the function.
//
// NOTE(review): the embedded listing numbers skip lines (970 -> 972,
// 972 -> 976, 978 -> 984, 1002 -> 1007, ...). The return type, most
// parameters, the `*_nc` DualView declarations and the braces are not
// visible in this excerpt.
970 template<
class LO,
class GO,
class Node>
972 unpackCrsGraphAndCombineNew(
976 const Kokkos::DualView<
const size_t*,
978 const Kokkos::DualView<
const LO*,
984 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::logic_error,
"METHOD NOT COMPLETE");
986 using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
989 using device_type =
typename Node::device_type;
991 using packet_type =
typename graph_type::packet_type;
992 using local_graph_type =
typename graph_type::local_graph_type;
993 using buffer_device_type =
typename graph_type::buffer_device_type;
994 using buffer_memory_space =
typename buffer_device_type::memory_space;
995 using memory_space =
typename device_type::memory_space;
997 using row_ptrs_type =
typename graph_type::local_graph_type::row_map_type::non_const_type;
998 using execution_space =
typename device_type::execution_space;
999 using indices_type = Kokkos::View<GO*, execution_space>;
1001 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
1002 "Node::device_type and LocalGraph::device_type must be "
// Sync the packet counts to device, then take the device view.
1007 numPacketsPerLID_nc.sync_device ();
1009 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
// importLIDs is asserted to be in sync on device already.
1011 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1012 auto import_lids_d = importLIDs.view_device ();
// Sync the import buffer to device, then take the device view.
1016 imports_nc.sync_device ();
1018 auto imports_d = imports.view_device ();
// These views are default-constructed and nothing visible fills them before
// the call below.
1022 indices_type indices;
1023 row_ptrs_type row_ptrs_beg;
1024 row_ptrs_type row_ptrs_end;
1025 unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
1026 row_ptrs_beg, row_ptrs_end, indices, imports_d,
1027 num_packets_per_lid_d, import_lids_d,
false);
// Host-facing wrapper: validates its arguments, stages the host arrays as
// device views and forwards to an impl-level routine with
// (local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d,
// numSameIDs).
//
// Throws std::invalid_argument when
//   - permuteToLIDs and permuteFromLIDs differ in size,
//   - the source graph is not locally indexed,
//   - importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): the embedded listing numbers skip lines (1080 -> 1084,
// 1084 -> 1086, 1086 -> 1091, 1105 -> 1109, 1115 -> 1118, 1132 -> 1135).
// The return type, the function name (taken from the `prefix` string
// below), several parameters, the helper calls that build the device views,
// the callee's name and the return statement are not visible.
1080 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1084 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1086 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1091 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1092 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1094 using Kokkos::MemoryUnmanaged;
1096 using device_type =
typename Node::device_type;
1100 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The permute lists are used pairwise, so their lengths must agree.
1102 TEUCHOS_TEST_FOR_EXCEPTION
1103 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1104 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size() <<
" != "
1105 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
// `locallyIndexed` is computed on a line not shown.
1109 TEUCHOS_TEST_FOR_EXCEPTION
1110 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1111 "CrsGraph 'sourceGraph' must be locally indexed.");
// One packet count is expected per import LID.
1112 TEUCHOS_TEST_FOR_EXCEPTION
1113 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1114 prefix <<
"importLIDs.size() = " << importLIDs.size() <<
" != "
1115 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Stage the host arrays as device views (helper names not shown).
1118 auto permute_from_lids_d =
1120 permuteFromLIDs.getRawPtr(),
1121 permuteFromLIDs.size(),
true,
1122 "permute_from_lids");
1125 imports.getRawPtr(),
1126 imports.size(),
true,
1128 auto num_packets_per_lid_d =
1130 numPacketsPerLID.getRawPtr(),
1131 numPacketsPerLID.size(),
true,
1132 "num_packets_per_lid");
// Forward to the impl-level routine (its name is on a line not shown).
1135 packet_type,local_graph_type,buffer_device_type>(
1136 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
// Host-facing wrapper that fills CRS_rowptr, CRS_colind and TargetPids.
//
// Visible flow: validate sizes, reset TargetPids to TargetNumNonzeros
// entries of -1, stage every host array as a device view, call the
// impl-level routine, then wrap the caller's arrays in HostMirror views
// (presumably as deep_copy targets for the device results — the copies
// themselves are on lines not shown).
//
// Throws std::invalid_argument when
//   - CRS_rowptr.size() != TargetNumRows + 1,
//   - permuteToLIDs and permuteFromLIDs differ in size,
//   - importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): the embedded listing numbers skip lines throughout
// (1152 -> 1156, 1158 -> 1162, 1171 -> 1175, 1209 -> 1213, 1219 -> 1221,
// 1256 -> 1259, 1264 -> 1266, ...). The return type, the function name
// (taken from the `prefix` string below), several parameters (sourceGraph,
// imports, ...), the helper calls that create the device views, the callee
// name, the copy-back statements and the braces are not visible.
1152 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1156 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1158 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1162 const size_t numSameIDs,
1163 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1164 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1165 size_t TargetNumRows,
1166 size_t TargetNumNonzeros,
1167 const int MyTargetPID,
1168 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1169 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1170 const Teuchos::ArrayView<const int>& SourcePids,
1171 Teuchos::Array<int>& TargetPids)
1175 using Teuchos::outArg;
1176 using Teuchos::REDUCE_MAX;
1177 using Teuchos::reduceAll;
1178 using LO = LocalOrdinal;
1179 using GO = GlobalOrdinal;
1181 using packet_type =
typename crs_graph_type::packet_type;
1182 using local_graph_type =
typename crs_graph_type::local_graph_type;
1183 using buffer_device_type =
typename crs_graph_type::buffer_device_type;
1184 using device_type =
typename Node::device_type;
1185 using size_type =
typename Teuchos::ArrayView<const LO>::size_type;
1187 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
// The row-pointer array needs one entry per row plus a trailing entry.
1189 TEUCHOS_TEST_FOR_EXCEPTION(
1190 TargetNumRows + 1 !=
static_cast<size_t>(CRS_rowptr.size()),
1191 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1192 CRS_rowptr.size() <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
// The permute lists are used pairwise, so their lengths must agree.
1194 TEUCHOS_TEST_FOR_EXCEPTION(
1195 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1196 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size()
1197 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1198 const size_type numImportLIDs = importLIDs.size();
// One packet count is expected per import LID.
1200 TEUCHOS_TEST_FOR_EXCEPTION(
1201 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1202 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1203 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Make TargetPids hold exactly TargetNumNonzeros entries, all set to -1.
1206 if (
static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1207 TargetPids.resize(TargetNumNonzeros);
1209 TargetPids.assign(TargetNumNonzeros, -1);
1213 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
// Stage every host array as a device view. Inputs from the communication
// buffers use buffer_device_type; CRS arrays and PID arrays use device_type.
// The helper names are on lines not shown.
1216 device_type outputDevice;
1217 buffer_device_type bufferOutputDevice;
1219 Kokkos::View<const LO*, buffer_device_type> import_lids_d =
1221 (bufferOutputDevice, importLIDs.getRawPtr(),
1222 importLIDs.size(),
true,
"import_lids");
1224 Kokkos::View<const packet_type*, buffer_device_type> imports_d =
1226 (bufferOutputDevice, imports.getRawPtr(),
1227 imports.size(),
true,
"imports");
1229 Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1231 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1232 true,
"num_packets_per_lid");
1234 Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1236 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1237 true,
"permute_to_lids");
1239 Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1241 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1242 true,
"permute_from_lids");
1244 Kokkos::View<size_t*, device_type> crs_rowptr_d =
1246 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1247 true,
"crs_rowptr");
1249 Kokkos::View<GO*, device_type> crs_colind_d =
1251 CRS_colind.getRawPtr(), CRS_colind.size(),
1252 true,
"crs_colidx");
1254 Kokkos::View<const int*, device_type> src_pids_d =
1256 SourcePids.getRawPtr(), SourcePids.size(),
1259 Kokkos::View<int*, device_type> tgt_pids_d =
1261 TargetPids.getRawPtr(), TargetPids.size(),
// Forward to the impl-level routine (its name is on a line not shown).
1264 using local_map_type = decltype(local_col_map);
1266 packet_type,local_graph_type,local_map_type,buffer_device_type>(
1267 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1268 permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1269 tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
// Host views over the caller's raw output arrays.
1274 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1275 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1278 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1279 CRS_colind.getRawPtr(), CRS_colind.size());
1282 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1283 TargetPids.getRawPtr(), TargetPids.size());
// Explicit-instantiation macro for one (LO, GO, NT) combination. It names
// four Details:: function templates: unpackCrsGraphAndCombine,
// unpackCrsGraphAndCombineNew, unpackAndCombineIntoCrsArrays and
// unpackAndCombineWithOwningPIDsCount.
//
// NOTE(review): the embedded listing numbers skip lines inside the macro
// (1291 -> 1293, 1297 -> 1302, 1309 -> 1312, 1318 -> 1321, ...), so the
// `template <return type>` heads and some parameter types are not visible.
// No comments may be added inside the body because every line ends in a
// line continuation.
1291 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1293 Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
1294 CrsGraph<LO, GO, NT>&, \
1295 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1296 const Teuchos::ArrayView<const size_t>&, \
1297 const Teuchos::ArrayView<const LO>&, \
1302 Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
1303 CrsGraph<LO, GO, NT>&, \
1304 const Kokkos::DualView<const CrsGraph<LO, GO, NT>::packet_type*, \
1305 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1306 const Kokkos::DualView<const size_t*, \
1307 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1308 const Kokkos::DualView<const LO*, \
1309 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1312 const CombineMode); \
1314 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1315 const CrsGraph<LO, GO, NT> &, \
1316 const Teuchos::ArrayView<const LO>&, \
1317 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1318 const Teuchos::ArrayView<const size_t>&, \
1321 const CombineMode, \
1323 const Teuchos::ArrayView<const LO>&, \
1324 const Teuchos::ArrayView<const LO>&, \
1328 const Teuchos::ArrayView<size_t>&, \
1329 const Teuchos::ArrayView<GO>&, \
1330 const Teuchos::ArrayView<const int>&, \
1331 Teuchos::Array<int>&); \
1333 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1334 const CrsGraph<LO, GO, NT> &, \
1335 const Teuchos::ArrayView<const LO> &, \
1336 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1337 const Teuchos::ArrayView<const size_t>&, \
1342 const Teuchos::ArrayView<const LO>&, \
1343 const Teuchos::ArrayView<const LO>&);
Declaration of the Tpetra::CrsGraph class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void unpackAndCombine(const RowView &row_ptrs_beg, const RowView &row_ptrs_end, IndicesView &indices, const Kokkos::View< const Packet *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const LocalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &import_lids, const bool unpack_pids)
Perform the unpack operation for the graph.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, BufferDevice > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Set up row pointers for remotes.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
local_graph_type::row_map_type::const_type k_rowPtrs_
Row offsets for "1-D" storage.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
t_GlobalOrdinal_1D k_gblInds1D_
Global column indices for all rows.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
local_graph_type getLocalGraph() const
Get the local graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
Unpacks and combines a single row of the CrsGraph.
Sets up and executes a communication plan for a Tpetra DistObject.
Implementation details of Tpetra.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t, Distributor &, CombineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void unpackCrsGraphAndCombine(CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and combine into graph.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LocalOrdinal, GlobalOrdinal, Node > &sourceGraph, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t, Distributor &, const CombineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GlobalOrdinal > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.