Tpetra parallel linear algebra  Version of the Day
Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTRIBUTOR_HPP
43 #define TPETRA_DISTRIBUTOR_HPP
44 
45 #include "Tpetra_Util.hpp"
46 #include <Teuchos_as.hpp>
47 #include <Teuchos_Describable.hpp>
48 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
49 #include <Teuchos_VerboseObject.hpp>
51 
52 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
53 // doPosts (both versions) and doWaits, and register those timers with
54 // Teuchos::TimeMonitor so that summarize() or report() will show
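55 // results. NOTE: the #ifdef/#undef block below currently forces the
// macro off, so these timers are disabled even if it was defined earlier.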
56 
57 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
58 // # define TPETRA_DISTRIBUTOR_TIMERS 1
59 // #endif // TPETRA_DISTRIBUTOR_TIMERS
60 
61 #ifdef TPETRA_DISTRIBUTOR_TIMERS
62 # undef TPETRA_DISTRIBUTOR_TIMERS
63 #endif // TPETRA_DISTRIBUTOR_TIMERS
64 
65 #include "KokkosCompat_View.hpp"
66 #include "Kokkos_Core.hpp"
67 #include "Kokkos_TeuchosCommAdapters.hpp"
68 #include <memory>
69 #include <sstream>
70 #include <type_traits>
71 
72 namespace Tpetra {
73 
// Implementation details of Distributor: the enumerations that select the
// send primitive and record how a Distributor was initialized.
// NOTE(review): this listing dropped the lines that open each enum and the
// name/parameter lines of the two std::string-returning declarations; only
// what is still visible is described here.
74  namespace Details {
// Enumerators selecting which send primitive doPosts() uses.  Elsewhere in
// this file they are compared against a Details::EDistributorSendType value.
80  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
81  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
82  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
83  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
84  };
85 
// NOTE(review): return type of a declaration whose name and parameters are
// missing from this listing; presumably the send-type-to-string converter --
// confirm against the real header.
90  std::string
92 
// Enumerators recording which code path initialized a Distributor.  The
// Distributor member howInitialized_ (type
// Details::EDistributorHowInitialized) is compared against these.
98  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
99  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
100  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
101  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS, // By createFromSendsAndRecvs
102  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
103  DISTRIBUTOR_INITIALIZED_BY_COPY, // By copy constructor
104  };
105 
// NOTE(review): return type of a second declaration whose name and
// parameters are missing from this listing; presumably the
// how-initialized-to-string converter -- confirm against the real header.
110  std::string
112 
113  } // namespace Details
114 
// Returns an array of strings.  NOTE(review): presumably the valid names of
// the Distributor send types (see the Details enumerators above) -- the
// definition is not in this file, so confirm before relying on that.
121  Teuchos::Array<std::string> distributorSendTypes ();
122 
// Distributor: holds a communication plan over a Teuchos::Comm<int> (which
// processes to send to / receive from, and how much) and executes it.  A plan
// is built by one of the createFrom*() methods and executed with the
// doPosts()/doWaits() family, or the combined doPostsAndWaits(), in forward
// or reverse direction.
//
// NOTE(review): this listing has lost the original Doxygen comments and a few
// declaration lines.  The comments below describe only what the remaining
// declarations, and the template definitions later in this file, show.
190  class Distributor :
191  public Teuchos::Describable,
192  public Teuchos::ParameterListAcceptorDefaultBase {
193  public:
195 
196 
// ---- Constructors, destructor, swap ------------------------------------
// Every constructor takes the communicator.  The overloads optionally add an
// output stream ('out') and/or a parameter list ('plist').
205  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
206 
218  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
219  const Teuchos::RCP<Teuchos::FancyOStream>& out);
220 
234  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
235  const Teuchos::RCP<Teuchos::ParameterList>& plist);
236 
253  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
254  const Teuchos::RCP<Teuchos::FancyOStream>& out,
255  const Teuchos::RCP<Teuchos::ParameterList>& plist);
256 
// Copy constructor.
258  Distributor (const Distributor& distributor);
259 
// Virtual because the class is used polymorphically via its base classes.
264  virtual ~Distributor () = default;
265 
// Exchanges state with rhs.
271  void swap (Distributor& rhs);
272 
274 
276 
// ---- ParameterListAcceptor interface -----------------------------------
// doPosts() reads its run-time options (send type, barrier) from members
// that its comments say are set via setParameterList().
281  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
282 
287  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
288 
290 
292 
// ---- Plan construction -------------------------------------------------
// Build the plan from the list of destination process ranks.
// NOTE(review): the meaning of the returned size_t is not visible here;
// presumably the number of entries this process will receive -- confirm.
312  size_t createFromSends (const Teuchos::ArrayView<const int>& exportProcIDs);
313 
// Build the plan from the receive side: inputs are the remote IDs and the
// ranks that own them; exportIDs / exportProcIDs are output arrays.
347  template <class Ordinal>
348  void
349  createFromRecvs (const Teuchos::ArrayView<const Ordinal>& remoteIDs,
350  const Teuchos::ArrayView<const int>& remoteProcIDs,
351  Teuchos::Array<Ordinal>& exportIDs,
352  Teuchos::Array<int>& exportProcIDs);
353 
// Build the plan when both the send-side and receive-side ranks are known.
361  void
362  createFromSendsAndRecvs (const Teuchos::ArrayView<const int>& exportProcIDs,
363  const Teuchos::ArrayView<const int>& remoteProcIDs);
364 
366 
368 
// ---- Read-only queries on the current plan -----------------------------
372  size_t getNumReceives() const;
373 
377  size_t getNumSends() const;
378 
380  bool hasSelfMessage() const;
381 
383  size_t getMaxSendLength() const;
384 
386  size_t getTotalReceiveLength() const;
387 
392  Teuchos::ArrayView<const int> getProcsFrom() const;
393 
398  Teuchos::ArrayView<const int> getProcsTo() const;
399 
407  Teuchos::ArrayView<const size_t> getLengthsFrom() const;
408 
416  Teuchos::ArrayView<const size_t> getLengthsTo() const;
417 
// NOTE(review): the signature line of this inline accessor is missing from
// the listing; the surviving body returns howInitialized_.
423  return howInitialized_;
424  }
425 
427 
429 
// ---- Reverse plan --------------------------------------------------------
// Returns a Distributor for the reverse direction.  NOTE(review): presumably
// backed by the cached reverseDistributor_ member -- confirm in the .cpp.
440  Teuchos::RCP<Distributor> getReverse() const;
441 
443 
445 
// ---- Execution with Teuchos arrays ---------------------------------------
// doPostsAndWaits(): blocking exchange.  As defined later in this file, each
// overload wraps the ArrayViews in non-owning ArrayRCPs, calls the matching
// doPosts() overload, then doWaits().  It throws std::runtime_error if
// nonblocking requests are still outstanding on entry.
//
// Overload with a fixed number of packets per entry.
466  template <class Packet>
467  void
468  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
469  size_t numPackets,
470  const Teuchos::ArrayView<Packet> &imports);
471 
// Overload with a per-LID packet count on both the export and import side.
493  template <class Packet>
494  void
495  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
496  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
497  const Teuchos::ArrayView<Packet> &imports,
498  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
499 
// doPosts(): posts the receives and performs the sends.  It takes ArrayRCPs
// because the buffers must persist until doWaits() completes (see the
// comments in doPostsAndWaits()).
//
// Overload with a fixed number of packets per entry.
524  template <class Packet>
525  void
526  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
527  size_t numPackets,
528  const Teuchos::ArrayRCP<Packet> &imports);
529 
// Overload with a per-LID packet count on both sides.
548  template <class Packet>
549  void
550  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
551  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
552  const Teuchos::ArrayRCP<Packet> &imports,
553  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
554 
// Completes the communication started by doPosts().
561  void doWaits ();
562 
// Reverse-mode counterparts of the four methods above, with the same
// parameter lists.
567  template <class Packet>
568  void
569  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
570  size_t numPackets,
571  const Teuchos::ArrayView<Packet> &imports);
572 
577  template <class Packet>
578  void
579  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
580  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
581  const Teuchos::ArrayView<Packet> &imports,
582  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
583 
588  template <class Packet>
589  void
590  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
591  size_t numPackets,
592  const Teuchos::ArrayRCP<Packet> &imports);
593 
598  template <class Packet>
599  void
600  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
601  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
602  const Teuchos::ArrayRCP<Packet> &imports,
603  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
604 
// Completes the communication started by doReversePosts().
611  void doReverseWaits ();
612 
// ---- Execution with Kokkos::View buffers ---------------------------------
// Each overload below takes part in overload resolution only when both
// ExpView and ImpView satisfy Kokkos::Impl::is_view; the return type is void.
//
// NOTE(review): the function-name line of this first overload is missing
// from the listing.  Its parameters match the fixed-numPackets form;
// presumably it is doPostsAndWaits -- confirm against the real header.
633  template <class ExpView, class ImpView>
634  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
636  const ExpView &exports,
637  size_t numPackets,
638  const ImpView &imports);
639 
661  template <class ExpView, class ImpView>
662  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
663  doPostsAndWaits (const ExpView &exports,
664  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
665  const ImpView &imports,
666  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
667 
692  template <class ExpView, class ImpView>
693  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
694  doPosts (const ExpView &exports,
695  size_t numPackets,
696  const ImpView &imports);
697 
716  template <class ExpView, class ImpView>
717  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
718  doPosts (const ExpView &exports,
719  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
720  const ImpView &imports,
721  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
722 
727  template <class ExpView, class ImpView>
728  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
729  doReversePostsAndWaits (const ExpView &exports,
730  size_t numPackets,
731  const ImpView &imports);
732 
737  template <class ExpView, class ImpView>
738  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
739  doReversePostsAndWaits (const ExpView &exports,
740  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
741  const ImpView &imports,
742  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
743 
748  template <class ExpView, class ImpView>
749  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
750  doReversePosts (const ExpView &exports,
751  size_t numPackets,
752  const ImpView &imports);
753 
758  template <class ExpView, class ImpView>
759  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
760  doReversePosts (const ExpView &exports,
761  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
762  const ImpView &imports,
763  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
764 
// Copies the byte counts recorded by the most recent doPostsAndWaits() call
// (exports.size()*sizeof(Packet) and imports.size()*sizeof(Packet)) into the
// two output arguments.
768  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{
769  bytes_sent = lastRoundBytesSend_;
770  bytes_recvd = lastRoundBytesRecv_;
771  }
772 
774 
776 
// ---- Teuchos::Describable interface --------------------------------------
778  std::string description() const;
779 
801  void
802  describe (Teuchos::FancyOStream& out,
803  const Teuchos::EVerbosityLevel verbLevel =
804  Teuchos::Describable::verbLevel_default) const;
806 
807  private:
// Communicator: doPosts() uses it for the rank, the barrier, and every
// send/receive.
809  Teuchos::RCP<const Teuchos::Comm<int> > comm_;
810 
// Stream that receives the verbose progress messages.
812  Teuchos::RCP<Teuchos::FancyOStream> out_;
813 
// Which creation path initialized this object.  doPosts() skips its
// imports-length check when this is DISTRIBUTOR_INITIALIZED_BY_REVERSE.
815  Details::EDistributorHowInitialized howInitialized_;
816 
// NOTE(review): doPosts() reads a member named sendType_ (of type
// Details::EDistributorSendType), but its declaration is not visible in
// this listing; presumably it was on one of the dropped lines here.
818 
819 
822 
// Whether doPosts() executes a barrier between posting receives and sends.
// Ready sends (DISTRIBUTOR_RSEND) require this to be true.
824  bool barrierBetween_;
825 
// Whether doPosts() prints progress messages to out_.
827  bool verbose_;
829 
// Whether this process sends a message to itself.  doPosts() implements the
// self message as a local copy from exports to imports.
833  bool selfMessage_;
834 
// Number of sends, not counting the self message (doPosts() adds
// selfMessage_ to it to get the number of blocks to walk).
844  size_t numSends_;
845 
// Destination rank of each send block.
850  Teuchos::Array<int> procsTo_;
851 
// Start of each send block: an offset into exports on the fast path, and an
// offset into indicesTo_ on the slow path (both as used by doPosts()).
860  Teuchos::Array<size_t> startsTo_;
861 
// Length of each send block, in entries (doPosts() multiplies by numPackets).
867  Teuchos::Array<size_t> lengthsTo_;
868 
// Sizes the temporary send buffer on the slow path
// (maxSendLength_ * numPackets entries).
872  size_t maxSendLength_;
873 
// Empty when exports is already laid out by destination process (fast
// path); otherwise holds per-entry indices into exports that doPosts() uses
// to pack the send buffer (slow path).
889  Teuchos::Array<size_t> indicesTo_;
890 
// Number of receives, not counting the self message (see doPosts()).
900  size_t numReceives_;
901 
// Total number of entries to receive; doPosts() requires
// imports.size() >= totalReceiveLength_ * numPackets in forward mode.
908  size_t totalReceiveLength_;
909 
// Length, in entries, of each incoming block.
915  Teuchos::Array<size_t> lengthsFrom_;
916 
// Source rank of each incoming block.
922  Teuchos::Array<int> procsFrom_;
923 
// NOTE(review): not used by the code visible in this file; presumably the
// receive-side analogues of startsTo_ / indicesTo_ -- confirm.
929  Teuchos::Array<size_t> startsFrom_;
930 
936  Teuchos::Array<size_t> indicesFrom_;
937 
// Outstanding nonblocking requests.  Its size() is treated as the number of
// pending requests, so it must be empty when no communication is in flight.
944  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > requests_;
945 
// Mutable so that const methods can set it.  NOTE(review): presumably a
// cache filled by createReverseDistributor() -- confirm.
950  mutable Teuchos::RCP<Distributor> reverseDistributor_;
951 
// Byte counts recorded by the last doPostsAndWaits(); see
// getLastDoStatistics().
953  size_t lastRoundBytesSend_;
954 
956  size_t lastRoundBytesRecv_;
957 
// Timers exist only when TPETRA_DISTRIBUTOR_TIMERS is defined.  The "3" and
// "4" suffixes match the 3-argument and 4-argument doPosts() overloads.
958 #ifdef TPETRA_DISTRIBUTOR_TIMERS
959  Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
960  Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
961  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
962  Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
963  Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
964  Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
965  Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
966  Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
967  Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
968 
970  void makeTimers ();
971 #endif // TPETRA_DISTRIBUTOR_TIMERS
972 
// NOTE(review): presumably controls whether getTag() returns a distinct tag
// per code path -- the definition of getTag() is not in this file.
984  bool useDistinctTags_;
985 
// Maps a "path tag" to the message tag used by doPosts().
990  int getTag (const int pathTag) const;
991 
// Internal helpers defined outside this file.  NOTE(review): by their names
// they compute the receive side / send side of the plan and build the
// reverse Distributor -- confirm in the .cpp.
1002  void computeReceives ();
1003 
1016  template <class Ordinal>
1017  void computeSends (const Teuchos::ArrayView<const Ordinal> &remoteGIDs,
1018  const Teuchos::ArrayView<const int> &remoteProcIDs,
1019  Teuchos::Array<Ordinal> &exportGIDs,
1020  Teuchos::Array<int> &exportProcIDs);
1021 
1023  void createReverseDistributor() const;
1024 
1025 
// Returns a string for the given verbosity level.  NOTE(review): presumably
// this process's part of describe() -- confirm.
1030  std::string
1031  localDescribeToString (const Teuchos::EVerbosityLevel vl) const;
1032  }; // class Distributor
1033 
1034 
1035  template <class Packet>
1037  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1038  size_t numPackets,
1039  const Teuchos::ArrayView<Packet>& imports)
1040  {
1041  using Teuchos::arcp;
1042  using Teuchos::ArrayRCP;
1043  typedef typename ArrayRCP<const Packet>::size_type size_type;
1044 
1045  TEUCHOS_TEST_FOR_EXCEPTION(
1046  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1047  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1048  " outstanding nonblocking messages pending. It is incorrect to call "
1049  "this method with posts outstanding.");
1050 
1051  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1052  // requiring that the memory location is persisting (as is
1053  // necessary for nonblocking receives). However, it need only
1054  // persist until doWaits() completes, so it is safe for us to use
1055  // a nonpersisting reference in this case. The use of a
1056  // nonpersisting reference is purely a performance optimization.
1057 
1058  //const Packet* exportsPtr = exports.getRawPtr();
1059  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1060  // exports.size(), false);
1061  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1062  static_cast<size_type> (0),
1063  exports.size(), false);
1064 
1065  // For some reason, neither of the options below (that use arcp)
1066  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1067  // issue only arises with the exports array. This is why we
1068  // construct a separate nonowning ArrayRCP.
1069 
1070  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1071  // numPackets,
1072  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1073  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1074  // numPackets,
1075  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1076  doPosts (exportsArcp,
1077  numPackets,
1078  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1079  doWaits ();
1080 
1081  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1082  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1083  }
1084 
1085  template <class Packet>
1087  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1088  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1089  const Teuchos::ArrayView<Packet> &imports,
1090  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1091  {
1092  using Teuchos::arcp;
1093  using Teuchos::ArrayRCP;
1094 
1095  TEUCHOS_TEST_FOR_EXCEPTION(
1096  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1097  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1098  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1099  "with posts outstanding.");
1100 
1101  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1102  // requiring that the memory location is persisting (as is
1103  // necessary for nonblocking receives). However, it need only
1104  // persist until doWaits() completes, so it is safe for us to use
1105  // a nonpersisting reference in this case.
1106 
1107  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1108  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1109  // with some versions of GCC. The issue only arises with the
1110  // exports array. This is why we construct a separate nonowning
1111  // ArrayRCP.
1112  typedef typename ArrayRCP<const Packet>::size_type size_type;
1113  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1114  static_cast<size_type> (0),
1115  exports.size (), false);
1116  // mfh 04 Apr 2012: This is the offending code. This statement
1117  // would normally be in place of "exportsArcp" in the
1118  // doPosts() call below.
1119  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1120  doPosts (exportsArcp,
1121  numExportPacketsPerLID,
1122  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1123  numImportPacketsPerLID);
1124  doWaits ();
1125 
1126  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1127  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1128  }
1129 
1130 
1131  template <class Packet>
1133  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1134  size_t numPackets,
1135  const Teuchos::ArrayRCP<Packet>& imports)
1136  {
1137  using Teuchos::Array;
1138  using Teuchos::ArrayRCP;
1139  using Teuchos::ArrayView;
1140  using Teuchos::as;
1141  using Teuchos::FancyOStream;
1142  using Teuchos::includesVerbLevel;
1143  using Teuchos::ireceive;
1144  using Teuchos::isend;
1145  using Teuchos::OSTab;
1146  using Teuchos::readySend;
1147  using Teuchos::send;
1148  using Teuchos::ssend;
1149  using Teuchos::TypeNameTraits;
1150  using Teuchos::typeName;
1151  using std::endl;
1152  typedef Array<size_t>::size_type size_type;
1153 
1154 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1155  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
1156 #endif // TPETRA_DISTRIBUTOR_TIMERS
1157 
1158  const int myRank = comm_->getRank ();
1159  // Run-time configurable parameters that come from the input
1160  // ParameterList set by setParameterList().
1161  const Details::EDistributorSendType sendType = sendType_;
1162  const bool doBarrier = barrierBetween_;
1163 
1164  Teuchos::OSTab tab0 (out_);
1165  std::unique_ptr<std::string> prefix;
1166  if (verbose_) {
1167  std::ostringstream os;
1168  os << "Proc " << myRank << ": Distributor::doPosts(3-arg, ArrayRCP): ";
1169  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1170  os << endl;
1171  *out_ << os.str ();
1172  }
1173  Teuchos::OSTab tab1 (out_);
1174 
1175  TEUCHOS_TEST_FOR_EXCEPTION(
1176  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1177  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Ready-send "
1178  "version requires a barrier between posting receives and posting ready "
1179  "sends. This should have been checked before. "
1180  "Please report this bug to the Tpetra developers.");
1181 
1182  size_t selfReceiveOffset = 0;
1183 
1184  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
1185  // check whether we're doing reverse mode before checking the
1186  // length of the imports array.
1187  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
1188  // Each message has the same number of packets.
1189  //
1190  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1191  // inequality to a less-than seems to have fixed Bug 6170. It's
1192  // OK for the 'imports' array to be longer than it needs to be;
1193  // I'm just curious why it would be.
1194  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1195  TEUCHOS_TEST_FOR_EXCEPTION
1196  (static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1197  std::invalid_argument,
1198  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1199  "The 'imports' array must have enough entries to hold the expected number "
1200  "of import packets. imports.size() = " << imports.size () << " < "
1201  "totalNumImportPackets = " << totalNumImportPackets << ".");
1202  }
1203 
1204  // MPI tag for nonblocking receives and blocking sends in this
1205  // method. Some processes might take the "fast" path
1206  // (indicesTo_.empty()) and others might take the "slow" path for
1207  // the same doPosts() call, so the path tag must be the same for
1208  // both.
1209  const int pathTag = 0;
1210  const int tag = this->getTag (pathTag);
1211 
1212 #ifdef HAVE_TPETRA_DEBUG
1213  TEUCHOS_TEST_FOR_EXCEPTION
1214  (requests_.size () != 0,
1215  std::logic_error,
1216  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Process "
1217  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
1218 #endif // HAVE_TPETRA_DEBUG
1219 
1220  // Distributor uses requests_.size() as the number of outstanding
1221  // nonblocking message requests, so we resize to zero to maintain
1222  // this invariant.
1223  //
1224  // numReceives_ does _not_ include the self message, if there is
1225  // one. Here, we do actually send a message to ourselves, so we
1226  // include any self message in the "actual" number of receives to
1227  // post.
1228  //
1229  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1230  // doesn't (re)allocate its array of requests. That happens in
1231  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1232  // demand), or Resize_().
1233  const size_type actualNumReceives = as<size_type> (numReceives_) +
1234  as<size_type> (selfMessage_ ? 1 : 0);
1235  requests_.resize (0);
1236 
1237  if (verbose_) {
1238  std::ostringstream os;
1239  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1240  << ": Post receives" << endl;
1241  *out_ << os.str ();
1242  }
1243 
1244  // Post the nonblocking receives. It's common MPI wisdom to post
1245  // receives before sends. In MPI terms, this means favoring
1246  // adding to the "posted queue" (of receive requests) over adding
1247  // to the "unexpected queue" (of arrived messages not yet matched
1248  // with a receive).
1249  {
1250 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1251  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
1252 #endif // TPETRA_DISTRIBUTOR_TIMERS
1253 
1254  size_t curBufOffset = 0;
1255  for (size_type i = 0; i < actualNumReceives; ++i) {
1256  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1257  if (procsFrom_[i] != myRank) {
1258  if (verbose_) {
1259  std::ostringstream os;
1260  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1261  << ": Post irecv: {source: " << procsFrom_[i]
1262  << ", tag: " << tag << "}" << endl;
1263  *out_ << os.str ();
1264  }
1265  // If my process is receiving these packet(s) from another
1266  // process (not a self-receive):
1267  //
1268  // 1. Set up the persisting view (recvBuf) of the imports
1269  // array, given the offset and size (total number of
1270  // packets from process procsFrom_[i]).
1271  // 2. Start the Irecv and save the resulting request.
1272  TEUCHOS_TEST_FOR_EXCEPTION(
1273  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1274  std::logic_error,
1275  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1276  "Exceeded size of 'imports' array in packing loop on Process " <<
1277  myRank << ". imports.size() = " << imports.size () << " < "
1278  "curBufOffset(" << curBufOffset << ") + curBufLen(" << curBufLen
1279  << ").");
1280  ArrayRCP<Packet> recvBuf =
1281  imports.persistingView (curBufOffset, curBufLen);
1282  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1283  tag, *comm_));
1284  }
1285  else { // Receiving from myself
1286  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1287  }
1288  curBufOffset += curBufLen;
1289  }
1290  }
1291 
1292  if (doBarrier) {
1293 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1294  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
1295 #endif // TPETRA_DISTRIBUTOR_TIMERS
1296 
1297  if (verbose_) {
1298  std::ostringstream os;
1299  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1300  << ": Barrier" << endl;
1301  *out_ << os.str ();
1302  }
1303  // If we are using ready sends (MPI_Rsend) below, we need to do
1304  // a barrier before we post the ready sends. This is because a
1305  // ready send requires that its matching receive has already
1306  // been posted before the send has been posted. The only way to
1307  // guarantee that in this case is to use a barrier.
1308  comm_->barrier ();
1309  }
1310 
1311 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1312  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
1313 #endif // TPETRA_DISTRIBUTOR_TIMERS
1314 
1315  // setup scan through procsTo_ list starting with higher numbered procs
1316  // (should help balance message traffic)
1317  //
1318  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1319  // It doesn't depend on the input at all.
1320  size_t numBlocks = numSends_ + selfMessage_;
1321  size_t procIndex = 0;
1322  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
1323  ++procIndex;
1324  }
1325  if (procIndex == numBlocks) {
1326  procIndex = 0;
1327  }
1328 
1329  size_t selfNum = 0;
1330  size_t selfIndex = 0;
1331 
1332  if (verbose_) {
1333  std::ostringstream os;
1334  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1335  << ": Post sends" << endl;
1336  *out_ << os.str ();
1337  }
1338 
1339  if (indicesTo_.empty ()) {
1340  // Data are already blocked (laid out) by process, so we don't
1341  // need a separate send buffer (besides the exports array).
1342  for (size_t i = 0; i < numBlocks; ++i) {
1343  size_t p = i + procIndex;
1344  if (p > (numBlocks - 1)) {
1345  p -= numBlocks;
1346  }
1347 
1348  if (procsTo_[p] != myRank) {
1349  if (verbose_) {
1350  std::ostringstream os;
1351  os << *prefix << ": Post send: {target: "
1352  << procsTo_[p] << ", tag: " << tag << "}" << endl;
1353  *out_ << os.str ();
1354  }
1355 
1356  ArrayView<const Packet> tmpSend =
1357  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1358 
1359  if (sendType == Details::DISTRIBUTOR_SEND) {
1360  send<int, Packet> (tmpSend.getRawPtr (),
1361  as<int> (tmpSend.size ()),
1362  procsTo_[p], tag, *comm_);
1363  }
1364  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1365  ArrayRCP<const Packet> tmpSendBuf =
1366  exports.persistingView (startsTo_[p] * numPackets,
1367  lengthsTo_[p] * numPackets);
1368  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1369  tag, *comm_));
1370  }
1371  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1372  readySend<int, Packet> (tmpSend.getRawPtr (),
1373  as<int> (tmpSend.size ()),
1374  procsTo_[p], tag, *comm_);
1375  }
1376  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1377  ssend<int, Packet> (tmpSend.getRawPtr (),
1378  as<int> (tmpSend.size ()),
1379  procsTo_[p], tag, *comm_);
1380  } else {
1381  TEUCHOS_TEST_FOR_EXCEPTION(
1382  true, std::logic_error,
1383  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1384  "Invalid send type. We should never get here. "
1385  "Please report this bug to the Tpetra developers.");
1386  }
1387  }
1388  else { // "Sending" the message to myself
1389  selfNum = p;
1390  }
1391  }
1392 
1393  if (selfMessage_) {
1394  if (verbose_) {
1395  std::ostringstream os;
1396  os << *prefix << "Fast: Self-send" << endl;
1397  *out_ << os.str ();
1398  }
1399  // This is how we "send a message to ourself": we copy from
1400  // the export buffer to the import buffer. That saves
1401  // Teuchos::Comm implementations other than MpiComm (in
1402  // particular, SerialComm) the trouble of implementing self
1403  // messages correctly. (To do this right, SerialComm would
1404  // need internal buffer space for messages, keyed on the
1405  // message's tag.)
1406  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1407  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1408  imports.begin()+selfReceiveOffset);
1409  }
1410  }
1411  else { // data are not blocked by proc, use send buffer
1412  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1413  // sends), because the buffer is only long enough for one send.
1414  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1415 
1416  TEUCHOS_TEST_FOR_EXCEPTION(
1417  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1418  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1419  "The \"send buffer\" code path doesn't currently work with "
1420  "nonblocking sends.");
1421 
1422  for (size_t i = 0; i < numBlocks; ++i) {
1423  size_t p = i + procIndex;
1424  if (p > (numBlocks - 1)) {
1425  p -= numBlocks;
1426  }
1427 
1428  if (procsTo_[p] != myRank) {
1429  if (verbose_) {
1430  std::ostringstream os;
1431  os << *prefix << "Slow: Post send: "
1432  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1433  *out_ << os.str ();
1434  }
1435 
1436  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1437  size_t sendArrayOffset = 0;
1438  size_t j = startsTo_[p];
1439  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1440  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1441  srcEnd = srcBegin + numPackets;
1442  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1443  sendArrayOffset += numPackets;
1444  }
1445  ArrayView<const Packet> tmpSend =
1446  sendArray.view (0, lengthsTo_[p]*numPackets);
1447 
1448  if (sendType == Details::DISTRIBUTOR_SEND) {
1449  send<int, Packet> (tmpSend.getRawPtr (),
1450  as<int> (tmpSend.size ()),
1451  procsTo_[p], tag, *comm_);
1452  }
1453  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1454  ArrayRCP<const Packet> tmpSendBuf =
1455  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1456  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1457  tag, *comm_));
1458  }
1459  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1460  readySend<int, Packet> (tmpSend.getRawPtr (),
1461  as<int> (tmpSend.size ()),
1462  procsTo_[p], tag, *comm_);
1463  }
1464  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1465  ssend<int, Packet> (tmpSend.getRawPtr (),
1466  as<int> (tmpSend.size ()),
1467  procsTo_[p], tag, *comm_);
1468  }
1469  else {
1470  TEUCHOS_TEST_FOR_EXCEPTION(
1471  true, std::logic_error,
1472  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1473  "Invalid send type. We should never get here. "
1474  "Please report this bug to the Tpetra developers.");
1475  }
1476  }
1477  else { // "Sending" the message to myself
1478  selfNum = p;
1479  selfIndex = startsTo_[p];
1480  }
1481  }
1482 
1483  if (selfMessage_) {
1484  if (verbose_) {
1485  std::ostringstream os;
1486  os << *prefix << "Slow: Self-send" << endl;
1487  *out_ << os.str ();
1488  }
1489  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1490  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1491  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1492  imports.begin() + selfReceiveOffset);
1493  ++selfIndex;
1494  selfReceiveOffset += numPackets;
1495  }
1496  }
1497  }
1498 
1499  if (verbose_) {
1500  std::ostringstream os;
1501  os << *prefix << "Done!" << endl;
1502  *out_ << os.str ();
1503  }
1504  }
1505 
1506  template <class Packet>
1508  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1509  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1510  const Teuchos::ArrayRCP<Packet>& imports,
1511  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1512  {
1513  using Teuchos::Array;
1514  using Teuchos::ArrayRCP;
1515  using Teuchos::ArrayView;
1516  using Teuchos::as;
1517  using Teuchos::ireceive;
1518  using Teuchos::isend;
1519  using Teuchos::readySend;
1520  using Teuchos::send;
1521  using Teuchos::ssend;
1522  using Teuchos::TypeNameTraits;
1523 #ifdef HAVE_TEUCHOS_DEBUG
1524  using Teuchos::OSTab;
1525 #endif // HAVE_TEUCHOS_DEBUG
1526  using std::endl;
1527  typedef Array<size_t>::size_type size_type;
1528 
1529  Teuchos::OSTab tab (out_);
1530 
1531 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1532  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
1533 #endif // TPETRA_DISTRIBUTOR_TIMERS
1534 
1535  // Run-time configurable parameters that come from the input
1536  // ParameterList set by setParameterList().
1537  const Details::EDistributorSendType sendType = sendType_;
1538  const bool doBarrier = barrierBetween_;
1539 
1540 // #ifdef HAVE_TEUCHOS_DEBUG
1541 // // Prepare for verbose output, if applicable.
1542 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1543 // Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream ();
1544 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1545 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1546 
1547 // if (doPrint) {
1548 // // Only need one process to print out parameters.
1549 // *out << "Distributor::doPosts (4 args)" << endl;
1550 // }
1551 // // Add one tab level. We declare this outside the doPrint scopes
1552 // // so that the tab persists until the end of this method.
1553 // Teuchos::OSTab tab = this->getOSTab ();
1554 // if (doPrint) {
1555 // *out << "Parameters:" << endl;
1556 // {
1557 // OSTab tab2 (out);
1558 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1559 // << endl << "barrierBetween: " << doBarrier << endl;
1560 // }
1561 // }
1562 // #endif // HAVE_TEUCHOS_DEBUG
1563 
1564  TEUCHOS_TEST_FOR_EXCEPTION(
1565  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
1566  std::logic_error,
1567  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Ready-send "
1568  "version requires a barrier between posting receives and posting ready "
1569  "ends. This should have been checked before. "
1570  "Please report this bug to the Tpetra developers.");
1571 
1572  const int myProcID = comm_->getRank ();
1573  size_t selfReceiveOffset = 0;
1574 
1575 #ifdef HAVE_TEUCHOS_DEBUG
1576  // Different messages may have different numbers of packets.
1577  size_t totalNumImportPackets = 0;
1578  for (size_t ii = 0; ii < static_cast<size_t> (numImportPacketsPerLID.size ()); ++ii) {
1579  totalNumImportPackets += numImportPacketsPerLID[ii];
1580  }
1581  TEUCHOS_TEST_FOR_EXCEPTION(
1582  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1583  std::runtime_error,
1584  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): The 'imports' "
1585  "array must have enough entries to hold the expected number of import "
1586  "packets. imports.size() = " << imports.size() << " < "
1587  "totalNumImportPackets = " << totalNumImportPackets << ".");
1588 #endif // HAVE_TEUCHOS_DEBUG
1589 
1590  // MPI tag for nonblocking receives and blocking sends in this
1591  // method. Some processes might take the "fast" path
1592  // (indicesTo_.empty()) and others might take the "slow" path for
1593  // the same doPosts() call, so the path tag must be the same for
1594  // both.
1595  const int pathTag = 1;
1596  const int tag = this->getTag (pathTag);
1597 
1598 #ifdef HAVE_TEUCHOS_DEBUG
1599  TEUCHOS_TEST_FOR_EXCEPTION
1600  (requests_.size () != 0,
1601  std::logic_error,
1602  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Process "
1603  << myProcID << ": requests_.size() = " << requests_.size ()
1604  << " != 0.");
1605 #endif // HAVE_TEUCHOS_DEBUG
1606  if (verbose_) {
1607  std::ostringstream os;
1608  os << "Proc " << myProcID << ": doPosts(4 args, Teuchos::ArrayRCP, "
1609  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1610  *out_ << os.str ();
1611  }
1612 
1613  // Distributor uses requests_.size() as the number of outstanding
1614  // nonblocking message requests, so we resize to zero to maintain
1615  // this invariant.
1616  //
1617  // numReceives_ does _not_ include the self message, if there is
1618  // one. Here, we do actually send a message to ourselves, so we
1619  // include any self message in the "actual" number of receives to
1620  // post.
1621  //
1622  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1623  // doesn't (re)allocate its array of requests. That happens in
1624  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1625  // demand), or Resize_().
1626  const size_type actualNumReceives = as<size_type> (numReceives_) +
1627  as<size_type> (selfMessage_ ? 1 : 0);
1628  requests_.resize (0);
1629 
1630  // Post the nonblocking receives. It's common MPI wisdom to post
1631  // receives before sends. In MPI terms, this means favoring
1632  // adding to the "posted queue" (of receive requests) over adding
1633  // to the "unexpected queue" (of arrived messages not yet matched
1634  // with a receive).
1635  {
1636 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1637  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
1638 #endif // TPETRA_DISTRIBUTOR_TIMERS
1639 
1640  size_t curBufferOffset = 0;
1641  size_t curLIDoffset = 0;
1642  for (size_type i = 0; i < actualNumReceives; ++i) {
1643  size_t totalPacketsFrom_i = 0;
1644  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1645  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1646  }
1647  curLIDoffset += lengthsFrom_[i];
1648  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
1649  // If my process is receiving these packet(s) from another
1650  // process (not a self-receive), and if there is at least
1651  // one packet to receive:
1652  //
1653  // 1. Set up the persisting view (recvBuf) into the imports
1654  // array, given the offset and size (total number of
1655  // packets from process procsFrom_[i]).
1656  // 2. Start the Irecv and save the resulting request.
1657  ArrayRCP<Packet> recvBuf =
1658  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1659  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1660  tag, *comm_));
1661  }
1662  else { // Receiving these packet(s) from myself
1663  selfReceiveOffset = curBufferOffset; // Remember the offset
1664  }
1665  curBufferOffset += totalPacketsFrom_i;
1666  }
1667  }
1668 
1669  if (doBarrier) {
1670 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1671  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
1672 #endif // TPETRA_DISTRIBUTOR_TIMERS
1673  // If we are using ready sends (MPI_Rsend) below, we need to do
1674  // a barrier before we post the ready sends. This is because a
1675  // ready send requires that its matching receive has already
1676  // been posted before the send has been posted. The only way to
1677  // guarantee that in this case is to use a barrier.
1678  comm_->barrier ();
1679  }
1680 
1681 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1682  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
1683 #endif // TPETRA_DISTRIBUTOR_TIMERS
1684 
1685  // setup arrays containing starting-offsets into exports for each send,
1686  // and num-packets-to-send for each send.
1687  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1688  size_t maxNumPackets = 0;
1689  size_t curPKToffset = 0;
1690  for (size_t pp=0; pp<numSends_; ++pp) {
1691  sendPacketOffsets[pp] = curPKToffset;
1692  size_t numPackets = 0;
1693  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1694  numPackets += numExportPacketsPerLID[j];
1695  }
1696  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1697  packetsPerSend[pp] = numPackets;
1698  curPKToffset += numPackets;
1699  }
1700 
1701  // setup scan through procsTo_ list starting with higher numbered procs
1702  // (should help balance message traffic)
1703  size_t numBlocks = numSends_+ selfMessage_;
1704  size_t procIndex = 0;
1705  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
1706  ++procIndex;
1707  }
1708  if (procIndex == numBlocks) {
1709  procIndex = 0;
1710  }
1711 
1712  size_t selfNum = 0;
1713  size_t selfIndex = 0;
1714 
1715  if (indicesTo_.empty()) {
1716  if (verbose_) {
1717  std::ostringstream os;
1718  os << "Proc " << myProcID
1719  << ": doPosts(4 args, Teuchos::ArrayRCP, fast): posting sends" << endl;
1720  *out_ << os.str ();
1721  }
1722 
1723  // Data are already blocked (laid out) by process, so we don't
1724  // need a separate send buffer (besides the exports array).
1725  for (size_t i = 0; i < numBlocks; ++i) {
1726  size_t p = i + procIndex;
1727  if (p > (numBlocks - 1)) {
1728  p -= numBlocks;
1729  }
1730 
1731  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
1732  ArrayView<const Packet> tmpSend =
1733  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1734 
1735  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1736  send<int, Packet> (tmpSend.getRawPtr (),
1737  as<int> (tmpSend.size ()),
1738  procsTo_[p], tag, *comm_);
1739  }
1740  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1741  readySend<int, Packet> (tmpSend.getRawPtr (),
1742  as<int> (tmpSend.size ()),
1743  procsTo_[p], tag, *comm_);
1744  }
1745  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1746  ArrayRCP<const Packet> tmpSendBuf =
1747  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1748  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1749  tag, *comm_));
1750  }
1751  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1752  ssend<int, Packet> (tmpSend.getRawPtr (),
1753  as<int> (tmpSend.size ()),
1754  procsTo_[p], tag, *comm_);
1755  }
1756  else {
1757  TEUCHOS_TEST_FOR_EXCEPTION(
1758  true, std::logic_error,
1759  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1760  "Invalid send type. We should never get here. Please report "
1761  "this bug to the Tpetra developers.");
1762  }
1763  }
1764  else { // "Sending" the message to myself
1765  selfNum = p;
1766  }
1767  }
1768 
1769  if (selfMessage_) {
1770  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1771  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1772  imports.begin()+selfReceiveOffset);
1773  }
1774  if (verbose_) {
1775  std::ostringstream os;
1776  os << "Proc " << myProcID
1777  << ": doPosts(4 args, Teuchos::ArrayRCP, fast) done" << endl;
1778  *out_ << os.str ();
1779  }
1780  }
1781  else { // data are not blocked by proc, use send buffer
1782  if (verbose_) {
1783  std::ostringstream os;
1784  os << "Proc " << myProcID
1785  << ": doPosts(4 args, Teuchos::ArrayRCP, slow): posting sends" << endl;
1786  *out_ << os.str ();
1787  }
1788 
1789  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1790  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1791 
1792  TEUCHOS_TEST_FOR_EXCEPTION(
1793  sendType == Details::DISTRIBUTOR_ISEND,
1794  std::logic_error,
1795  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1796  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
1797 
1798  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1799  size_t ioffset = 0;
1800  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1801  indicesOffsets[j] = ioffset;
1802  ioffset += numExportPacketsPerLID[j];
1803  }
1804 
1805  for (size_t i = 0; i < numBlocks; ++i) {
1806  size_t p = i + procIndex;
1807  if (p > (numBlocks - 1)) {
1808  p -= numBlocks;
1809  }
1810 
1811  if (procsTo_[p] != myProcID) {
1812  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1813  size_t sendArrayOffset = 0;
1814  size_t j = startsTo_[p];
1815  size_t numPacketsTo_p = 0;
1816  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1817  srcBegin = exports.begin() + indicesOffsets[j];
1818  srcEnd = srcBegin + numExportPacketsPerLID[j];
1819  numPacketsTo_p += numExportPacketsPerLID[j];
1820  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1821  sendArrayOffset += numExportPacketsPerLID[j];
1822  }
1823  if (numPacketsTo_p > 0) {
1824  ArrayView<const Packet> tmpSend =
1825  sendArray.view (0, numPacketsTo_p);
1826 
1827  if (sendType == Details::DISTRIBUTOR_RSEND) {
1828  readySend<int, Packet> (tmpSend.getRawPtr (),
1829  as<int> (tmpSend.size ()),
1830  procsTo_[p], tag, *comm_);
1831  }
1832  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1833  ArrayRCP<const Packet> tmpSendBuf =
1834  sendArray.persistingView (0, numPacketsTo_p);
1835  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1836  tag, *comm_));
1837  }
1838  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1839  ssend<int, Packet> (tmpSend.getRawPtr (),
1840  as<int> (tmpSend.size ()),
1841  procsTo_[p], tag, *comm_);
1842  }
1843  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
1844  send<int, Packet> (tmpSend.getRawPtr (),
1845  as<int> (tmpSend.size ()),
1846  procsTo_[p], tag, *comm_);
1847  }
1848  }
1849  }
1850  else { // "Sending" the message to myself
1851  selfNum = p;
1852  selfIndex = startsTo_[p];
1853  }
1854  }
1855 
1856  if (selfMessage_) {
1857  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1858  std::copy (exports.begin()+indicesOffsets[selfIndex],
1859  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1860  imports.begin() + selfReceiveOffset);
1861  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1862  ++selfIndex;
1863  }
1864  }
1865  if (verbose_) {
1866  std::ostringstream os;
1867  os << "Proc " << myProcID
1868  << ": doPosts(4 args, Teuchos::ArrayRCP, slow) done" << endl;
1869  *out_ << os.str ();
1870  }
1871  }
1872  }
1873 
1874  template <class Packet>
1876  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1877  size_t numPackets,
1878  const Teuchos::ArrayView<Packet>& imports)
1879  {
1880  using Teuchos::arcp;
1881  using Teuchos::ArrayRCP;
1882  using Teuchos::as;
1883 
1884  // doReversePosts() takes exports and imports as ArrayRCPs,
1885  // requiring that the memory locations are persisting. However,
1886  // they need only persist within the scope of that routine, so it
1887  // is safe for us to use nonpersisting references in this case.
1888 
1889  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1890  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1891  // with some versions of GCC. The issue only arises with the
1892  // exports array. This is why we construct a separate nonowning
1893  // ArrayRCP.
1894  typedef typename ArrayRCP<const Packet>::size_type size_type;
1895  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1896  exports.size(), false);
1897  // mfh 04 Apr 2012: This is the offending code. This statement
1898  // would normally be in place of "exportsArcp" in the
1899  // doReversePosts() call below.
1900  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1901  doReversePosts (exportsArcp,
1902  numPackets,
1903  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1904  doReverseWaits ();
1905 
1906  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1907  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1908  }
1909 
1910  template <class Packet>
1912  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1913  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1914  const Teuchos::ArrayView<Packet> &imports,
1915  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1916  {
1917  using Teuchos::as;
1918  using Teuchos::arcp;
1919  using Teuchos::ArrayRCP;
1920 
1921  TEUCHOS_TEST_FOR_EXCEPTION(
1922  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1923  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1924  << " outstanding nonblocking messages pending. It is incorrect to call "
1925  "this method with posts outstanding.");
1926 
1927  // doReversePosts() accepts the exports and imports arrays as
1928  // ArrayRCPs, requiring that the memory location is persisting (as
1929  // is necessary for nonblocking receives). However, it need only
1930  // persist until doReverseWaits() completes, so it is safe for us
1931  // to use a nonpersisting reference in this case. The use of a
1932  // nonpersisting reference is purely a performance optimization.
1933 
1934  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1935  // for Packet=std::complex<double> fails to compile with some
1936  // versions of GCC. The issue only arises with the exports array.
1937  // This is why we construct a separate nonowning ArrayRCP.
1938  typedef typename ArrayRCP<const Packet>::size_type size_type;
1939  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1940  exports.size (), false);
1941  doReversePosts (exportsArcp,
1942  numExportPacketsPerLID,
1943  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1944  numImportPacketsPerLID);
1945  doReverseWaits ();
1946 
1947  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1948  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1949  }
1950 
1951  template <class Packet>
1953  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1954  size_t numPackets,
1955  const Teuchos::ArrayRCP<Packet>& imports)
1956  {
1957  // FIXME (mfh 29 Mar 2012) WHY?
1958  TEUCHOS_TEST_FOR_EXCEPTION(
1959  ! indicesTo_.empty (), std::runtime_error,
1960  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1961  "communication when original data are blocked by process.");
1962  if (reverseDistributor_.is_null ()) {
1963  createReverseDistributor ();
1964  }
1965  reverseDistributor_->doPosts (exports, numPackets, imports);
1966  }
1967 
1968  template <class Packet>
1970  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1971  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1972  const Teuchos::ArrayRCP<Packet>& imports,
1973  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1974  {
1975  // FIXME (mfh 29 Mar 2012) WHY?
1976  TEUCHOS_TEST_FOR_EXCEPTION(
1977  ! indicesTo_.empty (), std::runtime_error,
1978  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1979  "communication when original data are blocked by process.");
1980  if (reverseDistributor_.is_null ()) {
1981  createReverseDistributor ();
1982  }
1983  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
1984  imports, numImportPacketsPerLID);
1985  }
1986 
1987  template <class ExpView, class ImpView>
1988  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
1990  doPostsAndWaits (const ExpView& exports,
1991  size_t numPackets,
1992  const ImpView& imports)
1993  {
1994  using Teuchos::RCP;
1995  using Teuchos::rcp;
1996  using std::endl;
1997 
1998  RCP<Teuchos::OSTab> tab0, tab1;
1999  if (verbose_) {
2000  tab0 = rcp (new Teuchos::OSTab (out_));
2001  const int myRank = comm_->getRank ();
2002  std::ostringstream os;
2003  os << "Proc " << myRank
2004  << ": Distributor::doPostsAndWaits(3 args, Kokkos): "
2005  << "{sendType: " << DistributorSendTypeEnumToString (sendType_)
2006  << ", barrierBetween: " << barrierBetween_ << "}" << endl;
2007  *out_ << os.str ();
2008  tab1 = rcp (new Teuchos::OSTab (out_));
2009  }
2010 
2011  TEUCHOS_TEST_FOR_EXCEPTION(
2012  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2013  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2014  " outstanding nonblocking messages pending. It is incorrect to call "
2015  "this method with posts outstanding.");
2016 
2017  if (verbose_) {
2018  const int myRank = comm_->getRank ();
2019  std::ostringstream os;
2020  os << "Proc " << myRank
2021  << ": Distributor::doPostsAndWaits: Call doPosts" << endl;
2022  *out_ << os.str ();
2023  }
2024  doPosts (exports, numPackets, imports);
2025  if (verbose_) {
2026  const int myRank = comm_->getRank ();
2027  std::ostringstream os;
2028  os << "Proc " << myRank
2029  << ": Distributor::doPostsAndWaits: Call doWaits" << endl;
2030  *out_ << os.str ();
2031  }
2032  doWaits ();
2033  }
2034 
2035  template <class ExpView, class ImpView>
2036  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2038  doPostsAndWaits (const ExpView& exports,
2039  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2040  const ImpView& imports,
2041  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2042  {
2043  TEUCHOS_TEST_FOR_EXCEPTION(
2044  requests_.size () != 0, std::runtime_error,
2045  "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
2046  << requests_.size () << " outstanding nonblocking messages pending. "
2047  "It is incorrect to call this method with posts outstanding.");
2048 
2049  doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2050  doWaits ();
2051  }
2052 
2053 
2054  template <class ExpView, class ImpView>
2055  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2057  doPosts (const ExpView &exports,
2058  size_t numPackets,
2059  const ImpView &imports)
2060  {
2061  using Teuchos::Array;
2062  using Teuchos::as;
2063  using Teuchos::FancyOStream;
2064  using Teuchos::includesVerbLevel;
2065  using Teuchos::ireceive;
2066  using Teuchos::isend;
2067  using Teuchos::OSTab;
2068  using Teuchos::readySend;
2069  using Teuchos::send;
2070  using Teuchos::ssend;
2071  using Teuchos::TypeNameTraits;
2072  using Teuchos::typeName;
2073  using std::endl;
2074  using Kokkos::Compat::create_const_view;
2075  using Kokkos::Compat::create_view;
2076  using Kokkos::Compat::subview_offset;
2077  using Kokkos::Compat::deep_copy_offset;
2078  typedef Array<size_t>::size_type size_type;
2079  typedef ExpView exports_view_type;
2080  typedef ImpView imports_view_type;
2081 
2082 #ifdef KOKKOS_ENABLE_CUDA
2083  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2084  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2085  "Please do not use Tpetra::Distributor with UVM "
2086  "allocations. See GitHub issue #1088.");
2087 #endif // KOKKOS_ENABLE_CUDA
2088 
2089 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2090  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
2091 #endif // TPETRA_DISTRIBUTOR_TIMERS
2092 
2093  const int myRank = comm_->getRank ();
2094  // Run-time configurable parameters that come from the input
2095  // ParameterList set by setParameterList().
2096  const Details::EDistributorSendType sendType = sendType_;
2097  const bool doBarrier = barrierBetween_;
2098 
2099  Teuchos::OSTab tab0 (out_);
2100  if (verbose_) {
2101  std::ostringstream os;
2102  os << "Proc " << myRank
2103  << ": Distributor::doPosts(3 args, Kokkos)" << endl;
2104  *out_ << os.str ();
2105  }
2106  Teuchos::OSTab tab1 (out_);
2107 
2108  TEUCHOS_TEST_FOR_EXCEPTION(
2109  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2110  std::logic_error,
2111  "Tpetra::Distributor::doPosts(3 args, Kokkos): Ready-send version "
2112  "requires a barrier between posting receives and posting ready sends. "
2113  "This should have been checked before. "
2114  "Please report this bug to the Tpetra developers.");
2115 
2116  size_t selfReceiveOffset = 0;
2117 
2118  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
2119  // check whether we're doing reverse mode before checking the
2120  // length of the imports array.
2121  if (false /* howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE */) {
2122  // Each message has the same number of packets.
2123  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2124 
2125  if (verbose_) {
2126  std::ostringstream os;
2127  os << "Proc " << myRank << ": doPosts: totalNumImportPackets = " <<
2128  totalNumImportPackets << " = " << totalReceiveLength_ << " * " <<
2129  numPackets << "; imports.extent(0) = " << imports.extent (0)
2130  << endl;
2131  *out_ << os.str ();
2132  }
2133 
2134 #ifdef HAVE_TPETRA_DEBUG
2135  // mfh 31 Mar 2016: Extra special all-reduce check to help diagnose #227.
2136  {
2137  const size_t importBufSize = static_cast<size_t> (imports.extent (0));
2138  const int lclBad = (importBufSize < totalNumImportPackets) ? 1 : 0;
2139  int gblBad = 0;
2140  using Teuchos::reduceAll;
2141  using Teuchos::REDUCE_MAX;
2142  using Teuchos::outArg;
2143  reduceAll (*comm_, REDUCE_MAX, lclBad, outArg (gblBad));
2144  TEUCHOS_TEST_FOR_EXCEPTION
2145  (gblBad != 0,
2146  std::runtime_error,
2147  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2148  "On one or more MPI processes, the 'imports' array "
2149  "does not have enough entries to hold the expected number of "
2150  "import packets. ");
2151  }
2152 #else
2153  TEUCHOS_TEST_FOR_EXCEPTION
2154  (static_cast<size_t> (imports.extent (0)) < totalNumImportPackets,
2155  std::runtime_error,
2156  "Tpetra::Distributor::doPosts(3 args, Kokkos): The 'imports' "
2157  "array must have enough entries to hold the expected number of import "
2158  "packets. imports.extent(0) = " << imports.extent (0) << " < "
2159  "totalNumImportPackets = " << totalNumImportPackets << " = "
2160  "totalReceiveLength_ (" << totalReceiveLength_ << ") * numPackets ("
2161  << numPackets << ").");
2162 #endif // HAVE_TPETRA_DEBUG
2163  }
2164 
2165  // MPI tag for nonblocking receives and blocking sends in this
2166  // method. Some processes might take the "fast" path
2167  // (indicesTo_.empty()) and others might take the "slow" path for
2168  // the same doPosts() call, so the path tag must be the same for
2169  // both.
2170  const int pathTag = 0;
2171  const int tag = this->getTag (pathTag);
2172 
2173 #ifdef HAVE_TPETRA_DEBUG
2174  TEUCHOS_TEST_FOR_EXCEPTION
2175  (requests_.size () != 0,
2176  std::logic_error,
2177  "Tpetra::Distributor::doPosts(3 args, Kokkos): Process "
2178  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
2179 #endif // HAVE_TPETRA_DEBUG
2180 
2181  // Distributor uses requests_.size() as the number of outstanding
2182  // nonblocking message requests, so we resize to zero to maintain
2183  // this invariant.
2184  //
2185  // numReceives_ does _not_ include the self message, if there is
2186  // one. Here, we do actually send a message to ourselves, so we
2187  // include any self message in the "actual" number of receives to
2188  // post.
2189  //
2190  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2191  // doesn't (re)allocate its array of requests. That happens in
2192  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2193  // demand), or Resize_().
2194  const size_type actualNumReceives = as<size_type> (numReceives_) +
2195  as<size_type> (selfMessage_ ? 1 : 0);
2196  requests_.resize (0);
2197 
2198  if (verbose_) {
2199  std::ostringstream os;
2200  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2201  << (indicesTo_.empty () ? "fast" : "slow") << "): Post receives"
2202  << endl;
2203  *out_ << os.str ();
2204  }
2205 
2206  // Post the nonblocking receives. It's common MPI wisdom to post
2207  // receives before sends. In MPI terms, this means favoring
2208  // adding to the "posted queue" (of receive requests) over adding
2209  // to the "unexpected queue" (of arrived messages not yet matched
2210  // with a receive).
2211  {
2212 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2213  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
2214 #endif // TPETRA_DISTRIBUTOR_TIMERS
2215 
2216  size_t curBufferOffset = 0;
2217  for (size_type i = 0; i < actualNumReceives; ++i) {
2218  const size_t curBufLen = lengthsFrom_[i] * numPackets;
2219  if (procsFrom_[i] != myRank) {
2220  if (verbose_) {
2221  std::ostringstream os;
2222  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2223  << (indicesTo_.empty () ? "fast" : "slow") << "): "
2224  << "Post irecv: {source: " << procsFrom_[i]
2225  << ", tag: " << tag << "}" << endl;
2226  *out_ << os.str ();
2227  }
2228  // If my process is receiving these packet(s) from another
2229  // process (not a self-receive):
2230  //
2231  // 1. Set up the persisting view (recvBuf) of the imports
2232  // array, given the offset and size (total number of
2233  // packets from process procsFrom_[i]).
2234  // 2. Start the Irecv and save the resulting request.
2235  TEUCHOS_TEST_FOR_EXCEPTION(
2236  curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
2237  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2238  "Exceeded size of 'imports' array in packing loop on Process " <<
2239  myRank << ". imports.size() = " << imports.size () << " < "
2240  "curBufferOffset(" << curBufferOffset << ") + curBufLen(" <<
2241  curBufLen << ").");
2242  imports_view_type recvBuf =
2243  subview_offset (imports, curBufferOffset, curBufLen);
2244  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2245  tag, *comm_));
2246  }
2247  else { // Receiving from myself
2248  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2249  }
2250  curBufferOffset += curBufLen;
2251  }
2252  }
2253 
2254  if (doBarrier) {
2255 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2256  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
2257 #endif // TPETRA_DISTRIBUTOR_TIMERS
2258 
2259  if (verbose_) {
2260  std::ostringstream os;
2261  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2262  << (indicesTo_.empty () ? "fast" : "slow") << "): Barrier" << endl;
2263  *out_ << os.str ();
2264  }
2265  // If we are using ready sends (MPI_Rsend) below, we need to do
2266  // a barrier before we post the ready sends. This is because a
2267  // ready send requires that its matching receive has already
2268  // been posted before the send has been posted. The only way to
2269  // guarantee that in this case is to use a barrier.
2270  comm_->barrier ();
2271  }
2272 
2273 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2274  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
2275 #endif // TPETRA_DISTRIBUTOR_TIMERS
2276 
2277  // setup scan through procsTo_ list starting with higher numbered procs
2278  // (should help balance message traffic)
2279  //
2280  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2281  // It doesn't depend on the input at all.
2282  size_t numBlocks = numSends_ + selfMessage_;
2283  size_t procIndex = 0;
2284  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
2285  ++procIndex;
2286  }
2287  if (procIndex == numBlocks) {
2288  procIndex = 0;
2289  }
2290 
2291  size_t selfNum = 0;
2292  size_t selfIndex = 0;
2293 
2294  if (verbose_) {
2295  std::ostringstream os;
2296  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2297  << (indicesTo_.empty () ? "fast" : "slow") << "): Post sends" << endl;
2298  *out_ << os.str ();
2299  }
2300 
2301  if (indicesTo_.empty()) {
2302  if (verbose_) {
2303  std::ostringstream os;
2304  os << "Proc " << myRank
2305  << ": doPosts(3 args, Kokkos, fast): posting sends" << endl;
2306  *out_ << os.str ();
2307  }
2308 
2309  // Data are already blocked (laid out) by process, so we don't
2310  // need a separate send buffer (besides the exports array).
2311  for (size_t i = 0; i < numBlocks; ++i) {
2312  size_t p = i + procIndex;
2313  if (p > (numBlocks - 1)) {
2314  p -= numBlocks;
2315  }
2316 
2317  if (procsTo_[p] != myRank) {
2318  if (verbose_) {
2319  std::ostringstream os;
2320  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, fast): Post send: "
2321  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
2322  *out_ << os.str ();
2323  }
2324 
2325  exports_view_type tmpSend = subview_offset(
2326  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2327 
2328  if (sendType == Details::DISTRIBUTOR_SEND) {
2329  send<int> (tmpSend,
2330  as<int> (tmpSend.size ()),
2331  procsTo_[p], tag, *comm_);
2332  }
2333  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2334  exports_view_type tmpSendBuf =
2335  subview_offset (exports, startsTo_[p] * numPackets,
2336  lengthsTo_[p] * numPackets);
2337  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2338  tag, *comm_));
2339  }
2340  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2341  readySend<int> (tmpSend,
2342  as<int> (tmpSend.size ()),
2343  procsTo_[p], tag, *comm_);
2344  }
2345  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2346  ssend<int> (tmpSend,
2347  as<int> (tmpSend.size ()),
2348  procsTo_[p], tag, *comm_);
2349  } else {
2350  TEUCHOS_TEST_FOR_EXCEPTION(
2351  true,
2352  std::logic_error,
2353  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2354  "Invalid send type. We should never get here. "
2355  "Please report this bug to the Tpetra developers.");
2356  }
2357  }
2358  else { // "Sending" the message to myself
2359  selfNum = p;
2360  }
2361  }
2362 
2363  if (selfMessage_) {
2364  if (verbose_) {
2365  std::ostringstream os;
2366  os << "Proc " << myRank
2367  << ": doPosts(3 args, Kokkos, fast): Self-send" << endl;
2368  *out_ << os.str ();
2369  }
2370  // This is how we "send a message to ourself": we copy from
2371  // the export buffer to the import buffer. That saves
2372  // Teuchos::Comm implementations other than MpiComm (in
2373  // particular, SerialComm) the trouble of implementing self
2374  // messages correctly. (To do this right, SerialComm would
2375  // need internal buffer space for messages, keyed on the
2376  // message's tag.)
2377  deep_copy_offset(imports, exports, selfReceiveOffset,
2378  startsTo_[selfNum]*numPackets,
2379  lengthsTo_[selfNum]*numPackets);
2380  }
2381  if (verbose_) {
2382  std::ostringstream os;
2383  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, fast) done" << endl;
2384  *out_ << os.str ();
2385  }
2386  }
2387  else { // data are not blocked by proc, use send buffer
2388  if (verbose_) {
2389  std::ostringstream os;
2390  os << "Proc " << myRank
2391  << ": doPosts(3 args, Kokkos, slow): posting sends" << endl;
2392  *out_ << os.str ();
2393  }
2394 
2395  typedef typename ExpView::non_const_value_type Packet;
2396  typedef typename ExpView::array_layout Layout;
2397  typedef typename ExpView::device_type Device;
2398  typedef typename ExpView::memory_traits Mem;
2399  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2400  maxSendLength_ * numPackets);
2401 
2402  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2403  // sends), because the buffer is only long enough for one send.
2404  TEUCHOS_TEST_FOR_EXCEPTION(
2405  sendType == Details::DISTRIBUTOR_ISEND,
2406  std::logic_error,
2407  "Tpetra::Distributor::doPosts(3 args, Kokkos): The \"send buffer\" code path "
2408  "doesn't currently work with nonblocking sends.");
2409 
2410  for (size_t i = 0; i < numBlocks; ++i) {
2411  size_t p = i + procIndex;
2412  if (p > (numBlocks - 1)) {
2413  p -= numBlocks;
2414  }
2415 
2416  if (procsTo_[p] != myRank) {
2417  if (verbose_) {
2418  std::ostringstream os;
2419  os << "Proc " << myRank
2420  << ": doPosts(3 args, Kokkos, slow): Post send: {target: "
2421  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2422  *out_ << os.str ();
2423  }
2424 
2425  size_t sendArrayOffset = 0;
2426  size_t j = startsTo_[p];
2427  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2428  deep_copy_offset(sendArray, exports, sendArrayOffset,
2429  indicesTo_[j]*numPackets, numPackets);
2430  sendArrayOffset += numPackets;
2431  }
2432  ImpView tmpSend =
2433  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2434 
2435  if (sendType == Details::DISTRIBUTOR_SEND) {
2436  send<int> (tmpSend,
2437  as<int> (tmpSend.size ()),
2438  procsTo_[p], tag, *comm_);
2439  }
2440  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2441  exports_view_type tmpSendBuf =
2442  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2443  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2444  tag, *comm_));
2445  }
2446  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2447  readySend<int> (tmpSend,
2448  as<int> (tmpSend.size ()),
2449  procsTo_[p], tag, *comm_);
2450  }
2451  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2452  ssend<int> (tmpSend,
2453  as<int> (tmpSend.size ()),
2454  procsTo_[p], tag, *comm_);
2455  }
2456  else {
2457  TEUCHOS_TEST_FOR_EXCEPTION(
2458  true,
2459  std::logic_error,
2460  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2461  "Invalid send type. We should never get here. "
2462  "Please report this bug to the Tpetra developers.");
2463  }
2464  }
2465  else { // "Sending" the message to myself
2466  selfNum = p;
2467  selfIndex = startsTo_[p];
2468  }
2469  }
2470 
2471  if (selfMessage_) {
2472  if (verbose_) {
2473  std::ostringstream os;
2474  os << "Proc " << myRank
2475  << ": doPosts(3 args, Kokkos, slow): Self-send" << endl;
2476  *out_ << os.str ();
2477  }
2478  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2479  deep_copy_offset(imports, exports, selfReceiveOffset,
2480  indicesTo_[selfIndex]*numPackets, numPackets);
2481  ++selfIndex;
2482  selfReceiveOffset += numPackets;
2483  }
2484  }
2485  if (verbose_) {
2486  std::ostringstream os;
2487  os << "Proc " << myRank
2488  << ": doPosts(3 args, Kokkos, slow) done" << endl;
2489  *out_ << os.str ();
2490  }
2491  }
2492 
2493  if (verbose_) {
2494  std::ostringstream os;
2495  os << "Proc " << myRank << ": doPosts done" << endl;
2496  *out_ << os.str ();
2497  }
2498  }
2499 
2500  template <class ExpView, class ImpView>
2501  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2503  doPosts (const ExpView &exports,
2504  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2505  const ImpView &imports,
2506  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2507  {
2508  using Teuchos::Array;
2509  using Teuchos::as;
2510  using Teuchos::ireceive;
2511  using Teuchos::isend;
2512  using Teuchos::readySend;
2513  using Teuchos::send;
2514  using Teuchos::ssend;
2515  using Teuchos::TypeNameTraits;
2516 #ifdef HAVE_TEUCHOS_DEBUG
2517  using Teuchos::OSTab;
2518 #endif // HAVE_TEUCHOS_DEBUG
2519  using std::endl;
2520  using Kokkos::Compat::create_const_view;
2521  using Kokkos::Compat::create_view;
2522  using Kokkos::Compat::subview_offset;
2523  using Kokkos::Compat::deep_copy_offset;
2524  typedef Array<size_t>::size_type size_type;
2525  typedef ExpView exports_view_type;
2526  typedef ImpView imports_view_type;
2527 
2528 #ifdef KOKKOS_ENABLE_CUDA
2529  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2530  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2531  "Please do not use Tpetra::Distributor with UVM "
2532  "allocations. See GitHub issue #1088.");
2533 #endif // KOKKOS_ENABLE_CUDA
2534 
2535  Teuchos::OSTab tab (out_);
2536 
2537 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2538  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
2539 #endif // TPETRA_DISTRIBUTOR_TIMERS
2540 
2541  // Run-time configurable parameters that come from the input
2542  // ParameterList set by setParameterList().
2543  const Details::EDistributorSendType sendType = sendType_;
2544  const bool doBarrier = barrierBetween_;
2545 
2546 // #ifdef HAVE_TEUCHOS_DEBUG
2547 // // Prepare for verbose output, if applicable.
2548 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2549 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2550 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2551 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2552 
2553 // if (doPrint) {
2554 // // Only need one process to print out parameters.
2555 // *out << "Distributor::doPosts (4 args)" << endl;
2556 // }
2557 // // Add one tab level. We declare this outside the doPrint scopes
2558 // // so that the tab persists until the end of this method.
2559 // Teuchos::OSTab tab = this->getOSTab ();
2560 // if (doPrint) {
2561 // *out << "Parameters:" << endl;
2562 // {
2563 // OSTab tab2 (out);
2564 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2565 // << endl << "barrierBetween: " << doBarrier << endl;
2566 // }
2567 // }
2568 // #endif // HAVE_TEUCHOS_DEBUG
2569 
2570  TEUCHOS_TEST_FOR_EXCEPTION(
2571  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2572  std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): Ready-send "
2573  "version requires a barrier between posting receives and posting ready "
2574  "sends. This should have been checked before. "
2575  "Please report this bug to the Tpetra developers.");
2576 
2577  const int myProcID = comm_->getRank ();
2578  size_t selfReceiveOffset = 0;
2579 
2580 #ifdef HAVE_TEUCHOS_DEBUG
2581  // Different messages may have different numbers of packets.
2582  size_t totalNumImportPackets = 0;
2583  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2584  totalNumImportPackets += numImportPacketsPerLID[ii];
2585  }
2586  TEUCHOS_TEST_FOR_EXCEPTION(
2587  imports.extent (0) < totalNumImportPackets, std::runtime_error,
2588  "Tpetra::Distributor::doPosts(4 args, Kokkos): The 'imports' array must have "
2589  "enough entries to hold the expected number of import packets. "
2590  "imports.extent(0) = " << imports.extent (0) << " < "
2591  "totalNumImportPackets = " << totalNumImportPackets << ".");
2592 #endif // HAVE_TEUCHOS_DEBUG
2593 
2594  // MPI tag for nonblocking receives and blocking sends in this
2595  // method. Some processes might take the "fast" path
2596  // (indicesTo_.empty()) and others might take the "slow" path for
2597  // the same doPosts() call, so the path tag must be the same for
2598  // both.
2599  const int pathTag = 1;
2600  const int tag = this->getTag (pathTag);
2601 
2602 #ifdef HAVE_TEUCHOS_DEBUG
2603  TEUCHOS_TEST_FOR_EXCEPTION
2604  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2605  "doPosts(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
2606  << requests_.size () << " != 0.");
2607 #endif // HAVE_TEUCHOS_DEBUG
2608  if (verbose_) {
2609  std::ostringstream os;
2610  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, "
2611  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2612  *out_ << os.str ();
2613  }
2614 
2615  // Distributor uses requests_.size() as the number of outstanding
2616  // nonblocking message requests, so we resize to zero to maintain
2617  // this invariant.
2618  //
2619  // numReceives_ does _not_ include the self message, if there is
2620  // one. Here, we do actually send a message to ourselves, so we
2621  // include any self message in the "actual" number of receives to
2622  // post.
2623  //
2624  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2625  // doesn't (re)allocate its array of requests. That happens in
2626  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2627  // demand), or Resize_().
2628  const size_type actualNumReceives = as<size_type> (numReceives_) +
2629  as<size_type> (selfMessage_ ? 1 : 0);
2630  requests_.resize (0);
2631 
2632  // Post the nonblocking receives. It's common MPI wisdom to post
2633  // receives before sends. In MPI terms, this means favoring
2634  // adding to the "posted queue" (of receive requests) over adding
2635  // to the "unexpected queue" (of arrived messages not yet matched
2636  // with a receive).
2637  {
2638 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2639  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
2640 #endif // TPETRA_DISTRIBUTOR_TIMERS
2641 
2642  size_t curBufferOffset = 0;
2643  size_t curLIDoffset = 0;
2644  for (size_type i = 0; i < actualNumReceives; ++i) {
2645  size_t totalPacketsFrom_i = 0;
2646  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2647  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2648  }
2649  curLIDoffset += lengthsFrom_[i];
2650  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
2651  // If my process is receiving these packet(s) from another
2652  // process (not a self-receive), and if there is at least
2653  // one packet to receive:
2654  //
2655  // 1. Set up the persisting view (recvBuf) into the imports
2656  // array, given the offset and size (total number of
2657  // packets from process procsFrom_[i]).
2658  // 2. Start the Irecv and save the resulting request.
2659  imports_view_type recvBuf =
2660  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2661  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2662  tag, *comm_));
2663  }
2664  else { // Receiving these packet(s) from myself
2665  selfReceiveOffset = curBufferOffset; // Remember the offset
2666  }
2667  curBufferOffset += totalPacketsFrom_i;
2668  }
2669  }
2670 
2671  if (doBarrier) {
2672 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2673  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
2674 #endif // TPETRA_DISTRIBUTOR_TIMERS
2675  // If we are using ready sends (MPI_Rsend) below, we need to do
2676  // a barrier before we post the ready sends. This is because a
2677  // ready send requires that its matching receive has already
2678  // been posted before the send has been posted. The only way to
2679  // guarantee that in this case is to use a barrier.
2680  comm_->barrier ();
2681  }
2682 
2683 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2684  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
2685 #endif // TPETRA_DISTRIBUTOR_TIMERS
2686 
2687  // setup arrays containing starting-offsets into exports for each send,
2688  // and num-packets-to-send for each send.
2689  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2690  size_t maxNumPackets = 0;
2691  size_t curPKToffset = 0;
2692  for (size_t pp=0; pp<numSends_; ++pp) {
2693  sendPacketOffsets[pp] = curPKToffset;
2694  size_t numPackets = 0;
2695  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2696  numPackets += numExportPacketsPerLID[j];
2697  }
2698  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2699  packetsPerSend[pp] = numPackets;
2700  curPKToffset += numPackets;
2701  }
2702 
2703  // setup scan through procsTo_ list starting with higher numbered procs
2704  // (should help balance message traffic)
2705  size_t numBlocks = numSends_+ selfMessage_;
2706  size_t procIndex = 0;
2707  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
2708  ++procIndex;
2709  }
2710  if (procIndex == numBlocks) {
2711  procIndex = 0;
2712  }
2713 
2714  size_t selfNum = 0;
2715  size_t selfIndex = 0;
2716  if (indicesTo_.empty()) {
2717  if (verbose_) {
2718  std::ostringstream os;
2719  os << "Proc " << myProcID
2720  << ": doPosts(4 args, Kokkos, fast): posting sends" << endl;
2721  *out_ << os.str ();
2722  }
2723 
2724  // Data are already blocked (laid out) by process, so we don't
2725  // need a separate send buffer (besides the exports array).
2726  for (size_t i = 0; i < numBlocks; ++i) {
2727  size_t p = i + procIndex;
2728  if (p > (numBlocks - 1)) {
2729  p -= numBlocks;
2730  }
2731 
2732  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
2733  exports_view_type tmpSend =
2734  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2735 
2736  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2737  send<int> (tmpSend,
2738  as<int> (tmpSend.size ()),
2739  procsTo_[p], tag, *comm_);
2740  }
2741  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2742  readySend<int> (tmpSend,
2743  as<int> (tmpSend.size ()),
2744  procsTo_[p], tag, *comm_);
2745  }
2746  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2747  exports_view_type tmpSendBuf =
2748  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2749  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2750  tag, *comm_));
2751  }
2752  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2753  ssend<int> (tmpSend,
2754  as<int> (tmpSend.size ()),
2755  procsTo_[p], tag, *comm_);
2756  }
2757  else {
2758  TEUCHOS_TEST_FOR_EXCEPTION(
2759  true, std::logic_error,
2760  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2761  "Invalid send type. We should never get here. "
2762  "Please report this bug to the Tpetra developers.");
2763  }
2764  }
2765  else { // "Sending" the message to myself
2766  selfNum = p;
2767  }
2768  }
2769 
2770  if (selfMessage_) {
2771  deep_copy_offset(imports, exports, selfReceiveOffset,
2772  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2773  }
2774  if (verbose_) {
2775  std::ostringstream os;
2776  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, fast) done" << endl;
2777  *out_ << os.str ();
2778  }
2779  }
2780  else { // data are not blocked by proc, use send buffer
2781  if (verbose_) {
2782  std::ostringstream os;
2783  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, slow): posting sends" << endl;
2784  *out_ << os.str ();
2785  }
2786 
2787  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2788  typedef typename ExpView::non_const_value_type Packet;
2789  typedef typename ExpView::array_layout Layout;
2790  typedef typename ExpView::device_type Device;
2791  typedef typename ExpView::memory_traits Mem;
2792  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2793 
2794  TEUCHOS_TEST_FOR_EXCEPTION(
2795  sendType == Details::DISTRIBUTOR_ISEND,
2796  std::logic_error,
2797  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2798  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
2799 
2800  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2801  size_t ioffset = 0;
2802  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2803  indicesOffsets[j] = ioffset;
2804  ioffset += numExportPacketsPerLID[j];
2805  }
2806 
2807  for (size_t i = 0; i < numBlocks; ++i) {
2808  size_t p = i + procIndex;
2809  if (p > (numBlocks - 1)) {
2810  p -= numBlocks;
2811  }
2812 
2813  if (procsTo_[p] != myProcID) {
2814  size_t sendArrayOffset = 0;
2815  size_t j = startsTo_[p];
2816  size_t numPacketsTo_p = 0;
2817  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2818  numPacketsTo_p += numExportPacketsPerLID[j];
2819  deep_copy_offset(sendArray, exports, sendArrayOffset,
2820  indicesOffsets[j], numExportPacketsPerLID[j]);
2821  sendArrayOffset += numExportPacketsPerLID[j];
2822  }
2823  if (numPacketsTo_p > 0) {
2824  ImpView tmpSend =
2825  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2826 
2827  if (sendType == Details::DISTRIBUTOR_RSEND) {
2828  readySend<int> (tmpSend,
2829  as<int> (tmpSend.size ()),
2830  procsTo_[p], tag, *comm_);
2831  }
2832  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2833  exports_view_type tmpSendBuf =
2834  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2835  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2836  tag, *comm_));
2837  }
2838  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2839  ssend<int> (tmpSend,
2840  as<int> (tmpSend.size ()),
2841  procsTo_[p], tag, *comm_);
2842  }
2843  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
2844  send<int> (tmpSend,
2845  as<int> (tmpSend.size ()),
2846  procsTo_[p], tag, *comm_);
2847  }
2848  }
2849  }
2850  else { // "Sending" the message to myself
2851  selfNum = p;
2852  selfIndex = startsTo_[p];
2853  }
2854  }
2855 
2856  if (selfMessage_) {
2857  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2858  deep_copy_offset(imports, exports, selfReceiveOffset,
2859  indicesOffsets[selfIndex],
2860  numExportPacketsPerLID[selfIndex]);
2861  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2862  ++selfIndex;
2863  }
2864  }
2865  if (verbose_) {
2866  std::ostringstream os;
2867  os << "Proc " << myProcID
2868  << ": doPosts(4 args, Kokkos, slow) done" << endl;
2869  *out_ << os.str ();
2870  }
2871  }
2872  }
2873 
2874  template <class ExpView, class ImpView>
2875  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2877  doReversePostsAndWaits (const ExpView& exports,
2878  size_t numPackets,
2879  const ImpView& imports)
2880  {
2881  doReversePosts (exports, numPackets, imports);
2882  doReverseWaits ();
2883  }
2884 
2885  template <class ExpView, class ImpView>
2886  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2888  doReversePostsAndWaits (const ExpView& exports,
2889  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2890  const ImpView& imports,
2891  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2892  {
2893  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2894  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2895  << requests_.size() << " outstanding nonblocking messages pending. It "
2896  "is incorrect to call this method with posts outstanding.");
2897 
2898  doReversePosts (exports, numExportPacketsPerLID, imports,
2899  numImportPacketsPerLID);
2900  doReverseWaits ();
2901  }
2902 
2903  template <class ExpView, class ImpView>
2904  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2906  doReversePosts (const ExpView &exports,
2907  size_t numPackets,
2908  const ImpView &imports)
2909  {
2910  // FIXME (mfh 29 Mar 2012) WHY?
2911  TEUCHOS_TEST_FOR_EXCEPTION(
2912  ! indicesTo_.empty (), std::runtime_error,
2913  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2914  "reverse communication when original data are blocked by process.");
2915  if (reverseDistributor_.is_null ()) {
2916  createReverseDistributor ();
2917  }
2918  reverseDistributor_->doPosts (exports, numPackets, imports);
2919  }
2920 
2921  template <class ExpView, class ImpView>
2922  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2924  doReversePosts (const ExpView &exports,
2925  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2926  const ImpView &imports,
2927  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2928  {
2929  // FIXME (mfh 29 Mar 2012) WHY?
2930  TEUCHOS_TEST_FOR_EXCEPTION(
2931  ! indicesTo_.empty (), std::runtime_error,
2932  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2933  "reverse communication when original data are blocked by process.");
2934  if (reverseDistributor_.is_null ()) {
2935  createReverseDistributor ();
2936  }
2937  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2938  imports, numImportPacketsPerLID);
2939  }
2940 
2941  template <class OrdinalType>
2942  void Distributor::
2943  computeSends (const Teuchos::ArrayView<const OrdinalType> & importGIDs,
2944  const Teuchos::ArrayView<const int> & importProcIDs,
2945  Teuchos::Array<OrdinalType> & exportGIDs,
2946  Teuchos::Array<int> & exportProcIDs)
2947  {
2948  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2949  // "assumes that size_t >= Ordinal". The code certainly does
2950  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2951  // sizeof(size_t) >= sizeof(int). This is because it casts the
2952  // OrdinalType elements of importGIDs (along with their
2953  // corresponding process IDs, as int) to size_t, and does a
2954  // doPostsAndWaits<size_t>() to send the packed data.
2955  using Teuchos::Array;
2956  using Teuchos::ArrayView;
2957  using std::endl;
2958  typedef typename ArrayView<const OrdinalType>::size_type size_type;
2959 
2960  Teuchos::OSTab tab (out_);
2961  const int myRank = comm_->getRank ();
2962  if (verbose_) {
2963  std::ostringstream os;
2964  os << "Proc " << myRank << ": computeSends" << endl;
2965  *out_ << os.str ();
2966  }
2967 
2968  TEUCHOS_TEST_FOR_EXCEPTION(
2969  importGIDs.size () != importProcIDs.size (), std::invalid_argument,
2970  "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
2971  "importProcIDs.size() = " << importProcIDs.size ()
2972  << " != importGIDs.size() = " << importGIDs.size () << ".");
2973 
2974  const size_type numImports = importProcIDs.size ();
2975  Array<size_t> importObjs (2*numImports);
2976  // Pack pairs (importGIDs[i], my process ID) to send into importObjs.
2977  for (size_type i = 0; i < numImports; ++i) {
2978  importObjs[2*i] = static_cast<size_t> (importGIDs[i]);
2979  importObjs[2*i+1] = static_cast<size_t> (myRank);
2980  }
2981  //
2982  // Use a temporary Distributor to send the (importGIDs[i], myRank)
2983  // pairs to importProcIDs[i].
2984  //
2985  Distributor tempPlan (comm_, out_);
2986  if (verbose_) {
2987  std::ostringstream os;
2988  os << "Proc " << myRank << ": computeSends: tempPlan.createFromSends" << endl;
2989  *out_ << os.str ();
2990  }
2991 
2992  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
2993  // signed, in order to forestall any possible causes for Bug 6069.
2994  const size_t numExportsAsSizeT = tempPlan.createFromSends (importProcIDs);
2995  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
2996  TEUCHOS_TEST_FOR_EXCEPTION(
2997  numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
2998  "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
2999  << " as a size_t, which overflows to " << numExports << " when cast to "
3000  << Teuchos::TypeNameTraits<size_type>::name () << ". "
3001  "Please report this bug to the Tpetra developers.");
3002  TEUCHOS_TEST_FOR_EXCEPTION(
3003  static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
3004  std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
3005  "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
3006  "Exports = " << numExports << ". Please report this bug to the "
3007  "Tpetra developers.");
3008 
3009  if (numExports > 0) {
3010  exportGIDs.resize (numExports);
3011  exportProcIDs.resize (numExports);
3012  }
3013 
3014  // exportObjs: Packed receive buffer. (exportObjs[2*i],
3015  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
3016  // after tempPlan.doPostsAndWaits(...) finishes below.
3017  //
3018  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
3019  // size_t. This issue might come up, for example, on a 32-bit
3020  // machine using 64-bit global indices. I will add a check here
3021  // for that case.
3022  TEUCHOS_TEST_FOR_EXCEPTION(
3023  sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
3024  "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
3025  << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
3026  << sizeof (OrdinalType) << ". This violates an assumption of the "
3027  "method. It's not hard to work around (just use Array<OrdinalType> as "
3028  "the export buffer, not Array<size_t>), but we haven't done that yet. "
3029  "Please report this bug to the Tpetra developers.");
3030 
3031  TEUCHOS_TEST_FOR_EXCEPTION(
3032  tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
3033  std::logic_error,
3034  "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
3035  << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
3036  << ". Please report this bug to the Tpetra developers.");
3037 
3038  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
3039  if (verbose_) {
3040  std::ostringstream os;
3041  os << "Proc " << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
3042  *out_ << os.str ();
3043  }
3044  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
3045 
3046  // Unpack received (GID, PID) pairs into exportIDs resp. exportProcIDs.
3047  for (size_type i = 0; i < numExports; ++i) {
3048  exportGIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
3049  exportProcIDs[i] = static_cast<int> (exportObjs[2*i+1]);
3050  }
3051 
3052  if (verbose_) {
3053  std::ostringstream os;
3054  os << "Proc " << myRank << ": computeSends done" << endl;
3055  *out_ << os.str ();
3056  }
3057  }
3058 
3059  template <class OrdinalType>
3060  void Distributor::
3061  createFromRecvs (const Teuchos::ArrayView<const OrdinalType> &remoteGIDs,
3062  const Teuchos::ArrayView<const int> &remoteProcIDs,
3063  Teuchos::Array<OrdinalType> &exportGIDs,
3064  Teuchos::Array<int> &exportProcIDs)
3065  {
3066  using std::endl;
3067 
3068  Teuchos::OSTab tab (out_);
3069  const int myRank = comm_->getRank();
3070 
3071  if (verbose_) {
3072  *out_ << "Proc " << myRank << ": createFromRecvs" << endl;
3073  }
3074 
3075 #ifdef HAVE_TPETRA_DEBUG
3076  using Teuchos::outArg;
3077  using Teuchos::reduceAll;
3078 
3079  // In debug mode, first test locally, then do an all-reduce to
3080  // make sure that all processes passed.
3081  const int errProc =
3082  (remoteGIDs.size () != remoteProcIDs.size ()) ? myRank : -1;
3083  int maxErrProc = -1;
3084  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
3085  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
3086  Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
3087  "and remote process IDs must have the same size on all participating "
3088  "processes. Maximum process ID with error: " << maxErrProc << ".");
3089 #else // NOT HAVE_TPETRA_DEBUG
3090 
3091  // In non-debug mode, just test locally.
3092  TEUCHOS_TEST_FOR_EXCEPTION(
3093  remoteGIDs.size () != remoteProcIDs.size (), std::invalid_argument,
3094  Teuchos::typeName (*this) << "::createFromRecvs<" <<
3095  Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
3096  myRank << ": remoteGIDs.size() = " << remoteGIDs.size () << " != "
3097  "remoteProcIDs.size() = " << remoteProcIDs.size () << ".");
3098 #endif // HAVE_TPETRA_DEBUG
3099 
3100  computeSends (remoteGIDs, remoteProcIDs, exportGIDs, exportProcIDs);
3101 
3102  const size_t numProcsSendingToMe = createFromSends (exportProcIDs ());
3103 
3104  if (verbose_) {
3105  // NOTE (mfh 20 Mar 2014) If remoteProcIDs could contain
3106  // duplicates, then its length might not be the right check here,
3107  // even if we account for selfMessage_. selfMessage_ is set in
3108  // createFromSends.
3109  std::ostringstream os;
3110  os << "Proc " << myRank << ": {numProcsSendingToMe: "
3111  << numProcsSendingToMe << ", remoteProcIDs.size(): "
3112  << remoteProcIDs.size () << ", selfMessage_: "
3113  << (selfMessage_ ? "true" : "false") << "}" << std::endl;
3114  *out_ << os.str ();
3115  }
3116 
3117  if (verbose_) {
3118  *out_ << "Proc " << myRank << ": createFromRecvs done" << endl;
3119  }
3120 
3121  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3122  }
3123 
3124 
3125 } // namespace Tpetra
3126 
3127 #endif // TPETRA_DISTRIBUTOR_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Stand-alone utility functions and macros.
Sets up and executes a communication plan for a Tpetra DistObject.
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
size_t getMaxSendLength() const
Maximum number of values this process will send to another single process.
void createFromRecvs(const Teuchos::ArrayView< const Ordinal > &remoteIDs, const Teuchos::ArrayView< const int > &remoteProcIDs, Teuchos::Array< Ordinal > &exportIDs, Teuchos::Array< int > &exportProcIDs)
Set up Distributor using list of process ranks from which to receive.
Teuchos::ArrayView< const int > getProcsTo() const
Ranks of the processes to which this process will send values.
void doReversePosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a reverse plan, but do not execute the waits yet.
size_t getNumReceives() const
The number of processes from which we will receive data.
void doPosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a forward plan, but do not execute the waits yet.
void setParameterList(const Teuchos::RCP< Teuchos::ParameterList > &plist)
Set Distributor parameters.
size_t getTotalReceiveLength() const
Total number of values this process will receive from other processes.
virtual ~Distributor()=default
Destructor (virtual for memory safety).
bool hasSelfMessage() const
Whether the calling process will send messages to, or receive messages from, itself.
void swap(Distributor &rhs)
Swap the contents of rhs with those of *this.
Teuchos::ArrayView< const size_t > getLengthsTo() const
Number of values this process will send to each process.
Teuchos::ArrayView< const int > getProcsFrom() const
Ranks of the processes sending values to this process.
Teuchos::RCP< Distributor > getReverse() const
A reverse communication plan Distributor.
Distributor(const Teuchos::RCP< const Teuchos::Comm< int > > &comm)
Construct using the specified communicator and default parameters.
std::string description() const
Return a one-line description of this object.
size_t createFromSends(const Teuchos::ArrayView< const int > &exportProcIDs)
Set up Distributor using list of process ranks to which this process will send.
void createFromSendsAndRecvs(const Teuchos::ArrayView< const int > &exportProcIDs, const Teuchos::ArrayView< const int > &remoteProcIDs)
Set up Distributor using list of process ranks to which to send, and list of process ranks from which to receive.
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
List of valid Distributor parameters.
Teuchos::ArrayView< const size_t > getLengthsFrom() const
Number of values this process will receive from each process.
Details::EDistributorHowInitialized howInitialized() const
Return an enum indicating whether and how a Distributor was initialized.
size_t getNumSends() const
The number of processes to which we will send data.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Describe this object in a human-readable way to the given output stream.
void getLastDoStatistics(size_t &bytes_sent, size_t &bytes_recvd) const
Information on the last call to do/doReverse.
Implementation details of Tpetra.
std::string DistributorSendTypeEnumToString(EDistributorSendType sendType)
Convert an EDistributorSendType enum value to a string.
EDistributorSendType
The type of MPI send that Distributor should use.
EDistributorHowInitialized
Enum indicating how and whether a Distributor was initialized.
std::string DistributorHowInitializedEnumToString(EDistributorHowInitialized how)
Convert an EDistributorHowInitialized enum value to a string.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Teuchos::Array< std::string > distributorSendTypes()
Valid values for Distributor's "Send type" parameter.