Tpetra parallel linear algebra  Version of the Day
Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTRIBUTOR_HPP
43 #define TPETRA_DISTRIBUTOR_HPP
44 
45 #include "Tpetra_Util.hpp"
46 #include <Teuchos_as.hpp>
47 #include <Teuchos_Describable.hpp>
48 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
49 #include <Teuchos_VerboseObject.hpp>
50 
51 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
52 // doPosts (both versions) and doWaits, and register those timers with
53 // Teuchos::TimeMonitor so that summarize() or report() will show
54 // results.
55 
56 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
57 // # define TPETRA_DISTRIBUTOR_TIMERS 1
58 // #endif // TPETRA_DISTRIBUTOR_TIMERS
59 
60 #ifdef TPETRA_DISTRIBUTOR_TIMERS
61 # undef TPETRA_DISTRIBUTOR_TIMERS
62 #endif // TPETRA_DISTRIBUTOR_TIMERS
63 
64 #include "KokkosCompat_View.hpp"
65 #include "Kokkos_Core.hpp"
66 #include "Kokkos_TeuchosCommAdapters.hpp"
67 
68 
69 namespace Tpetra {
70 
namespace Details {
  /// \brief The kind of send operation that Distributor uses to post sends.
  ///
  /// NOTE(review): the `enum EDistributorSendType {` header line was
  /// missing from this copy of the file; the name is the one used by
  /// Distributor::doPosts() (Details::EDistributorSendType).
  enum EDistributorSendType {
    DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
    DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
    DISTRIBUTOR_SEND,  // Use MPI_Send (Teuchos::send)
    DISTRIBUTOR_SSEND  // Use MPI_Ssend (Teuchos::ssend)
  };

  /// \brief Convert an EDistributorSendType value to a string.
  ///
  /// Declaration only; the definition is not visible in this part of
  /// the file.
  std::string
  DistributorSendTypeEnumToString (EDistributorSendType sendType);

  /// \brief How a Distributor instance was initialized.
  ///
  /// NOTE(review): the `enum EDistributorHowInitialized {` header line
  /// was missing from this copy of the file; the name is the type of
  /// Distributor::howInitialized_.
  enum EDistributorHowInitialized {
    DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
    DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
    DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
    DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
    DISTRIBUTOR_INITIALIZED_BY_COPY // By copy constructor
  };

  /// \brief Convert an EDistributorHowInitialized value to a string.
  ///
  /// NOTE(review): only the `std::string` return type of this
  /// declaration survived in this copy.  The function and parameter
  /// names are assumed by analogy with DistributorSendTypeEnumToString
  /// -- confirm against the definition.
  std::string
  DistributorHowInitializedEnumToString (EDistributorHowInitialized how);

} // namespace Details
110 
// Free-function declaration; no definition is visible in this part of the file.
// Returns an Array of std::string.  Presumably these are the names of the
// valid send types (cf. Details::EDistributorSendType) -- TODO confirm
// against the definition.
117  Array<std::string> distributorSendTypes ();
118 
186  class Distributor :
187  public Teuchos::Describable,
188  public Teuchos::ParameterListAcceptorDefaultBase,
189  public Teuchos::VerboseObject<Distributor> {
190  public:
192 
193 
201  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
202 
212  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
213  const Teuchos::RCP<Teuchos::FancyOStream>& out);
214 
227  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
228  const Teuchos::RCP<Teuchos::ParameterList>& plist);
229 
244  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
245  const Teuchos::RCP<Teuchos::FancyOStream>& out,
246  const Teuchos::RCP<Teuchos::ParameterList>& plist);
247 
249  Distributor (const Distributor &distributor);
250 
252  virtual ~Distributor ();
253 
259  void swap (Distributor& rhs);
260 
262 
264 
269  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
270 
275  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
276 
278 
280 
300  size_t createFromSends (const ArrayView<const int>& exportNodeIDs);
301 
335  template <class Ordinal>
336  void
337  createFromRecvs (const ArrayView<const Ordinal>& remoteIDs,
338  const ArrayView<const int>& remoteNodeIDs,
339  Array<Ordinal>& exportIDs,
340  Array<int>& exportNodeIDs);
341 
343 
345 
349  size_t getNumReceives() const;
350 
354  size_t getNumSends() const;
355 
357  bool hasSelfMessage() const;
358 
360  size_t getMaxSendLength() const;
361 
363  size_t getTotalReceiveLength() const;
364 
369  ArrayView<const int> getImagesFrom() const;
370 
375  ArrayView<const int> getImagesTo() const;
376 
384  ArrayView<const size_t> getLengthsFrom() const;
385 
393  ArrayView<const size_t> getLengthsTo() const;
394 
400  return howInitialized_;
401  }
402 
404 
406 
417  RCP<Distributor> getReverse() const;
418 
420 
422 
443  template <class Packet>
444  void
445  doPostsAndWaits (const ArrayView<const Packet> &exports,
446  size_t numPackets,
447  const ArrayView<Packet> &imports);
448 
470  template <class Packet>
471  void
472  doPostsAndWaits (const ArrayView<const Packet> &exports,
473  const ArrayView<size_t> &numExportPacketsPerLID,
474  const ArrayView<Packet> &imports,
475  const ArrayView<size_t> &numImportPacketsPerLID);
476 
501  template <class Packet>
502  void
503  doPosts (const ArrayRCP<const Packet> &exports,
504  size_t numPackets,
505  const ArrayRCP<Packet> &imports);
506 
525  template <class Packet>
526  void
527  doPosts (const ArrayRCP<const Packet> &exports,
528  const ArrayView<size_t> &numExportPacketsPerLID,
529  const ArrayRCP<Packet> &imports,
530  const ArrayView<size_t> &numImportPacketsPerLID);
531 
538  void doWaits ();
539 
544  template <class Packet>
545  void
546  doReversePostsAndWaits (const ArrayView<const Packet> &exports,
547  size_t numPackets,
548  const ArrayView<Packet> &imports);
549 
554  template <class Packet>
555  void
556  doReversePostsAndWaits (const ArrayView<const Packet> &exports,
557  const ArrayView<size_t> &numExportPacketsPerLID,
558  const ArrayView<Packet> &imports,
559  const ArrayView<size_t> &numImportPacketsPerLID);
560 
565  template <class Packet>
566  void
567  doReversePosts (const ArrayRCP<const Packet> &exports,
568  size_t numPackets,
569  const ArrayRCP<Packet> &imports);
570 
575  template <class Packet>
576  void
577  doReversePosts (const ArrayRCP<const Packet> &exports,
578  const ArrayView<size_t> &numExportPacketsPerLID,
579  const ArrayRCP<Packet> &imports,
580  const ArrayView<size_t> &numImportPacketsPerLID);
581 
588  void doReverseWaits ();
589 
610  template <class Packet, class Layout, class Device, class Mem>
611  void
612  doPostsAndWaits (
613  const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
614  size_t numPackets,
615  const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
616 
638  template <class Packet, class Layout, class Device, class Mem>
639  void
640  doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
641  const ArrayView<size_t> &numExportPacketsPerLID,
642  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
643  const ArrayView<size_t> &numImportPacketsPerLID);
644 
669  template <class Packet, class Layout, class Device, class Mem>
670  void
671  doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
672  size_t numPackets,
673  const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
674 
693  template <class Packet, class Layout, class Device, class Mem>
694  void
695  doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
696  const ArrayView<size_t> &numExportPacketsPerLID,
697  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
698  const ArrayView<size_t> &numImportPacketsPerLID);
699 
704  template <class Packet, class Layout, class Device, class Mem>
705  void
706  doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
707  size_t numPackets,
708  const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
709 
714  template <class Packet, class Layout, class Device, class Mem>
715  void
716  doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
717  const ArrayView<size_t> &numExportPacketsPerLID,
718  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
719  const ArrayView<size_t> &numImportPacketsPerLID);
720 
725  template <class Packet, class Layout, class Device, class Mem>
726  void
727  doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
728  size_t numPackets,
729  const Kokkos::View<Packet*, Layout, Device, Mem> &imports);
730 
735  template <class Packet, class Layout, class Device, class Mem>
736  void
737  doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
738  const ArrayView<size_t> &numExportPacketsPerLID,
739  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
740  const ArrayView<size_t> &numImportPacketsPerLID);
741 
745  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{
746  bytes_sent = lastRoundBytesSend_;
747  bytes_recvd = lastRoundBytesRecv_;
748  }
749 
750 
752 
754 
756  std::string description() const;
757 
759  void describe (Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const;
760 
762 
763  private:
765  RCP<const Comm<int> > comm_;
766 
768  Teuchos::RCP<Teuchos::FancyOStream> out_;
769 
771  Details::EDistributorHowInitialized howInitialized_;
772 
774 
775 
778 
780  bool barrierBetween_;
781 
783  bool debug_;
784 
786 
795  bool enable_cuda_rdma_;
796 
798 
807  size_t numExports_;
808 
812  bool selfMessage_;
813 
823  size_t numSends_;
824 
829  Array<int> imagesTo_;
830 
839  Array<size_t> startsTo_;
840 
846  Array<size_t> lengthsTo_;
847 
851  size_t maxSendLength_;
852 
868  Array<size_t> indicesTo_;
869 
879  size_t numReceives_;
880 
887  size_t totalReceiveLength_;
888 
894  Array<size_t> lengthsFrom_;
895 
901  Array<int> imagesFrom_;
902 
908  Array<size_t> startsFrom_;
909 
916  Array<size_t> indicesFrom_;
917 
924  Array<RCP<Teuchos::CommRequest<int> > > requests_;
925 
930  mutable RCP<Distributor> reverseDistributor_;
931 
932 
934  size_t lastRoundBytesSend_;
935 
937  size_t lastRoundBytesRecv_;
938 
939 #ifdef TPETRA_DISTRIBUTOR_TIMERS
940  Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
941  Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
942  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
943  Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
944  Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
945  Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
946  Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
947  Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
948  Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
949 
951  void makeTimers ();
952 #endif // TPETRA_DISTRIBUTOR_TIMERS
953 
965  bool useDistinctTags_;
966 
971  int getTag (const int pathTag) const;
972 
986  void
987  init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
988  const Teuchos::RCP<Teuchos::ParameterList>& plist);
989 
1000  void computeReceives ();
1001 
1014  template <class Ordinal>
1015  void computeSends (const ArrayView<const Ordinal> &importIDs,
1016  const ArrayView<const int> &importNodeIDs,
1017  Array<Ordinal> &exportIDs,
1018  Array<int> &exportNodeIDs);
1019 
1021  void createReverseDistributor() const;
1022 
1023  }; // class Distributor
1024 
1025 
1026  template <class Packet>
1027  void Distributor::
1028  doPostsAndWaits (const ArrayView<const Packet>& exports,
1029  size_t numPackets,
1030  const ArrayView<Packet>& imports)
1031  {
1032  using Teuchos::arcp;
1033  using Teuchos::ArrayRCP;
1034  typedef typename ArrayRCP<const Packet>::size_type size_type;
1035 
1036  TEUCHOS_TEST_FOR_EXCEPTION(
1037  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1038  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1039  " outstanding nonblocking messages pending. It is incorrect to call "
1040  "this method with posts outstanding.");
1041 
1042  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1043  // requiring that the memory location is persisting (as is
1044  // necessary for nonblocking receives). However, it need only
1045  // persist until doWaits() completes, so it is safe for us to use
1046  // a nonpersisting reference in this case. The use of a
1047  // nonpersisting reference is purely a performance optimization.
1048 
1049  //const Packet* exportsPtr = exports.getRawPtr();
1050  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1051  // exports.size(), false);
1052  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1053  static_cast<size_type> (0),
1054  exports.size(), false);
1055 
1056  // For some reason, neither of the options below (that use arcp)
1057  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1058  // issue only arises with the exports array. This is why we
1059  // construct a separate nonowning ArrayRCP.
1060 
1061  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1062  // numPackets,
1063  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1064  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1065  // numPackets,
1066  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1067  doPosts (exportsArcp,
1068  numPackets,
1069  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1070  doWaits ();
1071 
1072  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1073  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1074  }
1075 
1076  template <class Packet>
1077  void Distributor::
1078  doPostsAndWaits (const ArrayView<const Packet>& exports,
1079  const ArrayView<size_t> &numExportPacketsPerLID,
1080  const ArrayView<Packet> &imports,
1081  const ArrayView<size_t> &numImportPacketsPerLID)
1082  {
1083  using Teuchos::arcp;
1084  using Teuchos::ArrayRCP;
1085 
1086  TEUCHOS_TEST_FOR_EXCEPTION(
1087  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1088  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1089  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1090  "with posts outstanding.");
1091 
1092  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1093  // requiring that the memory location is persisting (as is
1094  // necessary for nonblocking receives). However, it need only
1095  // persist until doWaits() completes, so it is safe for us to use
1096  // a nonpersisting reference in this case.
1097 
1098  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1099  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1100  // with some versions of GCC. The issue only arises with the
1101  // exports array. This is why we construct a separate nonowning
1102  // ArrayRCP.
1103  typedef typename ArrayRCP<const Packet>::size_type size_type;
1104  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1105  static_cast<size_type> (0),
1106  exports.size (), false);
1107  // mfh 04 Apr 2012: This is the offending code. This statement
1108  // would normally be in place of "exportsArcp" in the
1109  // doPosts() call below.
1110  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1111  doPosts (exportsArcp,
1112  numExportPacketsPerLID,
1113  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1114  numImportPacketsPerLID);
1115  doWaits ();
1116 
1117  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1118  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1119  }
1120 
1121 
1122  template <class Packet>
1123  void Distributor::
1124  doPosts (const ArrayRCP<const Packet>& exports,
1125  size_t numPackets,
1126  const ArrayRCP<Packet>& imports)
1127  {
1128  using Teuchos::Array;
1129  using Teuchos::as;
1130  using Teuchos::FancyOStream;
1131  using Teuchos::includesVerbLevel;
1132  using Teuchos::ireceive;
1133  using Teuchos::isend;
1134  using Teuchos::OSTab;
1135  using Teuchos::readySend;
1136  using Teuchos::send;
1137  using Teuchos::ssend;
1138  using Teuchos::TypeNameTraits;
1139  using Teuchos::typeName;
1140  using std::endl;
1141  typedef Array<size_t>::size_type size_type;
1142 
1143  Teuchos::OSTab tab (out_);
1144 
1145 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1146  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
1147 #endif // TPETRA_DISTRIBUTOR_TIMERS
1148 
1149  // Run-time configurable parameters that come from the input
1150  // ParameterList set by setParameterList().
1151  const Details::EDistributorSendType sendType = sendType_;
1152  const bool doBarrier = barrierBetween_;
1153 
1154 // #ifdef HAVE_TEUCHOS_DEBUG
1155 // // Prepare for verbose output, if applicable.
1156 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1157 // (void) verbLevel; // Silence "unused variable" compiler warning.
1158 // RCP<FancyOStream> out = this->getOStream ();
1159 // // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1160 // // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1161 // const bool doPrint = out.get () && (comm_->getRank () == 0);
1162 
1163 // if (doPrint) {
1164 // // Only need one process to print out parameters.
1165 // *out << "Distributor::doPosts (3 args)" << endl;
1166 // }
1167 // // Add one tab level. We declare this outside the doPrint scopes
1168 // // so that the tab persists until the end of this method.
1169 // OSTab tab = this->getOSTab ();
1170 // if (doPrint) {
1171 // *out << "Parameters:" << endl;
1172 // {
1173 // OSTab tab2 (out);
1174 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1175 // << endl << "barrierBetween: " << doBarrier << endl;
1176 // }
1177 // }
1178 // #endif // HAVE_TEUCHOS_DEBUG
1179 
1180  TEUCHOS_TEST_FOR_EXCEPTION(
1181  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1182  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
1183  "barrier between posting receives and posting ready sends. This should "
1184  "have been checked before. "
1185  "Please report this bug to the Tpetra developers.");
1186 
1187  const int myImageID = comm_->getRank ();
1188  size_t selfReceiveOffset = 0;
1189 
1190  // Each message has the same number of packets.
1191  //
1192  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1193  // inequality to a less-than seems to have fixed Bug 6170. It's
1194  // OK for the 'imports' array to be longer than it needs to be;
1195  // I'm just curious why it would be.
1196  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1197  TEUCHOS_TEST_FOR_EXCEPTION(
1198  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1199  std::invalid_argument, "Tpetra::Distributor::doPosts(3 args): The "
1200  "'imports' array must have enough entries to hold the expected number "
1201  "of import packets. imports.size() = " << imports.size () << " < "
1202  "totalNumImportPackets = " << totalNumImportPackets << ".");
1203 
1204  // MPI tag for nonblocking receives and blocking sends in this
1205  // method. Some processes might take the "fast" path
1206  // (indicesTo_.empty()) and others might take the "slow" path for
1207  // the same doPosts() call, so the path tag must be the same for
1208  // both.
1209  const int pathTag = 0;
1210  const int tag = this->getTag (pathTag);
1211 
1212  if (debug_) {
1213  TEUCHOS_TEST_FOR_EXCEPTION(
1214  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1215  "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
1216  << requests_.size () << " != 0.");
1217  std::ostringstream os;
1218  os << myImageID << ": doPosts(3,"
1219  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1220  *out_ << os.str ();
1221  }
1222 
1223  // Distributor uses requests_.size() as the number of outstanding
1224  // nonblocking message requests, so we resize to zero to maintain
1225  // this invariant.
1226  //
1227  // numReceives_ does _not_ include the self message, if there is
1228  // one. Here, we do actually send a message to ourselves, so we
1229  // include any self message in the "actual" number of receives to
1230  // post.
1231  //
1232  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1233  // doesn't (re)allocate its array of requests. That happens in
1234  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1235  // demand), or Resize_().
1236  const size_type actualNumReceives = as<size_type> (numReceives_) +
1237  as<size_type> (selfMessage_ ? 1 : 0);
1238  requests_.resize (0);
1239 
1240  // Post the nonblocking receives. It's common MPI wisdom to post
1241  // receives before sends. In MPI terms, this means favoring
1242  // adding to the "posted queue" (of receive requests) over adding
1243  // to the "unexpected queue" (of arrived messages not yet matched
1244  // with a receive).
1245  {
1246 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1247  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
1248 #endif // TPETRA_DISTRIBUTOR_TIMERS
1249 
1250  size_t curBufOffset = 0;
1251  for (size_type i = 0; i < actualNumReceives; ++i) {
1252  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1253  if (imagesFrom_[i] != myImageID) {
1254  // If my process is receiving these packet(s) from another
1255  // process (not a self-receive):
1256  //
1257  // 1. Set up the persisting view (recvBuf) of the imports
1258  // array, given the offset and size (total number of
1259  // packets from process imagesFrom_[i]).
1260  // 2. Start the Irecv and save the resulting request.
1261  TEUCHOS_TEST_FOR_EXCEPTION(
1262  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1263  std::logic_error, "Tpetra::Distributor::doPosts(3 args): Exceeded "
1264  "size of 'imports' array in packing loop on Process " << myImageID
1265  << ". imports.size() = " << imports.size () << " < offset + length"
1266  " = " << (curBufOffset + curBufLen) << ".");
1267 
1268  ArrayRCP<Packet> recvBuf =
1269  imports.persistingView (curBufOffset, curBufLen);
1270  requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
1271  tag, *comm_));
1272  if (debug_) {
1273  std::ostringstream os;
1274  os << myImageID << ": doPosts(3,"
1275  << (indicesTo_.empty () ? "fast" : "slow") << "): "
1276  << "Posted irecv from Proc " << imagesFrom_[i] << " with "
1277  "specified tag " << tag << endl;
1278  *out_ << os.str ();
1279  }
1280  }
1281  else { // Receiving from myself
1282  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1283  }
1284  curBufOffset += curBufLen;
1285  }
1286  }
1287 
1288  if (doBarrier) {
1289 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1290  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
1291 #endif // TPETRA_DISTRIBUTOR_TIMERS
1292  // If we are using ready sends (MPI_Rsend) below, we need to do
1293  // a barrier before we post the ready sends. This is because a
1294  // ready send requires that its matching receive has already
1295  // been posted before the send has been posted. The only way to
1296  // guarantee that in this case is to use a barrier.
1297  comm_->barrier ();
1298  }
1299 
1300 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1301  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
1302 #endif // TPETRA_DISTRIBUTOR_TIMERS
1303 
1304  // setup scan through imagesTo_ list starting with higher numbered images
1305  // (should help balance message traffic)
1306  //
1307  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1308  // It doesn't depend on the input at all.
1309  size_t numBlocks = numSends_ + selfMessage_;
1310  size_t imageIndex = 0;
1311  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
1312  ++imageIndex;
1313  }
1314  if (imageIndex == numBlocks) {
1315  imageIndex = 0;
1316  }
1317 
1318  size_t selfNum = 0;
1319  size_t selfIndex = 0;
1320 
1321  if (indicesTo_.empty()) {
1322  if (debug_) {
1323  std::ostringstream os;
1324  os << myImageID << ": doPosts(3,fast): posting sends" << endl;
1325  *out_ << os.str ();
1326  }
1327 
1328  // Data are already blocked (laid out) by process, so we don't
1329  // need a separate send buffer (besides the exports array).
1330  for (size_t i = 0; i < numBlocks; ++i) {
1331  size_t p = i + imageIndex;
1332  if (p > (numBlocks - 1)) {
1333  p -= numBlocks;
1334  }
1335 
1336  if (imagesTo_[p] != myImageID) {
1337  ArrayView<const Packet> tmpSend =
1338  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1339 
1340  if (sendType == Details::DISTRIBUTOR_SEND) {
1341  send<int, Packet> (tmpSend.getRawPtr (),
1342  as<int> (tmpSend.size ()),
1343  imagesTo_[p], tag, *comm_);
1344  }
1345  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1346  ArrayRCP<const Packet> tmpSendBuf =
1347  exports.persistingView (startsTo_[p] * numPackets,
1348  lengthsTo_[p] * numPackets);
1349  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1350  tag, *comm_));
1351  }
1352  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1353  readySend<int, Packet> (tmpSend.getRawPtr (),
1354  as<int> (tmpSend.size ()),
1355  imagesTo_[p], tag, *comm_);
1356  }
1357  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1358  ssend<int, Packet> (tmpSend.getRawPtr (),
1359  as<int> (tmpSend.size ()),
1360  imagesTo_[p], tag, *comm_);
1361  } else {
1362  TEUCHOS_TEST_FOR_EXCEPTION(
1363  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1364  "Invalid send type. We should never get here. "
1365  "Please report this bug to the Tpetra developers.");
1366  }
1367 
1368  if (debug_) {
1369  std::ostringstream os;
1370  os << myImageID << ": doPosts(3,fast): "
1371  << "Posted send to Proc " << imagesTo_[i]
1372  << " w/ specified tag " << tag << endl;
1373  *out_ << os.str ();
1374  }
1375  }
1376  else { // "Sending" the message to myself
1377  selfNum = p;
1378  }
1379  }
1380 
1381  if (selfMessage_) {
1382  // This is how we "send a message to ourself": we copy from
1383  // the export buffer to the import buffer. That saves
1384  // Teuchos::Comm implementations other than MpiComm (in
1385  // particular, SerialComm) the trouble of implementing self
1386  // messages correctly. (To do this right, SerialComm would
1387  // need internal buffer space for messages, keyed on the
1388  // message's tag.)
1389  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1390  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1391  imports.begin()+selfReceiveOffset);
1392  }
1393  if (debug_) {
1394  std::ostringstream os;
1395  os << myImageID << ": doPosts(3,fast) done" << endl;
1396  *out_ << os.str ();
1397  }
1398  }
1399  else { // data are not blocked by image, use send buffer
1400  if (debug_) {
1401  std::ostringstream os;
1402  os << myImageID << ": doPosts(3,slow): posting sends" << endl;
1403  *out_ << os.str ();
1404  }
1405 
1406  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1407  // sends), because the buffer is only long enough for one send.
1408  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1409 
1410  TEUCHOS_TEST_FOR_EXCEPTION(
1411  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1412  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
1413  << "doesn't currently work with nonblocking sends.");
1414 
1415  for (size_t i = 0; i < numBlocks; ++i) {
1416  size_t p = i + imageIndex;
1417  if (p > (numBlocks - 1)) {
1418  p -= numBlocks;
1419  }
1420 
1421  if (imagesTo_[p] != myImageID) {
1422  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1423  size_t sendArrayOffset = 0;
1424  size_t j = startsTo_[p];
1425  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1426  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1427  srcEnd = srcBegin + numPackets;
1428  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1429  sendArrayOffset += numPackets;
1430  }
1431  ArrayView<const Packet> tmpSend =
1432  sendArray.view (0, lengthsTo_[p]*numPackets);
1433 
1434  if (sendType == Details::DISTRIBUTOR_SEND) {
1435  send<int, Packet> (tmpSend.getRawPtr (),
1436  as<int> (tmpSend.size ()),
1437  imagesTo_[p], tag, *comm_);
1438  }
1439  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1440  ArrayRCP<const Packet> tmpSendBuf =
1441  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1442  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1443  tag, *comm_));
1444  }
1445  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1446  readySend<int, Packet> (tmpSend.getRawPtr (),
1447  as<int> (tmpSend.size ()),
1448  imagesTo_[p], tag, *comm_);
1449  }
1450  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1451  ssend<int, Packet> (tmpSend.getRawPtr (),
1452  as<int> (tmpSend.size ()),
1453  imagesTo_[p], tag, *comm_);
1454  }
1455  else {
1456  TEUCHOS_TEST_FOR_EXCEPTION(
1457  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
1458  "Invalid send type. We should never get here. "
1459  "Please report this bug to the Tpetra developers.");
1460  }
1461 
1462  if (debug_) {
1463  std::ostringstream os;
1464  os << myImageID << ": doPosts(3,slow): "
1465  << "Posted send to Proc " << imagesTo_[i]
1466  << " w/ specified tag " << tag << endl;
1467  *out_ << os.str ();
1468  }
1469  }
1470  else { // "Sending" the message to myself
1471  selfNum = p;
1472  selfIndex = startsTo_[p];
1473  }
1474  }
1475 
1476  if (selfMessage_) {
1477  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1478  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1479  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1480  imports.begin() + selfReceiveOffset);
1481  ++selfIndex;
1482  selfReceiveOffset += numPackets;
1483  }
1484  }
1485  if (debug_) {
1486  std::ostringstream os;
1487  os << myImageID << ": doPosts(3,slow) done" << endl;
1488  *out_ << os.str ();
1489  }
1490  }
1491  }
1492 
1493  template <class Packet>
1494  void Distributor::
1495  doPosts (const ArrayRCP<const Packet>& exports,
1496  const ArrayView<size_t>& numExportPacketsPerLID,
1497  const ArrayRCP<Packet>& imports,
1498  const ArrayView<size_t>& numImportPacketsPerLID)
1499  {
1500  using Teuchos::Array;
1501  using Teuchos::as;
1502  using Teuchos::ireceive;
1503  using Teuchos::isend;
1504  using Teuchos::readySend;
1505  using Teuchos::send;
1506  using Teuchos::ssend;
1507  using Teuchos::TypeNameTraits;
1508 #ifdef HAVE_TEUCHOS_DEBUG
1509  using Teuchos::OSTab;
1510 #endif // HAVE_TEUCHOS_DEBUG
1511  using std::endl;
1512  typedef Array<size_t>::size_type size_type;
1513 
1514  Teuchos::OSTab tab (out_);
1515 
1516 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1517  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
1518 #endif // TPETRA_DISTRIBUTOR_TIMERS
1519 
1520  // Run-time configurable parameters that come from the input
1521  // ParameterList set by setParameterList().
1522  const Details::EDistributorSendType sendType = sendType_;
1523  const bool doBarrier = barrierBetween_;
1524 
1525 // #ifdef HAVE_TEUCHOS_DEBUG
1526 // // Prepare for verbose output, if applicable.
1527 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1528 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
1529 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1530 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1531 
1532 // if (doPrint) {
1533 // // Only need one process to print out parameters.
1534 // *out << "Distributor::doPosts (4 args)" << endl;
1535 // }
1536 // // Add one tab level. We declare this outside the doPrint scopes
1537 // // so that the tab persists until the end of this method.
1538 // Teuchos::OSTab tab = this->getOSTab ();
1539 // if (doPrint) {
1540 // *out << "Parameters:" << endl;
1541 // {
1542 // OSTab tab2 (out);
1543 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1544 // << endl << "barrierBetween: " << doBarrier << endl;
1545 // }
1546 // }
1547 // #endif // HAVE_TEUCHOS_DEBUG
1548 
1549  TEUCHOS_TEST_FOR_EXCEPTION(
1550  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1551  "Tpetra::Distributor::doPosts(4 args): Ready-send version requires a "
1552  "barrier between posting receives and posting ready sends. This should "
1553  "have been checked before. "
1554  "Please report this bug to the Tpetra developers.");
1555 
1556  const int myImageID = comm_->getRank ();
1557  size_t selfReceiveOffset = 0;
1558 
1559 #ifdef HAVE_TEUCHOS_DEBUG
1560  // Different messages may have different numbers of packets.
1561  size_t totalNumImportPackets = 0;
1562  for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
1563  totalNumImportPackets += numImportPacketsPerLID[ii];
1564  }
1565  TEUCHOS_TEST_FOR_EXCEPTION(
1566  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1567  std::runtime_error, "Tpetra::Distributor::doPosts(4 args): The 'imports' "
1568  "array must have enough entries to hold the expected number of import "
1569  "packets. imports.size() = " << imports.size() << " < "
1570  "totalNumImportPackets = " << totalNumImportPackets << ".");
1571 #endif // HAVE_TEUCHOS_DEBUG
1572 
1573  // MPI tag for nonblocking receives and blocking sends in this
1574  // method. Some processes might take the "fast" path
1575  // (indicesTo_.empty()) and others might take the "slow" path for
1576  // the same doPosts() call, so the path tag must be the same for
1577  // both.
1578  const int pathTag = 1;
1579  const int tag = this->getTag (pathTag);
1580 
1581  if (debug_) {
1582  TEUCHOS_TEST_FOR_EXCEPTION(
1583  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
1584  "doPosts(4 args): Process " << myImageID << ": requests_.size() = "
1585  << requests_.size () << " != 0.");
1586  std::ostringstream os;
1587  os << myImageID << ": doPosts(4,"
1588  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1589  *out_ << os.str ();
1590  }
1591 
1592  // Distributor uses requests_.size() as the number of outstanding
1593  // nonblocking message requests, so we resize to zero to maintain
1594  // this invariant.
1595  //
1596  // numReceives_ does _not_ include the self message, if there is
1597  // one. Here, we do actually send a message to ourselves, so we
1598  // include any self message in the "actual" number of receives to
1599  // post.
1600  //
1601  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1602  // doesn't (re)allocate its array of requests. That happens in
1603  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1604  // demand), or Resize_().
1605  const size_type actualNumReceives = as<size_type> (numReceives_) +
1606  as<size_type> (selfMessage_ ? 1 : 0);
1607  requests_.resize (0);
1608 
1609  // Post the nonblocking receives. It's common MPI wisdom to post
1610  // receives before sends. In MPI terms, this means favoring
1611  // adding to the "posted queue" (of receive requests) over adding
1612  // to the "unexpected queue" (of arrived messages not yet matched
1613  // with a receive).
1614  {
1615 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1616  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
1617 #endif // TPETRA_DISTRIBUTOR_TIMERS
1618 
1619  size_t curBufferOffset = 0;
1620  size_t curLIDoffset = 0;
1621  for (size_type i = 0; i < actualNumReceives; ++i) {
1622  size_t totalPacketsFrom_i = 0;
1623  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1624  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1625  }
1626  curLIDoffset += lengthsFrom_[i];
1627  if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
1628  // If my process is receiving these packet(s) from another
1629  // process (not a self-receive), and if there is at least
1630  // one packet to receive:
1631  //
1632  // 1. Set up the persisting view (recvBuf) into the imports
1633  // array, given the offset and size (total number of
1634  // packets from process imagesFrom_[i]).
1635  // 2. Start the Irecv and save the resulting request.
1636  ArrayRCP<Packet> recvBuf =
1637  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1638  requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
1639  tag, *comm_));
1640  }
1641  else { // Receiving these packet(s) from myself
1642  selfReceiveOffset = curBufferOffset; // Remember the offset
1643  }
1644  curBufferOffset += totalPacketsFrom_i;
1645  }
1646  }
1647 
1648  if (doBarrier) {
1649 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1650  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
1651 #endif // TPETRA_DISTRIBUTOR_TIMERS
1652  // If we are using ready sends (MPI_Rsend) below, we need to do
1653  // a barrier before we post the ready sends. This is because a
1654  // ready send requires that its matching receive has already
1655  // been posted before the send has been posted. The only way to
1656  // guarantee that in this case is to use a barrier.
1657  comm_->barrier ();
1658  }
1659 
1660 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1661  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
1662 #endif // TPETRA_DISTRIBUTOR_TIMERS
1663 
1664  // setup arrays containing starting-offsets into exports for each send,
1665  // and num-packets-to-send for each send.
1666  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1667  size_t maxNumPackets = 0;
1668  size_t curPKToffset = 0;
1669  for (size_t pp=0; pp<numSends_; ++pp) {
1670  sendPacketOffsets[pp] = curPKToffset;
1671  size_t numPackets = 0;
1672  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1673  numPackets += numExportPacketsPerLID[j];
1674  }
1675  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1676  packetsPerSend[pp] = numPackets;
1677  curPKToffset += numPackets;
1678  }
1679 
1680  // setup scan through imagesTo_ list starting with higher numbered images
1681  // (should help balance message traffic)
1682  size_t numBlocks = numSends_+ selfMessage_;
1683  size_t imageIndex = 0;
1684  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
1685  ++imageIndex;
1686  }
1687  if (imageIndex == numBlocks) {
1688  imageIndex = 0;
1689  }
1690 
1691  size_t selfNum = 0;
1692  size_t selfIndex = 0;
1693 
1694  if (indicesTo_.empty()) {
1695  if (debug_) {
1696  std::ostringstream os;
1697  os << myImageID << ": doPosts(4,fast): posting sends" << endl;
1698  *out_ << os.str ();
1699  }
1700 
1701  // Data are already blocked (laid out) by process, so we don't
1702  // need a separate send buffer (besides the exports array).
1703  for (size_t i = 0; i < numBlocks; ++i) {
1704  size_t p = i + imageIndex;
1705  if (p > (numBlocks - 1)) {
1706  p -= numBlocks;
1707  }
1708 
1709  if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
1710  ArrayView<const Packet> tmpSend =
1711  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1712 
1713  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1714  send<int, Packet> (tmpSend.getRawPtr (),
1715  as<int> (tmpSend.size ()),
1716  imagesTo_[p], tag, *comm_);
1717  }
1718  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1719  readySend<int, Packet> (tmpSend.getRawPtr (),
1720  as<int> (tmpSend.size ()),
1721  imagesTo_[p], tag, *comm_);
1722  }
1723  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1724  ArrayRCP<const Packet> tmpSendBuf =
1725  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1726  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1727  tag, *comm_));
1728  }
1729  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1730  ssend<int, Packet> (tmpSend.getRawPtr (),
1731  as<int> (tmpSend.size ()),
1732  imagesTo_[p], tag, *comm_);
1733  }
1734  else {
1735  TEUCHOS_TEST_FOR_EXCEPTION(
1736  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
1737  "Invalid send type. We should never get here. Please report "
1738  "this bug to the Tpetra developers.");
1739  }
1740  }
1741  else { // "Sending" the message to myself
1742  selfNum = p;
1743  }
1744  }
1745 
1746  if (selfMessage_) {
1747  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1748  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1749  imports.begin()+selfReceiveOffset);
1750  }
1751  if (debug_) {
1752  std::ostringstream os;
1753  os << myImageID << ": doPosts(4,fast) done" << endl;
1754  *out_ << os.str ();
1755  }
1756  }
1757  else { // data are not blocked by image, use send buffer
1758  if (debug_) {
1759  std::ostringstream os;
1760  os << myImageID << ": doPosts(4,slow): posting sends" << endl;
1761  *out_ << os.str ();
1762  }
1763 
1764  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1765  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1766 
1767  TEUCHOS_TEST_FOR_EXCEPTION(
1768  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1769  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" "
1770  "code path may not necessarily work with nonblocking sends.");
1771 
1772  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1773  size_t ioffset = 0;
1774  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1775  indicesOffsets[j] = ioffset;
1776  ioffset += numExportPacketsPerLID[j];
1777  }
1778 
1779  for (size_t i = 0; i < numBlocks; ++i) {
1780  size_t p = i + imageIndex;
1781  if (p > (numBlocks - 1)) {
1782  p -= numBlocks;
1783  }
1784 
1785  if (imagesTo_[p] != myImageID) {
1786  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1787  size_t sendArrayOffset = 0;
1788  size_t j = startsTo_[p];
1789  size_t numPacketsTo_p = 0;
1790  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1791  srcBegin = exports.begin() + indicesOffsets[j];
1792  srcEnd = srcBegin + numExportPacketsPerLID[j];
1793  numPacketsTo_p += numExportPacketsPerLID[j];
1794  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1795  sendArrayOffset += numExportPacketsPerLID[j];
1796  }
1797  if (numPacketsTo_p > 0) {
1798  ArrayView<const Packet> tmpSend =
1799  sendArray.view (0, numPacketsTo_p);
1800 
1801  if (sendType == Details::DISTRIBUTOR_RSEND) {
1802  readySend<int, Packet> (tmpSend.getRawPtr (),
1803  as<int> (tmpSend.size ()),
1804  imagesTo_[p], tag, *comm_);
1805  }
1806  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1807  ArrayRCP<const Packet> tmpSendBuf =
1808  sendArray.persistingView (0, numPacketsTo_p);
1809  requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p],
1810  tag, *comm_));
1811  }
1812  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1813  ssend<int, Packet> (tmpSend.getRawPtr (),
1814  as<int> (tmpSend.size ()),
1815  imagesTo_[p], tag, *comm_);
1816  }
1817  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
1818  send<int, Packet> (tmpSend.getRawPtr (),
1819  as<int> (tmpSend.size ()),
1820  imagesTo_[p], tag, *comm_);
1821  }
1822  }
1823  }
1824  else { // "Sending" the message to myself
1825  selfNum = p;
1826  selfIndex = startsTo_[p];
1827  }
1828  }
1829 
1830  if (selfMessage_) {
1831  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1832  std::copy (exports.begin()+indicesOffsets[selfIndex],
1833  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1834  imports.begin() + selfReceiveOffset);
1835  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1836  ++selfIndex;
1837  }
1838  }
1839  if (debug_) {
1840  std::ostringstream os;
1841  os << myImageID << ": doPosts(4,slow) done" << endl;
1842  *out_ << os.str ();
1843  }
1844  }
1845  }
1846 
1847  template <class Packet>
1848  void Distributor::
1849  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
1850  size_t numPackets,
1851  const ArrayView<Packet>& imports)
1852  {
1853  using Teuchos::as;
1854 
1855  // doReversePosts() takes exports and imports as ArrayRCPs,
1856  // requiring that the memory locations are persisting. However,
1857  // they need only persist within the scope of that routine, so it
1858  // is safe for us to use nonpersisting references in this case.
1859 
1860  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1861  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1862  // with some versions of GCC. The issue only arises with the
1863  // exports array. This is why we construct a separate nonowning
1864  // ArrayRCP.
1865  typedef typename ArrayRCP<const Packet>::size_type size_type;
1866  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1867  exports.size(), false);
1868  // mfh 04 Apr 2012: This is the offending code. This statement
1869  // would normally be in place of "exportsArcp" in the
1870  // doReversePosts() call below.
1871  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1872  doReversePosts (exportsArcp,
1873  numPackets,
1874  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1875  doReverseWaits ();
1876 
1877  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1878  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1879  }
1880 
1881  template <class Packet>
1882  void Distributor::
1883  doReversePostsAndWaits (const ArrayView<const Packet>& exports,
1884  const ArrayView<size_t> &numExportPacketsPerLID,
1885  const ArrayView<Packet> &imports,
1886  const ArrayView<size_t> &numImportPacketsPerLID)
1887  {
1888  using Teuchos::as;
1889  using Teuchos::arcp;
1890  using Teuchos::ArrayRCP;
1891 
1892  TEUCHOS_TEST_FOR_EXCEPTION(
1893  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1894  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1895  << " outstanding nonblocking messages pending. It is incorrect to call "
1896  "this method with posts outstanding.");
1897 
1898  // doReversePosts() accepts the exports and imports arrays as
1899  // ArrayRCPs, requiring that the memory location is persisting (as
1900  // is necessary for nonblocking receives). However, it need only
1901  // persist until doReverseWaits() completes, so it is safe for us
1902  // to use a nonpersisting reference in this case. The use of a
1903  // nonpersisting reference is purely a performance optimization.
1904 
1905  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1906  // for Packet=std::complex<double> fails to compile with some
1907  // versions of GCC. The issue only arises with the exports array.
1908  // This is why we construct a separate nonowning ArrayRCP.
1909  typedef typename ArrayRCP<const Packet>::size_type size_type;
1910  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1911  exports.size (), false);
1912  doReversePosts (exportsArcp,
1913  numExportPacketsPerLID,
1914  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1915  numImportPacketsPerLID);
1916  doReverseWaits ();
1917 
1918  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1919  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1920  }
1921 
1922  template <class Packet>
1923  void Distributor::
1924  doReversePosts (const ArrayRCP<const Packet>& exports,
1925  size_t numPackets,
1926  const ArrayRCP<Packet>& imports)
1927  {
1928  // FIXME (mfh 29 Mar 2012) WHY?
1929  TEUCHOS_TEST_FOR_EXCEPTION(
1930  ! indicesTo_.empty (), std::runtime_error,
1931  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1932  "communication when original data are blocked by process.");
1933  if (reverseDistributor_.is_null ()) {
1934  createReverseDistributor ();
1935  }
1936  reverseDistributor_->doPosts (exports, numPackets, imports);
1937  }
1938 
1939  template <class Packet>
1940  void Distributor::
1941  doReversePosts (const ArrayRCP<const Packet>& exports,
1942  const ArrayView<size_t>& numExportPacketsPerLID,
1943  const ArrayRCP<Packet>& imports,
1944  const ArrayView<size_t>& numImportPacketsPerLID)
1945  {
1946  // FIXME (mfh 29 Mar 2012) WHY?
1947  TEUCHOS_TEST_FOR_EXCEPTION(
1948  ! indicesTo_.empty (), std::runtime_error,
1949  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1950  "communication when original data are blocked by process.");
1951  if (reverseDistributor_.is_null ()) {
1952  createReverseDistributor ();
1953  }
1954  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
1955  imports, numImportPacketsPerLID);
1956  }
1957 
1958  template <class Packet, class Layout, class Device, class Mem>
1959  void Distributor::
1960  doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
1961  size_t numPackets,
1962  const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
1963  {
1964  // If the MPI library doesn't support RDMA for communication
1965  // directly to or from the GPU's memory, we must transfer exports
1966  // to the host, do the communication, then transfer imports back
1967  // to the GPU.
1968  //
1969  // We need to do this here instead of doPosts() because the copy
1970  // back to the GPU must occur after the waits.
1971  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
1972  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
1973  const bool can_access_from_host =
1974  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace,
1975  typename exports_view::memory_space >::value;
1976  if (! enable_cuda_rdma_ && ! can_access_from_host) {
1977  typename exports_view::HostMirror host_exports =
1978  Kokkos::create_mirror_view (exports);
1979  typename imports_view::HostMirror host_imports =
1980  Kokkos::create_mirror_view (imports);
1981  Kokkos::deep_copy (host_exports, exports);
1982  doPostsAndWaits (Kokkos::Compat::create_const_view (host_exports),
1983  numPackets,
1984  host_imports);
1985  Kokkos::deep_copy (imports, host_imports);
1986  return;
1987  }
1988 
1989  TEUCHOS_TEST_FOR_EXCEPTION(
1990  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1991  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1992  " outstanding nonblocking messages pending. It is incorrect to call "
1993  "this method with posts outstanding.");
1994 
1995  doPosts (exports, numPackets, imports);
1996  doWaits ();
1997  }
1998 
1999  template <class Packet, class Layout, class Device, class Mem>
2000  void Distributor::
2001  doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2002  const ArrayView<size_t> &numExportPacketsPerLID,
2003  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
2004  const ArrayView<size_t> &numImportPacketsPerLID)
2005  {
2006  // If MPI library doesn't support RDMA for communication directly
2007  // to/from GPU, transfer exports to the host, do the communication, then
2008  // transfer imports back to the GPU
2009  //
2010  // We need to do this here instead of doPosts() because the copy back
2011  // to the GPU must occur after the waits.
2012  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
2013  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
2014  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2015  typename exports_view::HostMirror host_exports =
2016  Kokkos::create_mirror_view(exports);
2017  typename imports_view::HostMirror host_imports =
2018  Kokkos::create_mirror_view(imports);
2019  Kokkos::deep_copy (host_exports, exports);
2020  doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2021  numExportPacketsPerLID,
2022  host_imports,
2023  numImportPacketsPerLID);
2024  Kokkos::deep_copy (imports, host_imports);
2025  return;
2026  }
2027 
2028  TEUCHOS_TEST_FOR_EXCEPTION(
2029  requests_.size () != 0, std::runtime_error,
2030  "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
2031  << requests_.size () << " outstanding nonblocking messages pending. "
2032  "It is incorrect to call this method with posts outstanding.");
2033 
2034  doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2035  doWaits ();
2036  }
2037 
2038 
2039  template <class Packet, class Layout, class Device, class Mem>
2040  void Distributor::
2041  doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2042  size_t numPackets,
2043  const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
2044  {
2045  using Teuchos::Array;
2046  using Teuchos::as;
2047  using Teuchos::FancyOStream;
2048  using Teuchos::includesVerbLevel;
2049  using Teuchos::ireceive;
2050  using Teuchos::isend;
2051  using Teuchos::OSTab;
2052  using Teuchos::readySend;
2053  using Teuchos::send;
2054  using Teuchos::ssend;
2055  using Teuchos::TypeNameTraits;
2056  using Teuchos::typeName;
2057  using std::endl;
2058  using Kokkos::Compat::create_const_view;
2059  using Kokkos::Compat::create_view;
2060  using Kokkos::Compat::subview_offset;
2061  using Kokkos::Compat::deep_copy_offset;
2062  typedef Array<size_t>::size_type size_type;
2063  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
2064  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
2065 
2066  Teuchos::OSTab tab (out_);
2067 
2068 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2069  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
2070 #endif // TPETRA_DISTRIBUTOR_TIMERS
2071 
2072  // Run-time configurable parameters that come from the input
2073  // ParameterList set by setParameterList().
2074  const Details::EDistributorSendType sendType = sendType_;
2075  const bool doBarrier = barrierBetween_;
2076 
2077 // #ifdef HAVE_TEUCHOS_DEBUG
2078 // // Prepare for verbose output, if applicable.
2079 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2080 // (void) verbLevel; // Silence "unused variable" compiler warning.
2081 // RCP<FancyOStream> out = this->getOStream ();
2082 // // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2083 // // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2084 // const bool doPrint = out.get () && (comm_->getRank () == 0);
2085 
2086 // if (doPrint) {
2087 // // Only need one process to print out parameters.
2088 // *out << "Distributor::doPosts (3 args)" << endl;
2089 // }
2090 // // Add one tab level. We declare this outside the doPrint scopes
2091 // // so that the tab persists until the end of this method.
2092 // OSTab tab = this->getOSTab ();
2093 // if (doPrint) {
2094 // *out << "Parameters:" << endl;
2095 // {
2096 // OSTab tab2 (out);
2097 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2098 // << endl << "barrierBetween: " << doBarrier << endl;
2099 // }
2100 // }
2101 // #endif // HAVE_TEUCHOS_DEBUG
2102 
2103  TEUCHOS_TEST_FOR_EXCEPTION(
2104  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
2105  "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
2106  "barrier between posting receives and posting ready sends. This should "
2107  "have been checked before. "
2108  "Please report this bug to the Tpetra developers.");
2109 
2110  const int myImageID = comm_->getRank ();
2111  size_t selfReceiveOffset = 0;
2112 
2113  // Each message has the same number of packets.
2114  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2115  TEUCHOS_TEST_FOR_EXCEPTION(
2116  static_cast<size_t> (imports.dimension_0 ()) < totalNumImportPackets,
2117  std::runtime_error, "Tpetra::Distributor::doPosts(3 args): The 'imports' "
2118  "array must have enough entries to hold the expected number of import "
2119  "packets. imports.dimension_0() = " << imports.dimension_0 () << " < "
2120  "totalNumImportPackets = " << totalNumImportPackets << ".");
2121 
2122  // MPI tag for nonblocking receives and blocking sends in this
2123  // method. Some processes might take the "fast" path
2124  // (indicesTo_.empty()) and others might take the "slow" path for
2125  // the same doPosts() call, so the path tag must be the same for
2126  // both.
2127  const int pathTag = 0;
2128  const int tag = this->getTag (pathTag);
2129 
2130  if (debug_) {
2131  TEUCHOS_TEST_FOR_EXCEPTION(
2132  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2133  "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
2134  << requests_.size () << " != 0.");
2135  std::ostringstream os;
2136  os << myImageID << ": doPosts(3,"
2137  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2138  *out_ << os.str ();
2139  }
2140 
2141  // Distributor uses requests_.size() as the number of outstanding
2142  // nonblocking message requests, so we resize to zero to maintain
2143  // this invariant.
2144  //
2145  // numReceives_ does _not_ include the self message, if there is
2146  // one. Here, we do actually send a message to ourselves, so we
2147  // include any self message in the "actual" number of receives to
2148  // post.
2149  //
2150  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2151  // doesn't (re)allocate its array of requests. That happens in
2152  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2153  // demand), or Resize_().
2154  const size_type actualNumReceives = as<size_type> (numReceives_) +
2155  as<size_type> (selfMessage_ ? 1 : 0);
2156  requests_.resize (0);
2157 
2158  // Post the nonblocking receives. It's common MPI wisdom to post
2159  // receives before sends. In MPI terms, this means favoring
2160  // adding to the "posted queue" (of receive requests) over adding
2161  // to the "unexpected queue" (of arrived messages not yet matched
2162  // with a receive).
2163  {
2164 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2165  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
2166 #endif // TPETRA_DISTRIBUTOR_TIMERS
2167 
2168  size_t curBufferOffset = 0;
2169  for (size_type i = 0; i < actualNumReceives; ++i) {
2170  if (imagesFrom_[i] != myImageID) {
2171  // If my process is receiving these packet(s) from another
2172  // process (not a self-receive):
2173  //
2174  // 1. Set up the persisting view (recvBuf) of the imports
2175  // array, given the offset and size (total number of
2176  // packets from process imagesFrom_[i]).
2177  // 2. Start the Irecv and save the resulting request.
2178  imports_view_type recvBuf =
2179  subview_offset (imports, curBufferOffset, lengthsFrom_[i]*numPackets);
2180  requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
2181  tag, *comm_));
2182  if (debug_) {
2183  std::ostringstream os;
2184  os << myImageID << ": doPosts(3,"
2185  << (indicesTo_.empty () ? "fast" : "slow") << "): "
2186  << "Posted irecv from Proc " << imagesFrom_[i] << " with "
2187  "specified tag " << tag << endl;
2188  *out_ << os.str ();
2189  }
2190  }
2191  else { // Receiving from myself
2192  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2193  }
2194  curBufferOffset += lengthsFrom_[i]*numPackets;
2195  }
2196  }
2197 
2198  if (doBarrier) {
2199 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2200  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
2201 #endif // TPETRA_DISTRIBUTOR_TIMERS
2202  // If we are using ready sends (MPI_Rsend) below, we need to do
2203  // a barrier before we post the ready sends. This is because a
2204  // ready send requires that its matching receive has already
2205  // been posted before the send has been posted. The only way to
2206  // guarantee that in this case is to use a barrier.
2207  comm_->barrier ();
2208  }
2209 
2210 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2211  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
2212 #endif // TPETRA_DISTRIBUTOR_TIMERS
2213 
2214  // setup scan through imagesTo_ list starting with higher numbered images
2215  // (should help balance message traffic)
2216  //
2217  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2218  // It doesn't depend on the input at all.
2219  size_t numBlocks = numSends_ + selfMessage_;
2220  size_t imageIndex = 0;
2221  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
2222  ++imageIndex;
2223  }
2224  if (imageIndex == numBlocks) {
2225  imageIndex = 0;
2226  }
2227 
2228  size_t selfNum = 0;
2229  size_t selfIndex = 0;
2230 
2231  if (indicesTo_.empty()) {
2232  if (debug_) {
2233  std::ostringstream os;
2234  os << myImageID << ": doPosts(3,fast): posting sends" << endl;
2235  *out_ << os.str ();
2236  }
2237 
2238  // Data are already blocked (laid out) by process, so we don't
2239  // need a separate send buffer (besides the exports array).
2240  for (size_t i = 0; i < numBlocks; ++i) {
2241  size_t p = i + imageIndex;
2242  if (p > (numBlocks - 1)) {
2243  p -= numBlocks;
2244  }
2245 
2246  if (imagesTo_[p] != myImageID) {
2247  exports_view_type tmpSend = subview_offset(
2248  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2249 
2250  if (sendType == Details::DISTRIBUTOR_SEND) {
2251  send<int> (tmpSend,
2252  as<int> (tmpSend.size ()),
2253  imagesTo_[p], tag, *comm_);
2254  }
2255  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2256  exports_view_type tmpSendBuf =
2257  subview_offset (exports, startsTo_[p] * numPackets,
2258  lengthsTo_[p] * numPackets);
2259  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2260  tag, *comm_));
2261  }
2262  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2263  readySend<int> (tmpSend,
2264  as<int> (tmpSend.size ()),
2265  imagesTo_[p], tag, *comm_);
2266  }
2267  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2268  ssend<int> (tmpSend,
2269  as<int> (tmpSend.size ()),
2270  imagesTo_[p], tag, *comm_);
2271  } else {
2272  TEUCHOS_TEST_FOR_EXCEPTION(
2273  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2274  "Invalid send type. We should never get here. "
2275  "Please report this bug to the Tpetra developers.");
2276  }
2277 
2278  if (debug_) {
2279  std::ostringstream os;
2280  os << myImageID << ": doPosts(3,fast): "
2281  << "Posted send to Proc " << imagesTo_[i]
2282  << " w/ specified tag " << tag << endl;
2283  *out_ << os.str ();
2284  }
2285  }
2286  else { // "Sending" the message to myself
2287  selfNum = p;
2288  }
2289  }
2290 
2291  if (selfMessage_) {
2292  // This is how we "send a message to ourself": we copy from
2293  // the export buffer to the import buffer. That saves
2294  // Teuchos::Comm implementations other than MpiComm (in
2295  // particular, SerialComm) the trouble of implementing self
2296  // messages correctly. (To do this right, SerialComm would
2297  // need internal buffer space for messages, keyed on the
2298  // message's tag.)
2299  deep_copy_offset(imports, exports, selfReceiveOffset,
2300  startsTo_[selfNum]*numPackets,
2301  lengthsTo_[selfNum]*numPackets);
2302  }
2303  if (debug_) {
2304  std::ostringstream os;
2305  os << myImageID << ": doPosts(3,fast) done" << endl;
2306  *out_ << os.str ();
2307  }
2308  }
2309  else { // data are not blocked by image, use send buffer
2310  if (debug_) {
2311  std::ostringstream os;
2312  os << myImageID << ": doPosts(3,slow): posting sends" << endl;
2313  *out_ << os.str ();
2314  }
2315 
2316  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2317  // sends), because the buffer is only long enough for one send.
2318  Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
2319  create_view<Packet,Layout,Device,Mem> ("sendArray",
2320  maxSendLength_ * numPackets);
2321 
2322  TEUCHOS_TEST_FOR_EXCEPTION(
2323  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2324  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
2325  "doesn't currently work with nonblocking sends.");
2326 
2327  for (size_t i = 0; i < numBlocks; ++i) {
2328  size_t p = i + imageIndex;
2329  if (p > (numBlocks - 1)) {
2330  p -= numBlocks;
2331  }
2332 
2333  if (imagesTo_[p] != myImageID) {
2334  size_t sendArrayOffset = 0;
2335  size_t j = startsTo_[p];
2336  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2337  deep_copy_offset(sendArray, exports, sendArrayOffset,
2338  indicesTo_[j]*numPackets, numPackets);
2339  sendArrayOffset += numPackets;
2340  }
2341  Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
2342  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2343 
2344  if (sendType == Details::DISTRIBUTOR_SEND) {
2345  send<int> (tmpSend,
2346  as<int> (tmpSend.size ()),
2347  imagesTo_[p], tag, *comm_);
2348  }
2349  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2350  exports_view_type tmpSendBuf =
2351  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2352  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2353  tag, *comm_));
2354  }
2355  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2356  readySend<int> (tmpSend,
2357  as<int> (tmpSend.size ()),
2358  imagesTo_[p], tag, *comm_);
2359  }
2360  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2361  ssend<int> (tmpSend,
2362  as<int> (tmpSend.size ()),
2363  imagesTo_[p], tag, *comm_);
2364  }
2365  else {
2366  TEUCHOS_TEST_FOR_EXCEPTION(
2367  true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): "
2368  "Invalid send type. We should never get here. "
2369  "Please report this bug to the Tpetra developers.");
2370  }
2371 
2372  if (debug_) {
2373  std::ostringstream os;
2374  os << myImageID << ": doPosts(3,slow): "
2375  << "Posted send to Proc " << imagesTo_[i]
2376  << " w/ specified tag " << tag << endl;
2377  *out_ << os.str ();
2378  }
2379  }
2380  else { // "Sending" the message to myself
2381  selfNum = p;
2382  selfIndex = startsTo_[p];
2383  }
2384  }
2385 
2386  if (selfMessage_) {
2387  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2388  deep_copy_offset(imports, exports, selfReceiveOffset,
2389  indicesTo_[selfIndex]*numPackets, numPackets);
2390  ++selfIndex;
2391  selfReceiveOffset += numPackets;
2392  }
2393  }
2394  if (debug_) {
2395  std::ostringstream os;
2396  os << myImageID << ": doPosts(3,slow) done" << endl;
2397  *out_ << os.str ();
2398  }
2399  }
2400  }
2401 
2402  template <class Packet, class Layout, class Device, class Mem>
2403  void Distributor::
2404  doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2405  const ArrayView<size_t> &numExportPacketsPerLID,
2406  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
2407  const ArrayView<size_t> &numImportPacketsPerLID)
2408  {
2409  using Teuchos::Array;
2410  using Teuchos::as;
2411  using Teuchos::ireceive;
2412  using Teuchos::isend;
2413  using Teuchos::readySend;
2414  using Teuchos::send;
2415  using Teuchos::ssend;
2416  using Teuchos::TypeNameTraits;
2417 #ifdef HAVE_TEUCHOS_DEBUG
2418  using Teuchos::OSTab;
2419 #endif // HAVE_TEUCHOS_DEBUG
2420  using std::endl;
2421  using Kokkos::Compat::create_const_view;
2422  using Kokkos::Compat::create_view;
2423  using Kokkos::Compat::subview_offset;
2424  using Kokkos::Compat::deep_copy_offset;
2425  typedef Array<size_t>::size_type size_type;
2426  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
2427  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;
2428 
2429  Teuchos::OSTab tab (out_);
2430 
2431 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2432  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
2433 #endif // TPETRA_DISTRIBUTOR_TIMERS
2434 
2435  // Run-time configurable parameters that come from the input
2436  // ParameterList set by setParameterList().
2437  const Details::EDistributorSendType sendType = sendType_;
2438  const bool doBarrier = barrierBetween_;
2439 
2440 // #ifdef HAVE_TEUCHOS_DEBUG
2441 // // Prepare for verbose output, if applicable.
2442 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2443 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2444 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2445 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2446 
2447 // if (doPrint) {
2448 // // Only need one process to print out parameters.
2449 // *out << "Distributor::doPosts (4 args)" << endl;
2450 // }
2451 // // Add one tab level. We declare this outside the doPrint scopes
2452 // // so that the tab persists until the end of this method.
2453 // Teuchos::OSTab tab = this->getOSTab ();
2454 // if (doPrint) {
2455 // *out << "Parameters:" << endl;
2456 // {
2457 // OSTab tab2 (out);
2458 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2459 // << endl << "barrierBetween: " << doBarrier << endl;
2460 // }
2461 // }
2462 // #endif // HAVE_TEUCHOS_DEBUG
2463 
2464  TEUCHOS_TEST_FOR_EXCEPTION(
2465  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2466  std::logic_error, "Tpetra::Distributor::doPosts(4 args): Ready-send "
2467  "version requires a barrier between posting receives and posting ready "
2468  "sends. This should have been checked before. "
2469  "Please report this bug to the Tpetra developers.");
2470 
2471  const int myImageID = comm_->getRank ();
2472  size_t selfReceiveOffset = 0;
2473 
2474 #ifdef HAVE_TEUCHOS_DEBUG
2475  // Different messages may have different numbers of packets.
2476  size_t totalNumImportPackets = 0;
2477  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2478  totalNumImportPackets += numImportPacketsPerLID[ii];
2479  }
2480  TEUCHOS_TEST_FOR_EXCEPTION(
2481  imports.dimension_0 () < totalNumImportPackets, std::runtime_error,
2482  "Tpetra::Distributor::doPosts(4 args): The 'imports' array must have "
2483  "enough entries to hold the expected number of import packets. "
2484  "imports.dimension_0() = " << imports.dimension_0 () << " < "
2485  "totalNumImportPackets = " << totalNumImportPackets << ".");
2486 #endif // HAVE_TEUCHOS_DEBUG
2487 
2488  // MPI tag for nonblocking receives and blocking sends in this
2489  // method. Some processes might take the "fast" path
2490  // (indicesTo_.empty()) and others might take the "slow" path for
2491  // the same doPosts() call, so the path tag must be the same for
2492  // both.
2493  const int pathTag = 1;
2494  const int tag = this->getTag (pathTag);
2495 
2496  if (debug_) {
2497  TEUCHOS_TEST_FOR_EXCEPTION(
2498  requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2499  "doPosts(4 args): Process " << myImageID << ": requests_.size () = "
2500  << requests_.size () << " != 0.");
2501  std::ostringstream os;
2502  os << myImageID << ": doPosts(4,"
2503  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2504  *out_ << os.str ();
2505  }
2506 
2507  // Distributor uses requests_.size() as the number of outstanding
2508  // nonblocking message requests, so we resize to zero to maintain
2509  // this invariant.
2510  //
2511  // numReceives_ does _not_ include the self message, if there is
2512  // one. Here, we do actually send a message to ourselves, so we
2513  // include any self message in the "actual" number of receives to
2514  // post.
2515  //
2516  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2517  // doesn't (re)allocate its array of requests. That happens in
2518  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2519  // demand), or Resize_().
2520  const size_type actualNumReceives = as<size_type> (numReceives_) +
2521  as<size_type> (selfMessage_ ? 1 : 0);
2522  requests_.resize (0);
2523 
2524  // Post the nonblocking receives. It's common MPI wisdom to post
2525  // receives before sends. In MPI terms, this means favoring
2526  // adding to the "posted queue" (of receive requests) over adding
2527  // to the "unexpected queue" (of arrived messages not yet matched
2528  // with a receive).
2529  {
2530 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2531  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
2532 #endif // TPETRA_DISTRIBUTOR_TIMERS
2533 
2534  size_t curBufferOffset = 0;
2535  size_t curLIDoffset = 0;
2536  for (size_type i = 0; i < actualNumReceives; ++i) {
2537  size_t totalPacketsFrom_i = 0;
2538  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2539  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2540  }
2541  curLIDoffset += lengthsFrom_[i];
2542  if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
2543  // If my process is receiving these packet(s) from another
2544  // process (not a self-receive), and if there is at least
2545  // one packet to receive:
2546  //
2547  // 1. Set up the persisting view (recvBuf) into the imports
2548  // array, given the offset and size (total number of
2549  // packets from process imagesFrom_[i]).
2550  // 2. Start the Irecv and save the resulting request.
2551  imports_view_type recvBuf =
2552  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2553  requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i],
2554  tag, *comm_));
2555  }
2556  else { // Receiving these packet(s) from myself
2557  selfReceiveOffset = curBufferOffset; // Remember the offset
2558  }
2559  curBufferOffset += totalPacketsFrom_i;
2560  }
2561  }
2562 
2563  if (doBarrier) {
2564 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2565  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
2566 #endif // TPETRA_DISTRIBUTOR_TIMERS
2567  // If we are using ready sends (MPI_Rsend) below, we need to do
2568  // a barrier before we post the ready sends. This is because a
2569  // ready send requires that its matching receive has already
2570  // been posted before the send has been posted. The only way to
2571  // guarantee that in this case is to use a barrier.
2572  comm_->barrier ();
2573  }
2574 
2575 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2576  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
2577 #endif // TPETRA_DISTRIBUTOR_TIMERS
2578 
2579  // setup arrays containing starting-offsets into exports for each send,
2580  // and num-packets-to-send for each send.
2581  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2582  size_t maxNumPackets = 0;
2583  size_t curPKToffset = 0;
2584  for (size_t pp=0; pp<numSends_; ++pp) {
2585  sendPacketOffsets[pp] = curPKToffset;
2586  size_t numPackets = 0;
2587  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2588  numPackets += numExportPacketsPerLID[j];
2589  }
2590  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2591  packetsPerSend[pp] = numPackets;
2592  curPKToffset += numPackets;
2593  }
2594 
2595  // setup scan through imagesTo_ list starting with higher numbered images
2596  // (should help balance message traffic)
2597  size_t numBlocks = numSends_+ selfMessage_;
2598  size_t imageIndex = 0;
2599  while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) {
2600  ++imageIndex;
2601  }
2602  if (imageIndex == numBlocks) {
2603  imageIndex = 0;
2604  }
2605 
2606  size_t selfNum = 0;
2607  size_t selfIndex = 0;
2608 
2609  if (indicesTo_.empty()) {
2610  if (debug_) {
2611  std::ostringstream os;
2612  os << myImageID << ": doPosts(4,fast): posting sends" << endl;
2613  *out_ << os.str ();
2614  }
2615 
2616  // Data are already blocked (laid out) by process, so we don't
2617  // need a separate send buffer (besides the exports array).
2618  for (size_t i = 0; i < numBlocks; ++i) {
2619  size_t p = i + imageIndex;
2620  if (p > (numBlocks - 1)) {
2621  p -= numBlocks;
2622  }
2623 
2624  if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) {
2625  exports_view_type tmpSend =
2626  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2627 
2628  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2629  send<int> (tmpSend,
2630  as<int> (tmpSend.size ()),
2631  imagesTo_[p], tag, *comm_);
2632  }
2633  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2634  readySend<int> (tmpSend,
2635  as<int> (tmpSend.size ()),
2636  imagesTo_[p], tag, *comm_);
2637  }
2638  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2639  exports_view_type tmpSendBuf =
2640  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2641  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2642  tag, *comm_));
2643  }
2644  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2645  ssend<int> (tmpSend,
2646  as<int> (tmpSend.size ()),
2647  imagesTo_[p], tag, *comm_);
2648  }
2649  else {
2650  TEUCHOS_TEST_FOR_EXCEPTION(
2651  true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): "
2652  "Invalid send type. We should never get here. "
2653  "Please report this bug to the Tpetra developers.");
2654  }
2655  }
2656  else { // "Sending" the message to myself
2657  selfNum = p;
2658  }
2659  }
2660 
2661  if (selfMessage_) {
2662  deep_copy_offset(imports, exports, selfReceiveOffset,
2663  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2664  }
2665  if (debug_) {
2666  std::ostringstream os;
2667  os << myImageID << ": doPosts(4,fast) done" << endl;
2668  *out_ << os.str ();
2669  }
2670  }
2671  else { // data are not blocked by image, use send buffer
2672  if (debug_) {
2673  std::ostringstream os;
2674  os << myImageID << ": doPosts(4,slow): posting sends" << endl;
2675  *out_ << os.str ();
2676  }
2677 
2678  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2679  Kokkos::View<Packet*,Layout,Device,Mem> sendArray =
2680  create_view<Packet,Layout,Device,Mem>("sendArray", maxNumPackets); // send buffer
2681 
2682  TEUCHOS_TEST_FOR_EXCEPTION(
2683  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
2684  "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path "
2685  "may not necessarily work with nonblocking sends.");
2686 
2687  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2688  size_t ioffset = 0;
2689  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2690  indicesOffsets[j] = ioffset;
2691  ioffset += numExportPacketsPerLID[j];
2692  }
2693 
2694  for (size_t i = 0; i < numBlocks; ++i) {
2695  size_t p = i + imageIndex;
2696  if (p > (numBlocks - 1)) {
2697  p -= numBlocks;
2698  }
2699 
2700  if (imagesTo_[p] != myImageID) {
2701  size_t sendArrayOffset = 0;
2702  size_t j = startsTo_[p];
2703  size_t numPacketsTo_p = 0;
2704  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2705  deep_copy_offset(sendArray, exports, sendArrayOffset,
2706  indicesOffsets[j], numExportPacketsPerLID[j]);
2707  sendArrayOffset += numExportPacketsPerLID[j];
2708  }
2709  if (numPacketsTo_p > 0) {
2710  Kokkos::View<Packet*, Layout, Device, Mem> tmpSend =
2711  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2712 
2713  if (sendType == Details::DISTRIBUTOR_RSEND) {
2714  readySend<int> (tmpSend,
2715  as<int> (tmpSend.size ()),
2716  imagesTo_[p], tag, *comm_);
2717  }
2718  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2719  exports_view_type tmpSendBuf =
2720  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2721  requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p],
2722  tag, *comm_));
2723  }
2724  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2725  ssend<int> (tmpSend,
2726  as<int> (tmpSend.size ()),
2727  imagesTo_[p], tag, *comm_);
2728  }
2729  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
2730  send<int> (tmpSend,
2731  as<int> (tmpSend.size ()),
2732  imagesTo_[p], tag, *comm_);
2733  }
2734  }
2735  }
2736  else { // "Sending" the message to myself
2737  selfNum = p;
2738  selfIndex = startsTo_[p];
2739  }
2740  }
2741 
2742  if (selfMessage_) {
2743  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2744  deep_copy_offset(imports, exports, selfReceiveOffset,
2745  indicesOffsets[selfIndex],
2746  numExportPacketsPerLID[selfIndex]);
2747  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2748  ++selfIndex;
2749  }
2750  }
2751  if (debug_) {
2752  std::ostringstream os;
2753  os << myImageID << ": doPosts(4,slow) done" << endl;
2754  *out_ << os.str ();
2755  }
2756  }
2757  }
2758 
2759  template <class Packet, class Layout, class Device, class Mem>
2760  void Distributor::
2761  doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2762  size_t numPackets,
2763  const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
2764  {
2765  // If MPI library doesn't support RDMA for communication directly
2766  // to/from GPU, transfer exports to the host, do the communication, then
2767  // transfer imports back to the GPU
2768  //
2769  // We need to do this here instead of doPosts() because the copy back
2770  // to the GPU must occur after the waits.
2771  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
2772  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
2773  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2774  typename exports_view::HostMirror host_exports =
2775  Kokkos::create_mirror_view(exports);
2776  typename imports_view::HostMirror host_imports =
2777  Kokkos::create_mirror_view(imports);
2778  Kokkos::deep_copy (host_exports, exports);
2779  doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2780  numPackets,
2781  host_imports);
2782  Kokkos::deep_copy (imports, host_imports);
2783  return;
2784  }
2785 
2786  doReversePosts (exports, numPackets, imports);
2787  doReverseWaits ();
2788  }
2789 
2790  template <class Packet, class Layout, class Device, class Mem>
2791  void Distributor::
2792  doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2793  const ArrayView<size_t> &numExportPacketsPerLID,
2794  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
2795  const ArrayView<size_t> &numImportPacketsPerLID)
2796  {
2797  // If MPI library doesn't support RDMA for communication directly
2798  // to/from GPU, transfer exports to the host, do the communication, then
2799  // transfer imports back to the GPU
2800  //
2801  // We need to do this here instead of doPosts() because the copy back
2802  // to the GPU must occur after the waits.
2803  typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
2804  typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
2805  if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
2806  typename exports_view::HostMirror host_exports =
2807  Kokkos::create_mirror_view(exports);
2808  typename imports_view::HostMirror host_imports =
2809  Kokkos::create_mirror_view(imports);
2810  Kokkos::deep_copy (host_exports, exports);
2811  doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
2812  numExportPacketsPerLID,
2813  host_imports,
2814  numImportPacketsPerLID);
2815  Kokkos::deep_copy (imports, host_imports);
2816  return;
2817  }
2818 
2819  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2820  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2821  << requests_.size() << " outstanding nonblocking messages pending. It "
2822  "is incorrect to call this method with posts outstanding.");
2823 
2824  doReversePosts (exports, numExportPacketsPerLID, imports,
2825  numImportPacketsPerLID);
2826  doReverseWaits ();
2827  }
2828 
2829  template <class Packet, class Layout, class Device, class Mem>
2830  void Distributor::
2831  doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2832  size_t numPackets,
2833  const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
2834  {
2835  // FIXME (mfh 29 Mar 2012) WHY?
2836  TEUCHOS_TEST_FOR_EXCEPTION(
2837  ! indicesTo_.empty (), std::runtime_error,
2838  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2839  "reverse communication when original data are blocked by process.");
2840  if (reverseDistributor_.is_null ()) {
2841  createReverseDistributor ();
2842  }
2843  reverseDistributor_->doPosts (exports, numPackets, imports);
2844  }
2845 
2846  template <class Packet, class Layout, class Device, class Mem>
2847  void Distributor::
2848  doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
2849  const ArrayView<size_t> &numExportPacketsPerLID,
2850  const Kokkos::View<Packet*, Layout, Device, Mem> &imports,
2851  const ArrayView<size_t> &numImportPacketsPerLID)
2852  {
2853  // FIXME (mfh 29 Mar 2012) WHY?
2854  TEUCHOS_TEST_FOR_EXCEPTION(
2855  ! indicesTo_.empty (), std::runtime_error,
2856  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2857  "reverse communication when original data are blocked by process.");
2858  if (reverseDistributor_.is_null ()) {
2859  createReverseDistributor ();
2860  }
2861  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2862  imports, numImportPacketsPerLID);
2863  }
2864 
2865  template <class OrdinalType>
2866  void Distributor::
2867  computeSends (const ArrayView<const OrdinalType> & importIDs,
2868  const ArrayView<const int> & importNodeIDs,
2869  Array<OrdinalType> & exportIDs,
2870  Array<int> & exportNodeIDs)
2871  {
2872  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2873  // "assumes that size_t >= Ordinal". The code certainly does
2874  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2875  // sizeof(size_t) >= sizeof(int). This is because it casts the
2876  // OrdinalType elements of importIDs (along with their
2877  // corresponding process IDs, as int) to size_t, and does a
2878  // doPostsAndWaits<size_t>() to send the packed data.
2879  using std::endl;
2880  typedef typename ArrayView<const OrdinalType>::size_type size_type;
2881 
2882  Teuchos::OSTab tab (out_);
2883  const int myRank = comm_->getRank ();
2884  if (debug_) {
2885  std::ostringstream os;
2886  os << myRank << ": computeSends" << endl;
2887  *out_ << os.str ();
2888  }
2889 
2890  TEUCHOS_TEST_FOR_EXCEPTION(
2891  importIDs.size () != importNodeIDs.size (), std::invalid_argument,
2892  "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
2893  "importNodeIDs.size() = " << importNodeIDs.size ()
2894  << " != importIDs.size() = " << importIDs.size () << ".");
2895 
2896  const size_type numImports = importNodeIDs.size ();
2897  Array<size_t> importObjs (2*numImports);
2898  // Pack pairs (importIDs[i], my process ID) to send into importObjs.
2899  for (size_type i = 0; i < numImports; ++i) {
2900  importObjs[2*i] = static_cast<size_t> (importIDs[i]);
2901  importObjs[2*i+1] = static_cast<size_t> (myRank);
2902  }
2903  //
2904  // Use a temporary Distributor to send the (importIDs[i], myRank)
2905  // pairs to importNodeIDs[i].
2906  //
2907  Distributor tempPlan (comm_, out_);
2908  if (debug_) {
2909  std::ostringstream os;
2910  os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
2911  *out_ << os.str ();
2912  }
2913 
2914  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
2915  // signed, in order to forestall any possible causes for Bug 6069.
2916  const size_t numExportsAsSizeT = tempPlan.createFromSends (importNodeIDs);
2917  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
2918  TEUCHOS_TEST_FOR_EXCEPTION(
2919  numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
2920  "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
2921  << " as a size_t, which overflows to " << numExports << " when cast to "
2922  << Teuchos::TypeNameTraits<size_type>::name () << ". "
2923  "Please report this bug to the Tpetra developers.");
2924  TEUCHOS_TEST_FOR_EXCEPTION(
2925  static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
2926  std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
2927  "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
2928  "Exports = " << numExports << ". Please report this bug to the "
2929  "Tpetra developers.");
2930 
2931  if (numExports > 0) {
2932  exportIDs.resize (numExports);
2933  exportNodeIDs.resize (numExports);
2934  }
2935 
2936  // exportObjs: Packed receive buffer. (exportObjs[2*i],
2937  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
2938  // after tempPlan.doPostsAndWaits(...) finishes below.
2939  //
2940  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
2941  // size_t. This issue might come up, for example, on a 32-bit
2942  // machine using 64-bit global indices. I will add a check here
2943  // for that case.
2944  TEUCHOS_TEST_FOR_EXCEPTION(
2945  sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
2946  "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
2947  << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
2948  << sizeof (OrdinalType) << ". This violates an assumption of the "
2949  "method. It's not hard to work around (just use Array<OrdinalType> as "
2950  "the export buffer, not Array<size_t>), but we haven't done that yet. "
2951  "Please report this bug to the Tpetra developers.");
2952 
2953  TEUCHOS_TEST_FOR_EXCEPTION(
2954  tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
2955  std::logic_error,
2956  "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
2957  << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
2958  << ". Please report this bug to the Tpetra developers.");
2959 
2960  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
2961  if (debug_) {
2962  std::ostringstream os;
2963  os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
2964  *out_ << os.str ();
2965  }
2966  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
2967 
2968  // Unpack received (GID, PID) pairs into exportIDs resp. exportNodeIDs.
2969  for (size_type i = 0; i < numExports; ++i) {
2970  exportIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
2971  exportNodeIDs[i] = static_cast<int> (exportObjs[2*i+1]);
2972  }
2973 
2974  if (debug_) {
2975  std::ostringstream os;
2976  os << myRank << ": computeSends done" << endl;
2977  *out_ << os.str ();
2978  }
2979  }
2980 
2981  template <class OrdinalType>
2982  void Distributor::
2983  createFromRecvs (const ArrayView<const OrdinalType> &remoteIDs,
2984  const ArrayView<const int> &remoteImageIDs,
2985  Array<OrdinalType> &exportGIDs,
2986  Array<int> &exportNodeIDs)
2987  {
2988  using std::endl;
2989 
2990  Teuchos::OSTab tab (out_);
2991  const int myRank = comm_->getRank();
2992 
2993  if (debug_) {
2994  *out_ << myRank << ": createFromRecvs" << endl;
2995  }
2996 
2997 #ifdef HAVE_TPETRA_DEBUG
2998  using Teuchos::outArg;
2999  using Teuchos::reduceAll;
3000 
3001  // In debug mode, first test locally, then do an all-reduce to
3002  // make sure that all processes passed.
3003  const int errProc =
3004  (remoteIDs.size () != remoteImageIDs.size ()) ? myRank : -1;
3005  int maxErrProc = -1;
3006  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
3007  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
3008  Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
3009  "and remote process IDs must have the same size on all participating "
3010  "processes. Maximum process ID with error: " << maxErrProc << ".");
3011 #else // NOT HAVE_TPETRA_DEBUG
3012 
3013  // In non-debug mode, just test locally.
3014  TEUCHOS_TEST_FOR_EXCEPTION(
3015  remoteIDs.size () != remoteImageIDs.size (), std::invalid_argument,
3016  Teuchos::typeName (*this) << "::createFromRecvs<" <<
3017  Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
3018  myRank << ": remoteIDs.size() = " << remoteIDs.size () << " != "
3019  "remoteImageIDs.size() = " << remoteImageIDs.size () << ".");
3020 #endif // HAVE_TPETRA_DEBUG
3021 
3022  computeSends (remoteIDs, remoteImageIDs, exportGIDs, exportNodeIDs);
3023 
3024  const size_t numProcsSendingToMe = createFromSends (exportNodeIDs ());
3025 
3026  if (debug_) {
3027  // NOTE (mfh 20 Mar 2014) If remoteImageIDs could contain
3028  // duplicates, then its length might not be the right check here,
3029  // even if we account for selfMessage_. selfMessage_ is set in
3030  // createFromSends.
3031  std::ostringstream os;
3032  os << "Proc " << myRank << ": {numProcsSendingToMe: "
3033  << numProcsSendingToMe << ", remoteImageIDs.size(): "
3034  << remoteImageIDs.size () << ", selfMessage_: "
3035  << (selfMessage_ ? "true" : "false") << "}" << std::endl;
3036  std::cerr << os.str ();
3037  }
3038 
3039  if (debug_) {
3040  *out_ << myRank << ": createFromRecvs done" << endl;
3041  }
3042 
3043  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3044  }
3045 
3046 } // namespace Tpetra
3047 
3048 #endif // TPETRA_DISTRIBUTOR_HPP
Namespace Tpetra contains the class and methods constituting the Tpetra library.
EDistributorHowInitialized
Enum indicating how and whether a Distributor was initialized.
void doPosts(const ArrayRCP< const Packet > &exports, size_t numPackets, const ArrayRCP< Packet > &imports)
Post the data for a forward plan, but do not execute the waits yet.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
std::string DistributorSendTypeEnumToString(EDistributorSendType sendType)
Convert an EDistributorSendType enum value to a string.
Implementation details of Tpetra.
Details::EDistributorHowInitialized howInitialized() const
Return an enum indicating whether and how a Distributor was initialized.
void doPostsAndWaits(const ArrayView< const Packet > &exports, size_t numPackets, const ArrayView< Packet > &imports)
Execute the (forward) communication plan.
Sets up and executes a communication plan for a Tpetra DistObject.
size_t getTotalReceiveLength() const
Total number of values this process will receive from other processes.
size_t createFromSends(const ArrayView< const int > &exportNodeIDs)
Set up Distributor using list of process ranks to which this process will send.
void doReversePosts(const ArrayRCP< const Packet > &exports, size_t numPackets, const ArrayRCP< Packet > &imports)
Post the data for a reverse plan, but do not execute the waits yet.
void doReversePostsAndWaits(const ArrayView< const Packet > &exports, size_t numPackets, const ArrayView< Packet > &imports)
Execute the reverse communication plan.
std::string DistributorHowInitializedEnumToString(EDistributorHowInitialized how)
Convert an EDistributorHowInitialized enum value to a string.
Stand-alone utility functions and macros.
void getLastDoStatistics(size_t &bytes_sent, size_t &bytes_recvd) const
Information on the last call to do/doReverse.
Array< std::string > distributorSendTypes()
Valid values for Distributor's "Send type" parameter.
void createFromRecvs(const ArrayView< const Ordinal > &remoteIDs, const ArrayView< const int > &remoteNodeIDs, Array< Ordinal > &exportIDs, Array< int > &exportNodeIDs)
Set up Distributor using list of process ranks from which to receive.
EDistributorSendType
The type of MPI send that Distributor should use.