Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
57 #include "Tpetra_Details_checkView.hpp"
58 #include "Tpetra_Details_fill.hpp"
59 #include "Tpetra_Details_gathervPrint.hpp"
66 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
67 # include "Teuchos_SerialDenseMatrix.hpp"
68 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
69 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
70 #include "KokkosCompat_View.hpp"
71 #include "KokkosBlas.hpp"
72 #include "KokkosKernels_Utils.hpp"
73 #include "Kokkos_Random.hpp"
74 #include "Kokkos_ArithTraits.hpp"
75 #include <memory>
76 #include <sstream>
77 
#ifdef HAVE_TPETRA_INST_FLOAT128
namespace Kokkos {
  // FIXME (mfh 04 Sep 2015) Just a stub for now!
  //
  // Stub specialization of Kokkos::rand for __float128.  Each draw
  // glues together two double-precision draws: one as the high-order
  // term, and a second one scaled down so far (by half the smallest
  // normalized double) that it only perturbs the bits below double
  // precision, mimicking a double-double representation.
  template<class Generator>
  struct rand<Generator, __float128> {
    //! Largest value that draw() may produce (1.0).
    static KOKKOS_INLINE_FUNCTION __float128 max ()
    {
      return static_cast<__float128> (1.0);
    }

    //! Draw a pseudorandom __float128 in the generator's default range.
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen)
    {
      // Half the smallest normalized double, is the scaling factor of
      // the lower-order term in the double-double representation.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiTerm = static_cast<__float128> (gen.drand ());
      const __float128 loTerm =
        static_cast<__float128> (gen.drand ()) * loScale;
      return hiTerm + loTerm;
    }

    //! Draw a pseudorandom __float128, bounded by \c range.
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen, const __float128& range)
    {
      // FIXME (mfh 05 Sep 2015) Not sure if this is right.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiTerm =
        static_cast<__float128> (gen.drand (range));
      const __float128 loTerm =
        static_cast<__float128> (gen.drand (range)) * loScale;
      return hiTerm + loTerm;
    }

    //! Draw a pseudorandom __float128 in [start, end).
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen, const __float128& start, const __float128& end)
    {
      // FIXME (mfh 05 Sep 2015) Not sure if this is right.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiTerm =
        static_cast<__float128> (gen.drand (start, end));
      const __float128 loTerm =
        static_cast<__float128> (gen.drand (start, end)) * loScale;
      return hiTerm + loTerm;
    }
  };
} // namespace Kokkos
#endif // HAVE_TPETRA_INST_FLOAT128
129 
130 namespace { // (anonymous)
131 
146  template<class ST, class LO, class GO, class NT>
148  allocDualView (const size_t lclNumRows,
149  const size_t numCols,
150  const bool zeroOut = true,
151  const bool allowPadding = false)
152  {
153  using ::Tpetra::Details::Behavior;
154  using Kokkos::AllowPadding;
155  using Kokkos::view_alloc;
156  using Kokkos::WithoutInitializing;
157  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
158  typedef typename dual_view_type::t_dev dev_view_type;
159  // This needs to be a string and not a char*, if given as an
160  // argument to Kokkos::view_alloc. This is because view_alloc
161  // also allows a raw pointer as its first argument. See
162  // https://github.com/kokkos/kokkos/issues/434.
163  const std::string label ("MV::DualView");
164  const bool debug = Behavior::debug ();
165 
166  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
167  // creation of the DualView's Views works around
168  // Kokkos::DualView's current inability to accept an
169  // AllocationProperties initial argument (as Kokkos::View does).
170  // However, the work-around is harmless, since it does what the
171  // (currently nonexistent) equivalent DualView constructor would
172  // have done anyway.
173 
174  dev_view_type d_view;
175  if (zeroOut) {
176  if (allowPadding) {
177  d_view = dev_view_type (view_alloc (label, AllowPadding),
178  lclNumRows, numCols);
179  }
180  else {
181  d_view = dev_view_type (view_alloc (label),
182  lclNumRows, numCols);
183  }
184  }
185  else {
186  if (allowPadding) {
187  d_view = dev_view_type (view_alloc (label,
188  WithoutInitializing,
189  AllowPadding),
190  lclNumRows, numCols);
191  }
192  else {
193  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
194  lclNumRows, numCols);
195  }
196  if (debug) {
197  // Filling with NaN is a cheap and effective way to tell if
198  // downstream code is trying to use a MultiVector's data
199  // without them having been initialized. ArithTraits lets us
200  // call nan() even if the scalar type doesn't define it; it
201  // just returns some undefined value in the latter case. This
202  // won't hurt anything because by setting zeroOut=false, users
203  // already agreed that they don't care about the contents of
204  // the MultiVector.
205  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
206  KokkosBlas::fill (d_view, nan);
207  }
208  }
209  if (debug) {
210  TEUCHOS_TEST_FOR_EXCEPTION
211  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
212  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
213  "allocDualView: d_view's dimensions actual dimensions do not match "
214  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
215  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
216  << ". Please report this bug to the Tpetra developers.");
217  }
218 
219  dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
220  // Whether or not the user cares about the initial contents of the
221  // MultiVector, the device and host views are out of sync. We
222  // prefer to work in device memory. The way to ensure this
223  // happens is to mark the device view as modified.
224  dv.modify_device ();
225 
226  return dv;
227  }
228 
229  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
230  //
231  // T: The type of the entries of the View.
232  // ExecSpace: The Kokkos execution space.
233  template<class T, class ExecSpace>
234  struct MakeUnmanagedView {
235  // The 'false' part of the branch carefully ensures that this
236  // won't attempt to use a host execution space that hasn't been
237  // initialized. For example, if Kokkos::OpenMP is disabled and
238  // Kokkos::Threads is enabled, the latter is always the default
239  // execution space of Kokkos::HostSpace, even when ExecSpace is
240  // Kokkos::Serial. That's why we go through the trouble of asking
241  // Kokkos::DualView what _its_ space is. That seems to work
242  // around this default execution space issue.
243  //
244  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
245  // a memory space, rather than an execution space, as the first
246  // argument of VerifyExecutionCanAccessMemorySpace.
247  typedef typename Kokkos::Impl::if_c<
248  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
249  typename ExecSpace::memory_space,
250  Kokkos::HostSpace>::value,
251  typename ExecSpace::device_type,
252  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
253  typedef Kokkos::LayoutLeft array_layout;
254  typedef Kokkos::View<T*, array_layout, host_exec_space,
255  Kokkos::MemoryUnmanaged> view_type;
256 
257  static view_type getView (const Teuchos::ArrayView<T>& x_in)
258  {
259  const size_t numEnt = static_cast<size_t> (x_in.size ());
260  if (numEnt == 0) {
261  return view_type ();
262  } else {
263  return view_type (x_in.getRawPtr (), numEnt);
264  }
265  }
266  };
267 
268  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
269  // taking a subview of a 0 x N DualView incorrectly always results
270  // in a 0 x 0 DualView.
271  template<class DualViewType>
272  DualViewType
273  takeSubview (const DualViewType& X,
274  const Kokkos::Impl::ALL_t&,
275  const std::pair<size_t, size_t>& colRng)
276  {
277  if (X.extent (0) == 0 && X.extent (1) != 0) {
278  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
279  }
280  else {
281  return subview (X, Kokkos::ALL (), colRng);
282  }
283  }
284 
285  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
286  // taking a subview of a 0 x N DualView incorrectly always results
287  // in a 0 x 0 DualView.
288  template<class DualViewType>
289  DualViewType
290  takeSubview (const DualViewType& X,
291  const std::pair<size_t, size_t>& rowRng,
292  const std::pair<size_t, size_t>& colRng)
293  {
294  if (X.extent (0) == 0 && X.extent (1) != 0) {
295  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
296  }
297  else {
298  return subview (X, rowRng, colRng);
299  }
300  }
301 
302  template<class DualViewType>
303  size_t
304  getDualViewStride (const DualViewType& dv)
305  {
306  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
307  // method yet, but its Views do.
308  const size_t LDA = dv.d_view.stride (1);
309  const size_t numRows = dv.extent (0);
310 
311  if (LDA == 0) {
312  return (numRows == 0) ? size_t (1) : numRows;
313  }
314  else {
315  return LDA;
316  }
317  }
318 
319  template<class ViewType>
320  size_t
321  getViewStride (const ViewType& view)
322  {
323  const size_t LDA = view.stride (1);
324  const size_t numRows = view.extent (0);
325 
326  if (LDA == 0) {
327  return (numRows == 0) ? size_t (1) : numRows;
328  }
329  else {
330  return LDA;
331  }
332  }
333 
334  template <class SC, class LO, class GO, class NT>
335  bool
336  multiVectorRunNormOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
337  {
338  if (! X.need_sync_device ()) {
339  return false; // most up-to-date on device
340  }
341  else { // most up-to-date on host
342  constexpr size_t localLengthThreshold = 10000;
343  return X.getLocalLength () <= localLengthThreshold;
344  }
345  }
346 
 // Compute a norm (which one is selected by whichNorm) of each
 // requested column of X, writing the results to norms[].  Dispatches
 // to either the host or the device copy of X's data, syncing first
 // if needed, based on multiVectorRunNormOnHost's heuristic.
 //
 // NOTE(review): this extracted listing omits source lines 350-351
 // and 353-354 (the remaining parameters — presumably X and
 // whichNorm — and the leading type aliases such as MV); the
 // signature shown here is incomplete.  TODO confirm against the
 // original Tpetra_MultiVector_def.hpp.
 347  template <class SC, class LO, class GO, class NT>
 348  void
 349  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
 352  {
 355  using val_type = typename MV::impl_scalar_type;
 356  using mag_type = typename MV::mag_type;
 357  using dual_view_type = typename MV::dual_view_type;
 358 
 359  auto map = X.getMap ();
 // Null map means a serial/local-only context: pass no communicator.
 360  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
 361  auto whichVecs = getMultiVectorWhichVectors (X);
 362  const bool isConstantStride = X.isConstantStride ();
 363  const bool isDistributed = X.isDistributed ();
 364 
 365  const bool runOnHost = multiVectorRunNormOnHost (X);
 366  if (runOnHost) {
 367  using view_type = typename dual_view_type::t_host;
 368  using array_layout = typename view_type::array_layout;
 369  using device_type = typename view_type::device_type;
 370 
 // Make sure the host copy is current before reading it.
 371  if (X.need_sync_host ()) {
 372  X.sync_host ();
 373  }
 374  view_type X_lcl = X.getLocalViewHost ();
 375  normImpl<val_type, array_layout, device_type,
 376  mag_type> (norms, X_lcl, whichNorm, whichVecs,
 377  isConstantStride, isDistributed, comm);
 378  }
 379  else {
 380  using view_type = typename dual_view_type::t_dev;
 381  using array_layout = typename view_type::array_layout;
 382  using device_type = typename view_type::device_type;
 383 
 // Make sure the device copy is current before reading it.
 384  if (X.need_sync_device ()) {
 385  X.sync_device ();
 386  }
 387  view_type X_lcl = X.getLocalViewDevice ();
 388  normImpl<val_type, array_layout, device_type,
 389  mag_type> (norms, X_lcl, whichNorm, whichVecs,
 390  isConstantStride, isDistributed, comm);
 391  }
 392  }
393 } // namespace (anonymous)
394 
395 
396 namespace Tpetra {
397 
398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
399  bool
400  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
401  vectorIndexOutOfRange (const size_t VectorIndex) const {
402  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
403  }
404 
 // Default constructor: an empty MultiVector over a
 // default-constructed (empty) Map.
 // NOTE(review): this extracted listing omits source line 406 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 405  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 407  MultiVector () :
 408  base_type (Teuchos::rcp (new map_type ()))
 409  {}
410 
 // Constructor: allocate numVecs columns over the given Map,
 // optionally zero-initialized (zeroOut defaults to true).
 // NOTE(review): this extracted listing omits source line 412 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 411  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 413  MultiVector (const Teuchos::RCP<const map_type>& map,
 414  const size_t numVecs,
 415  const bool zeroOut) : /* default is true */
 416  base_type (map)
 417  {
 418  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
 419 
 420  const size_t lclNumRows = this->getLocalLength ();
 // allocDualView returns the data marked modified-on-device.
 421  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
 422  origView_ = view_;
 423  }
424 
 // "Copy" constructor with explicit copy-or-view semantics:
 // Teuchos::Copy deep-copies source's data; Teuchos::View shares it.
 // NOTE(review): this extracted listing omits source lines 426-427
 // (the qualified class name and the "const MultiVector& source"
 // parameter) and line 434, so the definition is incomplete as shown.
 425  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 428  const Teuchos::DataAccess copyOrView) :
 429  base_type (source),
 430  view_ (source.view_),
 431  origView_ (source.origView_),
 432  whichVectors_ (source.whichVectors_)
 433  {
 435  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
 436  "const Teuchos::DataAccess): ";
 437 
 438  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
 439 
 440  if (copyOrView == Teuchos::Copy) {
 441  // Reuse the conveniently already existing function that creates
 442  // a deep copy.
 443  MV cpy = createCopy (source);
 444  this->view_ = cpy.view_;
 445  this->origView_ = cpy.origView_;
 446  this->whichVectors_ = cpy.whichVectors_;
 447  }
 // View semantics: nothing to do — the member initializers above
 // already aliased source's views.
 448  else if (copyOrView == Teuchos::View) {
 449  }
 450  else {
 451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 452  true, std::invalid_argument, "Second argument 'copyOrView' has an "
 453  "invalid value " << copyOrView << ". Valid values include "
 454  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
 455  << Teuchos::View << ".");
 456  }
 457  }
458 
 // View constructor: wrap an existing Kokkos::DualView (no copy).
 // Validates that the DualView's dimensions and stride agree with
 // the Map, and (in debug builds) that the DualView is globally
 // consistent across processes.
 // NOTE(review): this extracted listing omits source line 460 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 459  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 461  MultiVector (const Teuchos::RCP<const map_type>& map,
 462  const dual_view_type& view) :
 463  base_type (map),
 464  view_ (view),
 465  origView_ (view)
 466  {
 467  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
 468  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
 469  map->getNodeNumElements ();
 470  const size_t lclNumRows_view = view.extent (0);
 471  const size_t LDA = getDualViewStride (view);
 472 
 473  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 474  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
 475  std::invalid_argument, "Kokkos::DualView does not match Map. "
 476  "map->getNodeNumElements() = " << lclNumRows_map
 477  << ", view.extent(0) = " << lclNumRows_view
 478  << ", and getStride() = " << LDA << ".");
 479 
 480  using ::Tpetra::Details::Behavior;
 481  const bool debug = Behavior::debug ();
 482  if (debug) {
 483  using ::Tpetra::Details::checkGlobalDualViewValidity;
 484  std::ostringstream gblErrStrm;
 485  const bool verbose = Behavior::verbose ();
 486  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
 487  const bool gblValid =
 488  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
 489  comm.getRawPtr ());
 490  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 491  (! gblValid, std::runtime_error, gblErrStrm.str ());
 492  }
 493  }
494 
495 
 // Device-View constructor: wrap an existing device View (no copy),
 // creating a host mirror for it and marking the pair as modified on
 // device so the first sync copies device -> host.
 // NOTE(review): this extracted listing omits source lines 497 and
 // 536 (including the qualified class name "MultiVector<...>::"), so
 // the definition is incomplete as shown.
 496  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 498  MultiVector (const Teuchos::RCP<const map_type>& map,
 499  const typename dual_view_type::t_dev& d_view) :
 500  base_type (map)
 501  {
 502  using Teuchos::ArrayRCP;
 503  using Teuchos::RCP;
 504  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
 505 
 506  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
 507 
 508  const size_t LDA = getViewStride (d_view);
 // NOTE(review): unlike the sibling constructors, this line does not
 // guard against map.is_null() before dereferencing — confirm callers
 // never pass a null Map here.
 509  const size_t lclNumRows = map->getNodeNumElements ();
 510  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 511  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
 512  "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
 513  << ", View's column stride = " << LDA
 514  << ", and View's extent(0) = " << d_view.extent (0) << ".");
 515 
 516  auto h_view = Kokkos::create_mirror_view (d_view);
 517  view_ = dual_view_type (d_view, h_view);
 518 
 519  using ::Tpetra::Details::Behavior;
 520  const bool debug = Behavior::debug ();
 521  if (debug) {
 522  using ::Tpetra::Details::checkGlobalDualViewValidity;
 523  std::ostringstream gblErrStrm;
 524  const bool verbose = Behavior::verbose ();
 525  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
 526  const bool gblValid =
 527  checkGlobalDualViewValidity (&gblErrStrm, view_, verbose,
 528  comm.getRawPtr ());
 529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 530  (! gblValid, std::runtime_error, gblErrStrm.str ());
 531  }
 532  // The user gave us a device view. In order to respect its
 533  // initial contents, we mark the DualView as "modified on device."
 534  // That way, the next sync will synchronize it with the host view.
 535  this->modify_device ();
 537  }
538 
 // Expert constructor: wrap a DualView plus the "original" DualView
 // it was sliced from (origView_ is what offsetView uses later).
 // NOTE(review): this extracted listing omits source line 540 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 539  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 541  MultiVector (const Teuchos::RCP<const map_type>& map,
 542  const dual_view_type& view,
 543  const dual_view_type& origView) :
 544  base_type (map),
 545  view_ (view),
 546  origView_ (origView)
 547  {
 548  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
 549 
 550  const size_t LDA = getDualViewStride (origView);
 551  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
 // NOTE(review): the exception message below contains a doubled
 // word, "does not not match" — fixing it requires a code change,
 // so it is only flagged here.
 552  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 553  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
 554  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
 555  << ". This may also mean that the input origView's first dimension (number "
 556  "of rows = " << origView.extent (0) << ") does not not match the number "
 557  "of entries in the Map on the calling process.");
 558 
 559  using ::Tpetra::Details::Behavior;
 560  const bool debug = Behavior::debug ();
 561  if (debug) {
 562  using ::Tpetra::Details::checkGlobalDualViewValidity;
 563  std::ostringstream gblErrStrm;
 564  const bool verbose = Behavior::verbose ();
 565  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
 566  const bool gblValid_0 =
 567  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
 568  comm.getRawPtr ());
 569  const bool gblValid_1 =
 570  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
 571  comm.getRawPtr ());
 572  const bool gblValid = gblValid_0 && gblValid_1;
 573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 574  (! gblValid, std::runtime_error, gblErrStrm.str ());
 575  }
 576  }
577 
578 
 // Expert constructor: wrap a DualView and view only the columns
 // named in whichVectors (possibly nonconstant stride).  An empty
 // whichVectors means "view all columns, constant stride"; a
 // single-entry whichVectors is collapsed back to a constant-stride
 // single-column view.
 // NOTE(review): this extracted listing omits source line 580 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 579  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 581  MultiVector (const Teuchos::RCP<const map_type>& map,
 582  const dual_view_type& view,
 583  const Teuchos::ArrayView<const size_t>& whichVectors) :
 584  base_type (map),
 585  view_ (view),
 586  origView_ (view),
 587  whichVectors_ (whichVectors.begin (), whichVectors.end ())
 588  {
 589  using Kokkos::ALL;
 590  using Kokkos::subview;
 591  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
 592 
 593  using ::Tpetra::Details::Behavior;
 594  const bool debug = Behavior::debug ();
 595  if (debug) {
 596  using ::Tpetra::Details::checkGlobalDualViewValidity;
 597  std::ostringstream gblErrStrm;
 598  const bool verbose = Behavior::verbose ();
 599  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
 600  const bool gblValid =
 601  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
 602  comm.getRawPtr ());
 603  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 604  (! gblValid, std::runtime_error, gblErrStrm.str ());
 605  }
 606 
 607  const size_t lclNumRows = map.is_null () ? size_t (0) :
 608  map->getNodeNumElements ();
 609  // Check dimensions of the input DualView. We accept that Kokkos
 610  // might not allow construction of a 0 x m (Dual)View with m > 0,
 611  // so we only require the number of rows to match if the
 612  // (Dual)View has more than zero columns. Likewise, we only
 613  // require the number of columns to match if the (Dual)View has
 614  // more than zero rows.
 615  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 616  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
 617  std::invalid_argument, "view.extent(0) = " << view.extent (0)
 618  << " < map->getNodeNumElements() = " << lclNumRows << ".");
 619  if (whichVectors.size () != 0) {
 // NOTE(review): the condition below — "view.extent(1) != 0 &&
 // view.extent(1) == 0" — can never be true, so this check is dead.
 // Per the error message, the intended test is presumably just
 // "view.extent(1) == 0"; confirm and fix in a code change.
 620  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 621  view.extent (1) != 0 && view.extent (1) == 0,
 622  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
 623  " = " << whichVectors.size () << " > 0.");
 624  size_t maxColInd = 0;
 625  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
 626  for (size_type k = 0; k < whichVectors.size (); ++k) {
 627  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 628  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
 629  std::invalid_argument, "whichVectors[" << k << "] = "
 630  "Teuchos::OrdinalTraits<size_t>::invalid().");
 631  maxColInd = std::max (maxColInd, whichVectors[k]);
 632  }
 633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 634  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
 635  std::invalid_argument, "view.extent(1) = " << view.extent (1)
 636  << " <= max(whichVectors) = " << maxColInd << ".");
 637  }
 638 
 639  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
 640  // zero strides, so modify in that case.
 641  const size_t LDA = getDualViewStride (view);
 642  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 643  (LDA < lclNumRows, std::invalid_argument,
 644  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
 645 
 646  if (whichVectors.size () == 1) {
 647  // If whichVectors has only one entry, we don't need to bother
 648  // with nonconstant stride. Just shift the view over so it
 649  // points to the desired column.
 650  //
 651  // NOTE (mfh 10 May 2014) This is a special case where we set
 652  // origView_ just to view that one column, not all of the
 653  // original columns. This ensures that the use of origView_ in
 654  // offsetView works correctly.
 655  const std::pair<size_t, size_t> colRng (whichVectors[0],
 656  whichVectors[0] + 1);
 657  view_ = takeSubview (view_, ALL (), colRng);
 658  origView_ = takeSubview (origView_, ALL (), colRng);
 659  // whichVectors_.size() == 0 means "constant stride."
 660  whichVectors_.clear ();
 661  }
 662  }
663 
 // Expert constructor: wrap a DualView, the original DualView it was
 // sliced from, and a subset of columns (whichVectors).  A
 // single-entry whichVectors is collapsed back to a constant-stride
 // single-column view.
 // NOTE(review): this extracted listing omits source line 665 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 664  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 666  MultiVector (const Teuchos::RCP<const map_type>& map,
 667  const dual_view_type& view,
 668  const dual_view_type& origView,
 669  const Teuchos::ArrayView<const size_t>& whichVectors) :
 670  base_type (map),
 671  view_ (view),
 672  origView_ (origView),
 673  whichVectors_ (whichVectors.begin (), whichVectors.end ())
 674  {
 675  using Kokkos::ALL;
 676  using Kokkos::subview;
 677  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
 678 
 679  using ::Tpetra::Details::Behavior;
 680  const bool debug = Behavior::debug ();
 681  if (debug) {
 682  using ::Tpetra::Details::checkGlobalDualViewValidity;
 683  std::ostringstream gblErrStrm;
 684  const bool verbose = Behavior::verbose ();
 685  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
 686  const bool gblValid_0 =
 687  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
 688  comm.getRawPtr ());
 689  const bool gblValid_1 =
 690  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
 691  comm.getRawPtr ());
 692  const bool gblValid = gblValid_0 && gblValid_1;
 693  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 694  (! gblValid, std::runtime_error, gblErrStrm.str ());
 695  }
 696 
 697  const size_t lclNumRows = this->getLocalLength ();
 698  // Check dimensions of the input DualView. We accept that Kokkos
 699  // might not allow construction of a 0 x m (Dual)View with m > 0,
 700  // so we only require the number of rows to match if the
 701  // (Dual)View has more than zero columns. Likewise, we only
 702  // require the number of columns to match if the (Dual)View has
 703  // more than zero rows.
 704  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 705  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
 706  std::invalid_argument, "view.extent(0) = " << view.extent (0)
 707  << " < map->getNodeNumElements() = " << lclNumRows << ".");
 708  if (whichVectors.size () != 0) {
 // NOTE(review): the condition below — "view.extent(1) != 0 &&
 // view.extent(1) == 0" — can never be true, so this check is dead.
 // Per the error message, the intended test is presumably just
 // "view.extent(1) == 0"; confirm and fix in a code change.
 709  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 710  view.extent (1) != 0 && view.extent (1) == 0,
 711  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
 712  " = " << whichVectors.size () << " > 0.");
 713  size_t maxColInd = 0;
 714  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
 715  for (size_type k = 0; k < whichVectors.size (); ++k) {
 716  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 717  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
 718  std::invalid_argument, "whichVectors[" << k << "] = "
 719  "Teuchos::OrdinalTraits<size_t>::invalid().");
 720  maxColInd = std::max (maxColInd, whichVectors[k]);
 721  }
 722  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
 723  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
 724  std::invalid_argument, "view.extent(1) = " << view.extent (1)
 725  << " <= max(whichVectors) = " << maxColInd << ".");
 726  }
 727 
 728  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
 729  // zero strides, so modify in that case.
 730  const size_t LDA = getDualViewStride (origView);
 731  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 732  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
 733  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
 734  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
 735  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
 736 
 737  if (whichVectors.size () == 1) {
 738  // If whichVectors has only one entry, we don't need to bother
 739  // with nonconstant stride. Just shift the view over so it
 740  // points to the desired column.
 741  //
 742  // NOTE (mfh 10 May 2014) This is a special case where we set
 743  // origView_ just to view that one column, not all of the
 744  // original columns. This ensures that the use of origView_ in
 745  // offsetView works correctly.
 746  const std::pair<size_t, size_t> colRng (whichVectors[0],
 747  whichVectors[0] + 1);
 748  view_ = takeSubview (view_, ALL (), colRng);
 749  origView_ = takeSubview (origView_, ALL (), colRng);
 750  // whichVectors_.size() == 0 means "constant stride."
 751  whichVectors_.clear ();
 752  }
 753  }
754 
 // Deep-copy constructor from host data: copy numVecs columns of
 // length getLocalLength() out of the column-major array 'data'
 // (with column stride LDA) into newly allocated device storage.
 // NOTE(review): this extracted listing omits source line 756 (the
 // qualified class name "MultiVector<...>::"), so the definition is
 // incomplete as shown.
 755  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
 757  MultiVector (const Teuchos::RCP<const map_type>& map,
 758  const Teuchos::ArrayView<const Scalar>& data,
 759  const size_t LDA,
 760  const size_t numVecs) :
 761  base_type (map)
 762  {
 763  typedef LocalOrdinal LO;
 764  typedef GlobalOrdinal GO;
 765  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
 766  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
 767 
 768  // Deep copy constructor, constant stride (NO whichVectors_).
 769  // There is no need for a deep copy constructor with nonconstant stride.
 770 
 771  const size_t lclNumRows =
 772  map.is_null () ? size_t (0) : map->getNodeNumElements ();
 773  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 774  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
 775  "map->getNodeNumElements() = " << lclNumRows << ".");
 // The last column only needs lclNumRows entries, not a full LDA
 // stride — hence LDA*(numVecs-1) + lclNumRows.
 776  if (numVecs != 0) {
 777  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
 778  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
 779  (static_cast<size_t> (data.size ()) < minNumEntries,
 780  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
 781  "entries, given the input Map and number of vectors in the MultiVector."
 782  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
 783  "map->getNodeNumElements () = " << minNumEntries << ".");
 784  }
 785 
 786  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
 787  this->modify_device ();
 788  auto X_out = this->getLocalViewDevice ();
 789  origView_ = view_;
 790 
 791  // Make an unmanaged host Kokkos::View of the input data. First
 792  // create a View (X_in_orig) with the original stride. Then,
 793  // create a subview (X_in) with the right number of columns.
 794  const impl_scalar_type* const X_in_raw =
 795  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
 796  Kokkos::View<const impl_scalar_type**,
 797  Kokkos::LayoutLeft,
 798  Kokkos::HostSpace,
 799  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
 800  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
 801  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
 802 
 803  // If LDA != X_out's column stride, then we need to copy one
 804  // column at a time; Kokkos::deep_copy refuses to work in that
 805  // case.
 806  const size_t outStride =
 807  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
 808  if (LDA == outStride) { // strides are the same; deep_copy once
 809  // This only works because MultiVector uses LayoutLeft.
 810  // We would need a custom copy functor otherwise.
 811  Kokkos::deep_copy (X_out, X_in);
 812  }
 813  else { // strides differ; copy one column at a time
 814  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
 815  out_col_view_type;
 816  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
 817  in_col_view_type;
 818  for (size_t j = 0; j < numVecs; ++j) {
 819  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
 820  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
 821  Kokkos::deep_copy (X_out_j, X_in_j);
 822  }
 823  }
 824  }
825 
// Constructor: build a MultiVector over 'map' with 'numVecs' columns by
// deep-copying one host array per column; ArrayOfPtrs[j] supplies the
// data for column j.  Each input array must have at least
// map->getNodeNumElements() entries.  Throws std::runtime_error if
// numVecs < 1 or numVecs != ArrayOfPtrs.size(), and
// std::invalid_argument if any input column is too short.
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
828  MultiVector (const Teuchos::RCP<const map_type>& map,
829  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
830  const size_t numVecs) :
831  base_type (map)
832  {
833  typedef impl_scalar_type IST;
834  typedef LocalOrdinal LO;
835  typedef GlobalOrdinal GO;
836  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
837  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
838 
// A null Map is legal (e.g., after replaceMap()); it means zero local rows.
839  const size_t lclNumRows =
840  map.is_null () ? size_t (0) : map->getNodeNumElements ();
841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
842  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
843  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
844  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
// Validate every input column's length before allocating anything.
845  for (size_t j = 0; j < numVecs; ++j) {
846  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
847  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
848  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
849  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
850  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
851  << ".");
852  }
853 
// Allocate the DualView, mark the device side as most recently
// modified, and copy each input (host) column into the device view.
854  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
855  this->modify_device ();
856  auto X_out = this->getLocalViewDevice ();
857 
858  // Make sure that the type of a single input column has the same
859  // array layout as each output column does, so we can deep_copy.
860  using array_layout = typename decltype (X_out)::array_layout;
861  using input_col_view_type = typename Kokkos::View<const IST*,
862  array_layout,
863  Kokkos::HostSpace,
864  Kokkos::MemoryUnmanaged>;
865 
// Copy column by column: wrap each input array in an unmanaged host
// View and deep_copy it into the corresponding column subview.
866  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
867  for (size_t j = 0; j < numVecs; ++j) {
868  Teuchos::ArrayView<const IST> X_j_av =
869  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
870  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
871  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
872  Kokkos::deep_copy (X_j_out, X_j_in);
873  }
// This MultiVector is not a subview of anything, so the "original
// view" is just the view itself.
874  origView_ = view_;
875  }
876 
// Whether this MultiVector's columns are stored with constant stride.
// The column-selection list whichVectors_ is empty exactly when the
// MultiVector views all columns of its allocation contiguously.
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
879  isConstantStride () const {
880  return whichVectors_.empty ();
881  }
882 
// Number of rows owned by the calling process: the local element count
// of the Map, or 0 if the Map is null (which can happen after
// replaceMap()).
883  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
884  size_t
886  getLocalLength () const
887  {
888  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
889  return static_cast<size_t> (0);
890  } else {
891  return this->getMap ()->getNodeNumElements ();
892  }
893  }
894 
// Global number of rows over all processes: the global element count of
// the Map, or 0 if the Map is null (possible after replaceMap()).
895  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
898  getGlobalLength () const
899  {
900  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
901  return static_cast<size_t> (0);
902  } else {
903  return this->getMap ()->getGlobalNumElements ();
904  }
905  }
906 
// Column stride of the underlying (original) allocation when the
// MultiVector is constant stride; 0 when it is a noncontiguous column
// selection, in which case no single stride applies.
907  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
908  size_t
910  getStride () const
911  {
912  return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
913  }
914 
// DistObject hook: decide whether 'sourceObj' is compatible with *this
// as the source of an Import/Export.  Returns false if the source is
// not a MultiVector at all; otherwise requires only matching column
// counts (Map compatibility was already established when the
// Import/Export object was built).
915  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916  bool
918  checkSizes (const SrcDistObject& sourceObj)
919  {
920  // Check whether the source object is a MultiVector. If not, then
921  // we can't even compare sizes, so it's definitely not OK to
922  // Import or Export from it.
924  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
925  if (src == nullptr) {
926  return false;
927  }
928  else {
929  // The target of the Import or Export calls checkSizes() in
930  // DistObject::doTransfer(). By that point, we've already
931  // constructed an Import or Export object using the two
932  // multivectors' Maps, which means that (hopefully) we've
933  // already checked other attributes of the multivectors. Thus,
934  // all we need to do here is check the number of columns.
935  return src->getNumVectors () == this->getNumVectors ();
936  }
937  }
938 
// Every row (LID) packs the same number of entries: one per column.
// A nonzero return tells DistObject it need not count packets per LID.
939  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
940  size_t
942  constantNumberOfPackets () const {
943  return this->getNumVectors ();
944  }
945 
// DistObject hook: copy the first numSameIDs rows directly from the
// source MultiVector, then permute rows permuteFromLIDs[k] (source)
// into rows permuteToLIDs[k] (target).  All work happens in whichever
// memory space (host or device) the source is currently sync'd to,
// since we may not sync an object we do not own.
946  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
947  void
949 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
950  copyAndPermuteNew
951 #else // TPETRA_ENABLE_DEPRECATED_CODE
952  copyAndPermute
953 #endif // TPETRA_ENABLE_DEPRECATED_CODE
954  (const SrcDistObject& sourceObj,
955  const size_t numSameIDs,
956  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
957  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs)
958  {
959  using ::Tpetra::Details::Behavior;
961  using ::Tpetra::Details::ProfilingRegion;
962  using std::endl;
963  using KokkosRefactor::Details::permute_array_multi_column;
964  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
965  using Kokkos::Compat::create_const_view;
967  const char tfecfFuncName[] = "copyAndPermute: ";
968  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
969 
// Optional per-process debug tracing (TPETRA_VERBOSE); 'prefix' tags
// every message with the calling rank.
970  const bool verbose = Behavior::verbose ();
971  std::unique_ptr<std::string> prefix;
972  if (verbose) {
973  auto map = this->getMap ();
974  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
975  const int myRank = comm.is_null () ? -1 : comm->getRank ();
976  std::ostringstream os;
977  os << "Proc " << myRank << ": MV::copyAndPermute: ";
978  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
979  os << "Start" << endl;
980  std::cerr << os.str ();
981  }
982 
// The two permutation LID lists must pair up one-to-one.
983  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
984  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
985  std::logic_error, "permuteToLIDs.extent(0) = "
986  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
987  << permuteFromLIDs.extent (0) << ".");
988 
989  // We've already called checkSizes(), so this cast must succeed.
990  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
991  const size_t numCols = this->getNumVectors ();
992 
993  // sourceMV doesn't belong to us, so we can't sync it. Do the
994  // copying where it's currently sync'd.
995  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
996  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
997  std::logic_error, "Input MultiVector needs sync to both host "
998  "and device.");
// If the source's device side is stale, its host side is current, so
// copy on host; otherwise copy on device.
999  const bool copyOnHost = sourceMV.need_sync_device ();
1000  if (verbose) {
1001  std::ostringstream os;
1002  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1003  std::cerr << os.str ();
1004  }
1005 
// Bring *this* (which we do own) up to date in the chosen space and
// mark it modified there.  Sync before modify because a permute may
// leave some target rows untouched, so stale data must be refreshed.
1006  if (copyOnHost) {
1007  if (this->need_sync_host ()) {
1008  this->sync_host ();
1009  }
1010  this->modify_host ();
1011  }
1012  else {
1013  if (this->need_sync_device ()) {
1014  this->sync_device ();
1015  }
1016  this->modify_device ();
1017  }
1018 
1019  if (verbose) {
1020  std::ostringstream os;
1021  os << *prefix << "Copy" << endl;
1022  std::cerr << os.str ();
1023  }
1024 
1025  // TODO (mfh 15 Sep 2013) When we replace
1026  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1027  // ways to copy the data:
1028  //
1029  // 1. Get a (sub)view of each column and call deep_copy on that.
1030  // 2. Write a custom kernel to copy the data.
1031  //
1032  // The first is easier, but the second might be more performant in
1033  // case we decide to use layouts other than LayoutLeft. It might
1034  // even make sense to hide whichVectors_ in an entirely new layout
1035  // for Kokkos Views.
1036 
1037  // Copy rows [0, numSameIDs-1] of the local multivectors.
1038  //
1039  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1040  // time, even when both multivectors are constant-stride, since
1041  // deep_copy between strided subviews with more than one column
1042  // doesn't currently work.
1043 
1044  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1045  // both source and target are constant stride and have multiple
1046  // columns.
1047  if (numSameIDs > 0) {
1048  const std::pair<size_t, size_t> rows (0, numSameIDs);
1049  if (copyOnHost) {
1050  auto tgt_h = this->getLocalViewHost ();
1051  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1052 
// whichVectors_ maps logical column j to its physical column when the
// MultiVector is a noncontiguous column selection.
1053  for (size_t j = 0; j < numCols; ++j) {
1054  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1055  const size_t srcCol =
1056  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1057 
1058  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1059  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1060  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1061  }
1062  }
1063  else { // copy on device
1064  auto tgt_d = this->getLocalViewDevice ();
1065  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1066 
1067  for (size_t j = 0; j < numCols; ++j) {
1068  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1069  const size_t srcCol =
1070  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1071 
1072  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1073  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1074  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
1075  }
1076  }
1077  }
1078 
1079  // For the remaining GIDs, execute the permutations. This may
1080  // involve noncontiguous access of both source and destination
1081  // vectors, depending on the LID lists.
1082  //
1083  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1084  // the same process, this merges their values by replacement of
1085  // the last encountered GID, not by the specified merge rule
1086  // (such as ADD).
1087 
1088  // If there are no permutations, we are done
1089  if (permuteFromLIDs.extent (0) == 0 ||
1090  permuteToLIDs.extent (0) == 0) {
1091  if (verbose) {
1092  std::ostringstream os;
1093  os << *prefix << "No permutations. Done!" << endl;
1094  std::cerr << os.str ();
1095  }
1096  return;
1097  }
1098 
1099  if (verbose) {
1100  std::ostringstream os;
1101  os << *prefix << "Permute" << endl;
1102  std::cerr << os.str ();
1103  }
1105  // We could in theory optimize for the case where exactly one of
1106  // them is constant stride, but we don't currently do that.
1107  const bool nonConstStride =
1108  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1109 
1110  if (verbose) {
1111  std::ostringstream os;
1112  os << *prefix << "nonConstStride="
1113  << (nonConstStride ? "true" : "false") << endl;
1114  std::cerr << os.str ();
1115  }
1116 
1117  // We only need the "which vectors" arrays if either the source or
1118  // target MV is not constant stride. Since we only have one
1119  // kernel that must do double-duty, we have to create a "which
1120  // vectors" array for the MV that _is_ constant stride.
1121  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1122  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1123  if (nonConstStride) {
// A constant-stride target still needs an identity column map
// (0, 1, ..., numCols-1) for the variable-stride kernel.
1124  if (this->whichVectors_.size () == 0) {
1125  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1126  tmpTgt.modify_host ();
1127  for (size_t j = 0; j < numCols; ++j) {
1128  tmpTgt.h_view(j) = j;
1129  }
1130  if (! copyOnHost) {
1131  tmpTgt.sync_device ();
1132  }
1133  tgtWhichVecs = tmpTgt;
1134  }
1135  else {
1136  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1137  tgtWhichVecs =
1138  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1139  "tgtWhichVecs",
1140  copyOnHost);
1141  }
1142  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1143  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1144  this->getNumVectors (),
1145  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1146  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1147  this->getNumVectors () << ".");
1148 
1149  if (sourceMV.whichVectors_.size () == 0) {
1150  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1151  tmpSrc.modify_host ();
1152  for (size_t j = 0; j < numCols; ++j) {
1153  tmpSrc.h_view(j) = j;
1154  }
1155  if (! copyOnHost) {
1156  tmpSrc.sync_device ();
1157  }
1158  srcWhichVecs = tmpSrc;
1159  }
1160  else {
1161  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1162  sourceMV.whichVectors_ ();
1163  srcWhichVecs =
1164  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1165  "srcWhichVecs",
1166  copyOnHost);
1167  }
1168  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1169  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1170  sourceMV.getNumVectors (), std::logic_error,
1171  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1172  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1173  << ".");
1174  }
1175 
1176  if (copyOnHost) { // permute on host too
1177  if (verbose) {
1178  std::ostringstream os;
1179  os << *prefix << "Get permute LIDs on host" << std::endl;
1180  std::cerr << os.str ();
1181  }
1182  auto tgt_h = this->getLocalViewHost ();
1183  auto src_h = create_const_view (sourceMV.getLocalViewHost ());
1184 
// DistObject guarantees the LID lists are sync'd where we need them;
// assert rather than sync, since the DualViews are const.
1185  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1186  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1187  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1188  auto permuteFromLIDs_h =
1189  create_const_view (permuteFromLIDs.view_host ());
1190 
1191  if (verbose) {
1192  std::ostringstream os;
1193  os << *prefix << "Permute on host" << endl;
1194  std::cerr << os.str ();
1195  }
1196  if (nonConstStride) {
1197  // No need to sync first, because copyOnHost argument to
1198  // getDualViewCopyFromArrayView puts them in the right place.
1199  auto tgtWhichVecs_h =
1200  create_const_view (tgtWhichVecs.view_host ());
1201  auto srcWhichVecs_h =
1202  create_const_view (srcWhichVecs.view_host ());
1203  permute_array_multi_column_variable_stride (tgt_h, src_h,
1204  permuteToLIDs_h,
1205  permuteFromLIDs_h,
1206  tgtWhichVecs_h,
1207  srcWhichVecs_h, numCols);
1208  }
1209  else {
1210  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1211  permuteFromLIDs_h, numCols);
1212  }
1213  }
1214  else { // permute on device
1215  if (verbose) {
1216  std::ostringstream os;
1217  os << *prefix << "Get permute LIDs on device" << endl;
1218  std::cerr << os.str ();
1219  }
1220  auto tgt_d = this->getLocalViewDevice ();
1221  auto src_d = create_const_view (sourceMV.getLocalViewDevice ());
1222 
1223  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1224  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1225  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1226  auto permuteFromLIDs_d =
1227  create_const_view (permuteFromLIDs.view_device ());
1228 
1229  if (verbose) {
1230  std::ostringstream os;
1231  os << *prefix << "Permute on device" << endl;
1232  std::cerr << os.str ();
1233  }
1234  if (nonConstStride) {
1235  // No need to sync first, because copyOnHost argument to
1236  // getDualViewCopyFromArrayView puts them in the right place.
1237  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1238  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1239  permute_array_multi_column_variable_stride (tgt_d, src_d,
1240  permuteToLIDs_d,
1241  permuteFromLIDs_d,
1242  tgtWhichVecs_d,
1243  srcWhichVecs_d, numCols);
1244  }
1245  else {
1246  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1247  permuteFromLIDs_d, numCols);
1248  }
1249  }
1250 
1251  if (verbose) {
1252  std::ostringstream os;
1253  os << *prefix << "Done!" << endl;
1254  std::cerr << os.str ();
1255  }
1256  }
1257 
1258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1259  void
1261 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1262  packAndPrepareNew
1263 #else // TPETRA_ENABLE_DEPRECATED_CODE
1264  packAndPrepare
1265 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1266  (const SrcDistObject& sourceObj,
1267  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1268  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1269  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1270  size_t& constantNumPackets,
1271  Distributor & /* distor */ )
1272  {
1273  using ::Tpetra::Details::Behavior;
1274  using ::Tpetra::Details::ProfilingRegion;
1276  using Kokkos::Compat::create_const_view;
1277  using Kokkos::Compat::getKokkosViewDeepCopy;
1278  using std::endl;
1280  const char tfecfFuncName[] = "packAndPrepare: ";
1281  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1282 
1283  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1284  // have the option to check indices. We do so when Tpetra is in
1285  // debug mode. It is in debug mode by default in a debug build,
1286  // but you may control this at run time, before launching the
1287  // executable, by setting the TPETRA_DEBUG environment variable to
1288  // "1" (or "TRUE").
1289  const bool debugCheckIndices = Behavior::debug ();
1290  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1291  // environment variable to "1" (or "TRUE") for copious debug
1292  // output to std::cerr on every MPI process. This is unwise for
1293  // runs with large numbers of MPI processes.
1294  const bool printDebugOutput = Behavior::verbose ();
1295 
1296  std::unique_ptr<std::string> prefix;
1297  if (printDebugOutput) {
1298  auto map = this->getMap ();
1299  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1300  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1301  std::ostringstream os;
1302  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1303  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1304  os << "Start" << endl;
1305  std::cerr << os.str ();
1306  }
1307 
1308  // We've already called checkSizes(), so this cast must succeed.
1309  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1310 
1311  const size_t numCols = sourceMV.getNumVectors ();
1312 
1313  // This spares us from needing to fill numExportPacketsPerLID.
1314  // Setting constantNumPackets to a nonzero value signals that
1315  // all packets have the same number of entries.
1316  constantNumPackets = numCols;
1318  // If we have no exports, there is nothing to do. Make sure this
1319  // goes _after_ setting constantNumPackets correctly.
1320  if (exportLIDs.extent (0) == 0) {
1321  if (printDebugOutput) {
1322  std::ostringstream os;
1323  os << *prefix << "No exports on this proc, DONE" << std::endl;
1324  std::cerr << os.str ();
1325  }
1326  return;
1327  }
1328 
1329  /* The layout in the export for MultiVectors is as follows:
1330  exports = { all of the data from row exportLIDs.front() ;
1331  ....
1332  all of the data from row exportLIDs.back() }
1333  This doesn't have the best locality, but is necessary because
1334  the data for a Packet (all data associated with an LID) is
1335  required to be contiguous. */
1336 
1337  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1338  // packing scheme in the above comment? The data going to a
1339  // particular process must be contiguous, of course, but those
1340  // data could include entries from multiple LIDs. DistObject just
1341  // needs to know how to index into that data. Kokkos is good at
1342  // decoupling storage intent from data layout choice.
1343 
1344  const size_t numExportLIDs = exportLIDs.extent (0);
1345  const size_t newExportsSize = numCols * numExportLIDs;
1346  if (printDebugOutput) {
1347  std::ostringstream os;
1348  os << *prefix << "realloc: "
1349  << "numExportLIDs: " << numExportLIDs
1350  << ", exports.extent(0): " << exports.extent (0)
1351  << ", newExportsSize: " << newExportsSize << std::endl;
1352  std::cerr << os.str ();
1353  }
1354  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1355 
1356  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1357  // sync it. Pack it where it's currently sync'd.
1358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1359  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1360  std::logic_error, "Input MultiVector needs sync to both host "
1361  "and device.");
1362  const bool packOnHost = sourceMV.need_sync_device ();
1363  auto src_dev = sourceMV.getLocalViewHost ();
1364  auto src_host = sourceMV.getLocalViewDevice ();
1365  if (printDebugOutput) {
1366  std::ostringstream os;
1367  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1368  std::cerr << os.str ();
1369  }
1370 
1371  // Mark 'exports' here, since we might have resized it above.
1372  // Resizing currently requires calling the constructor, which
1373  // clears out the 'modified' flags.
1374  if (packOnHost) {
1375  exports.modify_host ();
1376  }
1377  else {
1378  exports.modify_device ();
1379  }
1380 
1381  if (numCols == 1) { // special case for one column only
1382  // MultiVector always represents a single column with constant
1383  // stride, but it doesn't hurt to implement both cases anyway.
1384  //
1385  // ETP: I'm not sure I agree with the above statement. Can't a single-
1386  // column multivector be a subview of another multi-vector, in which case
1387  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1388  // separately.
1389  //
1390  // mfh 18 Jan 2016: In answer to ETP's comment above:
1391  // MultiVector treats single-column MultiVectors created using a
1392  // "nonconstant stride constructor" as a special case, and makes
1393  // them constant stride (by making whichVectors_ have length 0).
1394  if (sourceMV.isConstantStride ()) {
1395  using KokkosRefactor::Details::pack_array_single_column;
1396  if (printDebugOutput) {
1397  std::ostringstream os;
1398  os << *prefix << "Pack numCols=1 const stride" << endl;
1399  std::cerr << os.str ();
1400  }
1401  if (packOnHost) {
1402  pack_array_single_column (exports.view_host (),
1403  create_const_view (src_host),
1404  exportLIDs.view_host (),
1405  0,
1406  debugCheckIndices);
1407  }
1408  else { // pack on device
1409  pack_array_single_column (exports.view_device (),
1410  create_const_view (src_dev),
1411  exportLIDs.view_device (),
1412  0,
1413  debugCheckIndices);
1414  }
1415  }
1416  else {
1417  using KokkosRefactor::Details::pack_array_single_column;
1418  if (printDebugOutput) {
1419  std::ostringstream os;
1420  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1421  std::cerr << os.str ();
1422  }
1423  if (packOnHost) {
1424  pack_array_single_column (exports.view_host (),
1425  create_const_view (src_host),
1426  exportLIDs.view_host (),
1427  sourceMV.whichVectors_[0],
1428  debugCheckIndices);
1429  }
1430  else { // pack on device
1431  pack_array_single_column (exports.view_device (),
1432  create_const_view (src_dev),
1433  exportLIDs.view_device (),
1434  sourceMV.whichVectors_[0],
1435  debugCheckIndices);
1436  }
1437  }
1438  }
1439  else { // the source MultiVector has multiple columns
1440  if (sourceMV.isConstantStride ()) {
1441  using KokkosRefactor::Details::pack_array_multi_column;
1442  if (printDebugOutput) {
1443  std::ostringstream os;
1444  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1445  std::cerr << os.str ();
1446  }
1447  if (packOnHost) {
1448  pack_array_multi_column (exports.view_host (),
1449  create_const_view (src_host),
1450  exportLIDs.view_host (),
1451  numCols,
1452  debugCheckIndices);
1453  }
1454  else { // pack on device
1455  pack_array_multi_column (exports.view_device (),
1456  create_const_view (src_dev),
1457  exportLIDs.view_device (),
1458  numCols,
1459  debugCheckIndices);
1460  }
1461  }
1462  else {
1463  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1464  if (printDebugOutput) {
1465  std::ostringstream os;
1466  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1467  << endl;
1468  std::cerr << os.str ();
1469  }
1470  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1471  // whichVectors_ can be expensive, but pack and unpack for
1472  // nonconstant-stride MultiVectors is slower anyway.
1473  using IST = impl_scalar_type;
1474  using DV = Kokkos::DualView<IST*, device_type>;
1475  using HES = typename DV::t_host::execution_space;
1476  using DES = typename DV::t_dev::execution_space;
1477  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1478  if (packOnHost) {
1479  pack_array_multi_column_variable_stride
1480  (exports.view_host (),
1481  create_const_view (src_host),
1482  exportLIDs.view_host (),
1483  getKokkosViewDeepCopy<HES> (whichVecs),
1484  numCols,
1485  debugCheckIndices);
1486  }
1487  else { // pack on device
1488  pack_array_multi_column_variable_stride
1489  (exports.view_device (),
1490  create_const_view (src_dev),
1491  exportLIDs.view_device (),
1492  getKokkosViewDeepCopy<DES> (whichVecs),
1493  numCols,
1494  debugCheckIndices);
1495  }
1496  }
1497  }
1498 
1499  if (printDebugOutput) {
1500  std::ostringstream os;
1501  os << *prefix << "Done!" << endl;
1502  std::cerr << os.str ();
1503  }
1504  }
1505 
1506 
1507  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1508  void
1509  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1510 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1511  unpackAndCombineNew
1512 #else // TPETRA_ENABLE_DEPRECATED_CODE
1513  unpackAndCombine
1514 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1515  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1516  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1517  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1518  const size_t constantNumPackets,
1519  Distributor& /* distor */,
1520  const CombineMode CM)
1521  {
1522  using ::Tpetra::Details::Behavior;
1523  using ::Tpetra::Details::ProfilingRegion;
1524  using KokkosRefactor::Details::unpack_array_multi_column;
1525  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1526  using Kokkos::Compat::getKokkosViewDeepCopy;
1527  using std::endl;
1528  using IST = impl_scalar_type;
1529  const char tfecfFuncName[] = "unpackAndCombine: ";
1530  ProfilingRegion regionUAC ("Tpetra::MultiVector::unpackAndCombine");
1531 
1532  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1533  // have the option to check indices. We do so when Tpetra is in
1534  // debug mode. It is in debug mode by default in a debug build,
1535  // but you may control this at run time, before launching the
1536  // executable, by setting the TPETRA_DEBUG environment variable to
1537  // "1" (or "TRUE").
1538  const bool debugCheckIndices = Behavior::debug ();
1540  const bool printDebugOutput = Behavior::verbose ();
1541  std::unique_ptr<std::string> prefix;
1542  if (printDebugOutput) {
1543  auto map = this->getMap ();
1544  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1545  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1546  std::ostringstream os;
1547  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1548  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1549  os << "Start" << endl;
1550  std::cerr << os.str ();
1551  }
1552 
1553  // If we have no imports, there is nothing to do
1554  if (importLIDs.extent (0) == 0) {
1555  if (printDebugOutput) {
1556  std::ostringstream os;
1557  os << *prefix << "No imports. Done!" << endl;
1558  }
1559  return;
1560  }
1561 
1562  const size_t numVecs = getNumVectors ();
1563  if (debugCheckIndices) {
1564  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1565  (static_cast<size_t> (imports.extent (0)) !=
1566  numVecs * importLIDs.extent (0),
1567  std::runtime_error,
1568  "imports.extent(0) = " << imports.extent (0)
1569  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1570  << " * " << importLIDs.extent (0) << " = "
1571  << numVecs * importLIDs.extent (0) << ".");
1572 
1573  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1574  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1575  "constantNumPackets input argument must be nonzero.");
1576 
1577  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1578  (static_cast<size_t> (numVecs) !=
1579  static_cast<size_t> (constantNumPackets),
1580  std::runtime_error, "constantNumPackets must equal numVecs.");
1581  }
1582 
1583  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1584  // the memory space in which the imports buffer was last modified.
1585  // DistObject::doTransferNew gets to decide this.
1586  const bool unpackOnHost = imports.need_sync_device ();
1588  if (printDebugOutput) {
1589  std::ostringstream os;
1590  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1591  << endl;
1592  std::cerr << os.str ();
1593  }
1594 
1595  // We have to sync before modifying, because this method may read
1596  // as well as write (depending on the CombineMode).
1597  if (unpackOnHost) {
1598  if (this->need_sync_host ()) {
1599  this->sync_host ();
1600  }
1601  this->modify_host ();
1602  }
1603  else {
1604  if (this->need_sync_device ()) {
1605  this->sync_device ();
1606  }
1607  this->modify_device ();
1608  }
1609  auto X_d = this->getLocalViewDevice ();
1610  auto X_h = this->getLocalViewHost ();
1611  auto imports_d = imports.view_device ();
1612  auto imports_h = imports.view_host ();
1613  auto importLIDs_d = importLIDs.view_device ();
1614  auto importLIDs_h = importLIDs.view_host ();
1615 
1616  Kokkos::DualView<size_t*, device_type> whichVecs;
1617  if (! isConstantStride ()) {
1618  Kokkos::View<const size_t*, Kokkos::HostSpace,
1619  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1620  numVecs);
1621  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1622  if (unpackOnHost) {
1623  whichVecs.modify_host ();
1624  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1625  }
1626  else {
1627  whichVecs.modify_device ();
1628  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1629  }
1630  }
1631  auto whichVecs_d = whichVecs.view_device ();
1632  auto whichVecs_h = whichVecs.view_host ();
1633 
1634  /* The layout in the export for MultiVectors is as follows:
1635  imports = { all of the data from row exportLIDs.front() ;
1636  ....
1637  all of the data from row exportLIDs.back() }
1638  This doesn't have the best locality, but is necessary because
1639  the data for a Packet (all data associated with an LID) is
1640  required to be contiguous. */
1641 
1642  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1643  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1644  using host_exec_space = typename dual_view_type::t_host::execution_space;
1645 
1646  // This fixes GitHub Issue #4418.
1647  const bool use_atomic_updates = unpackOnHost ?
1648  host_exec_space::concurrency () != 1 :
1649  dev_exec_space::concurrency () != 1;
1650 
1651  if (printDebugOutput) {
1652  std::ostringstream os;
1653  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1654  std::cerr << os.str ();
1655  }
1656 
1657  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1658  // custom combine modes, start editing here.
1659 
1660  if (CM == INSERT || CM == REPLACE) {
1661  using op_type = KokkosRefactor::Details::InsertOp<IST>;
1662  if (isConstantStride ()) {
1663  if (unpackOnHost) {
1664  unpack_array_multi_column (host_exec_space (),
1665  X_h, imports_h, importLIDs_h,
1666  op_type (), numVecs,
1667  use_atomic_updates,
1668  debugCheckIndices);
1669 
1670  }
1671  else { // unpack on device
1672  unpack_array_multi_column (dev_exec_space (),
1673  X_d, imports_d, importLIDs_d,
1674  op_type (), numVecs,
1675  use_atomic_updates,
1676  debugCheckIndices);
1677  }
1678  }
1679  else { // not constant stride
1680  if (unpackOnHost) {
1681  unpack_array_multi_column_variable_stride (host_exec_space (),
1682  X_h, imports_h,
1683  importLIDs_h,
1684  whichVecs_h,
1685  op_type (),
1686  numVecs,
1687  use_atomic_updates,
1688  debugCheckIndices);
1689  }
1690  else { // unpack on device
1691  unpack_array_multi_column_variable_stride (dev_exec_space (),
1692  X_d, imports_d,
1693  importLIDs_d,
1694  whichVecs_d,
1695  op_type (),
1696  numVecs,
1697  use_atomic_updates,
1698  debugCheckIndices);
1699  }
1700  }
1701  }
1702  else if (CM == ADD) {
1703  using op_type = KokkosRefactor::Details::AddOp<IST>;
1704  if (isConstantStride ()) {
1705  if (unpackOnHost) {
1706  unpack_array_multi_column (host_exec_space (),
1707  X_h, imports_h, importLIDs_h,
1708  op_type (), numVecs,
1709  use_atomic_updates,
1710  debugCheckIndices);
1711  }
1712  else { // unpack on device
1713  unpack_array_multi_column (dev_exec_space (),
1714  X_d, imports_d, importLIDs_d,
1715  op_type (), numVecs,
1716  use_atomic_updates,
1717  debugCheckIndices);
1718  }
1719  }
1720  else { // not constant stride
1721  if (unpackOnHost) {
1722  unpack_array_multi_column_variable_stride (host_exec_space (),
1723  X_h, imports_h,
1724  importLIDs_h,
1725  whichVecs_h,
1726  op_type (),
1727  numVecs,
1728  use_atomic_updates,
1729  debugCheckIndices);
1730  }
1731  else { // unpack on device
1732  unpack_array_multi_column_variable_stride (dev_exec_space (),
1733  X_d, imports_d,
1734  importLIDs_d,
1735  whichVecs_d,
1736  op_type (),
1737  numVecs,
1738  use_atomic_updates,
1739  debugCheckIndices);
1740  }
1741  }
1742  }
1743  else if (CM == ABSMAX) {
1744  using op_type = KokkosRefactor::Details::AbsMaxOp<IST>;
1745  if (isConstantStride ()) {
1746  if (unpackOnHost) {
1747  unpack_array_multi_column (host_exec_space (),
1748  X_h, imports_h, importLIDs_h,
1749  op_type (), numVecs,
1750  use_atomic_updates,
1751  debugCheckIndices);
1752  }
1753  else { // unpack on device
1754  unpack_array_multi_column (dev_exec_space (),
1755  X_d, imports_d, importLIDs_d,
1756  op_type (), numVecs,
1757  use_atomic_updates,
1758  debugCheckIndices);
1759  }
1760  }
1761  else {
1762  if (unpackOnHost) {
1763  unpack_array_multi_column_variable_stride (host_exec_space (),
1764  X_h, imports_h,
1765  importLIDs_h,
1766  whichVecs_h,
1767  op_type (),
1768  numVecs,
1769  use_atomic_updates,
1770  debugCheckIndices);
1771  }
1772  else { // unpack on device
1773  unpack_array_multi_column_variable_stride (dev_exec_space (),
1774  X_d, imports_d,
1775  importLIDs_d,
1776  whichVecs_d,
1777  op_type (),
1778  numVecs,
1779  use_atomic_updates,
1780  debugCheckIndices);
1781  }
1782  }
1783  }
1784  else {
1785  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1786  (true, std::logic_error, "Invalid CombineMode");
1787  }
1788  }
1789  else {
1790  if (printDebugOutput) {
1791  std::ostringstream os;
1792  os << *prefix << "Nothing to unpack" << endl;
1793  std::cerr << os.str ();
1794  }
1795  }
1796 
1797  if (printDebugOutput) {
1798  std::ostringstream os;
1799  os << *prefix << "Done!" << endl;
1800  std::cerr << os.str ();
1801  }
1802  }
1803 
1804  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1805  size_t
1807  getNumVectors () const
1808  {
1809  if (isConstantStride ()) {
1810  return static_cast<size_t> (view_.extent (1));
1811  } else {
1812  return static_cast<size_t> (whichVectors_.size ());
1813  }
1814  }
1815 
1816  namespace { // (anonymous)
1817 
1818  template<class RV>
1819  void
1820  gblDotImpl (const RV& dotsOut,
1821  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1822  const bool distributed)
1823  {
1824  using Teuchos::REDUCE_MAX;
1825  using Teuchos::REDUCE_SUM;
1826  using Teuchos::reduceAll;
1827  typedef typename RV::non_const_value_type dot_type;
1828 
1829  const size_t numVecs = dotsOut.extent (0);
1830 
1831  // If the MultiVector is distributed over multiple processes, do
1832  // the distributed (interprocess) part of the dot product. We
1833  // assume that the MPI implementation can read from and write to
1834  // device memory.
1835  //
1836  // replaceMap() may have removed some processes. Those
1837  // processes have a null Map. They must not participate in any
1838  // collective operations. We ask first whether the Map is null,
1839  // because isDistributed() defers that question to the Map. We
1840  // still compute and return local dots for processes not
1841  // participating in collective operations; those probably don't
1842  // make any sense, but it doesn't hurt to do them, since it's
1843  // illegal to call dot() on those processes anyway.
1844  if (distributed && ! comm.is_null ()) {
1845  // The calling process only participates in the collective if
1846  // both the Map and its Comm on that process are nonnull.
1847  const int nv = static_cast<int> (numVecs);
1848  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
1849 
1850  if (commIsInterComm) {
1851  // If comm is an intercomm, then we may not alias input and
1852  // output buffers, so we have to make a copy of the local
1853  // sum.
1854  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1855  Kokkos::deep_copy (lclDots, dotsOut);
1856  const dot_type* const lclSum = lclDots.data ();
1857  dot_type* const gblSum = dotsOut.data ();
1858  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1859  }
1860  else {
1861  dot_type* const inout = dotsOut.data ();
1862  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1863  }
1864  }
1865  }
1866  } // namespace (anonymous)
1867 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier and first-parameter line
  // (presumably "MultiVector<...>::dot (const MultiVector<...>& A,")
  // appear to have been dropped by the listing extraction; the body
  // below reads 'A' -- confirm against the original source.
  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
  {
    using ::Tpetra::Details::Behavior;
    using Kokkos::subview;
    using Teuchos::Comm;
    using Teuchos::null;
    using Teuchos::RCP;
    // View of all the dot product results.
    typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
    // NOTE(review): a typedef (presumably "typedef MultiVector<...> MV;")
    // appears to have been dropped here; 'MV' is used in the casts below.
    typedef typename dual_view_type::t_dev XMV;
    const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";

    ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");

    const size_t numVecs = this->getNumVectors ();
    if (numVecs == 0) {
      return; // nothing to do
    }
    const size_t lclNumRows = this->getLocalLength ();
    const size_t numDots = static_cast<size_t> (dots.extent (0));
    const bool debug = Behavior::debug ();

    // Map compatibility is only checked in debug mode (it may require
    // communication in the general case).
    if (debug) {
      const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
        (! compat, std::invalid_argument, "'*this' MultiVector is not "
         "compatible with the input MultiVector A. We only test for this "
         "in debug mode.");
    }

    // FIXME (mfh 11 Jul 2014) These exception tests may not
    // necessarily be thrown on all processes consistently. We should
    // instead pass along error state with the inner product. We
    // could do this by setting an extra slot to
    // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
    // final sum should be
    // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      lclNumRows != A.getLocalLength (), std::runtime_error,
      "MultiVectors do not have the same local length. "
      "this->getLocalLength() = " << lclNumRows << " != "
      "A.getLocalLength() = " << A.getLocalLength () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      numVecs != A.getNumVectors (), std::runtime_error,
      "MultiVectors must have the same number of columns (vectors). "
      "this->getNumVectors() = " << numVecs << " != "
      "A.getNumVectors() = " << A.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      numDots != numVecs, std::runtime_error,
      "The output array 'dots' must have the same number of entries as the "
      "number of columns (vectors) in *this and A. dots.extent(0) = " <<
      numDots << " != this->getNumVectors() = " << numVecs << ".");

    // Restrict the output view to exactly numVecs entries.
    const std::pair<size_t, size_t> colRng (0, numVecs);
    RV dotsOut = subview (dots, colRng);
    RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
      this->getMap ()->getComm ();

    // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
    if (this->need_sync_device ()) {
      const_cast<MV*>(this)->sync_device ();
    }
    if (A.need_sync_device ()) {
      const_cast<MV&>(A).sync_device ();
    }

    auto thisView = this->getLocalViewDevice ();
    auto A_view = A.getLocalViewDevice ();

    // Compute the per-process (local) dot products, then finish with
    // an all-reduce over the communicator (gblDotImpl is a no-op for
    // nondistributed vectors or a null comm).
    ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
                                        this->whichVectors_.getRawPtr (),
                                        A.whichVectors_.getRawPtr (),
                                        this->isConstantStride (), A.isConstantStride ());
    gblDotImpl (dotsOut, comm, this->isDistributed ());
  }
1948 
  namespace { // (anonymous)
    // Compute the global dot product of the first columns of x and y,
    // using the 1-D KokkosBlas::dot kernel.  Helper for the
    // single-column, constant-stride fast path of MultiVector::dot.
    template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
    // NOTE(review): the return-type line and the second-parameter line
    // (presumably the dot_type return and "const MultiVector<...>& y)"),
    // plus a "using MV = ...;" line below, appear to have been dropped
    // by the listing extraction -- confirm against the original source.
    multiVectorSingleColumnDot (MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
    {
      using ::Tpetra::Details::ProfilingRegion;
      using dot_type = typename MV::dot_type;
      ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");

      auto map = x.getMap ();
      Teuchos::RCP<const Teuchos::Comm<int> > comm =
        map.is_null () ? Teuchos::null : map->getComm ();
      // Processes excluded from the Map's communicator (e.g., by
      // replaceMap) return zero without communicating.
      if (comm.is_null ()) {
        return Kokkos::ArithTraits<dot_type>::zero ();
      }
      else {
        using LO = LocalOrdinal;
        // The min just ensures that we don't overwrite memory that
        // doesn't belong to us, in the erroneous input case where x
        // and y have different numbers of rows.
        const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
                                                         y.getLocalLength ()));
        const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
        dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
        dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();

        // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
        if (x.need_sync_device ()) {
          x.sync_device ();
        }
        if (y.need_sync_device ()) {
          const_cast<MV&>(y).sync_device ();
        }

        // NOTE(review): marking x modified on device looks unnecessary
        // for a read-only dot product -- confirm intent upstream.
        x.modify_device ();
        auto x_2d = x.getLocalViewDevice ();
        auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
        auto y_2d = y.getLocalViewDevice ();
        auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
        lclDot = KokkosBlas::dot (x_1d, y_1d);

        // Distributed: sum the local dots over all processes.
        if (x.isDistributed ()) {
          using Teuchos::outArg;
          using Teuchos::REDUCE_SUM;
          using Teuchos::reduceAll;
          reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
        }
        else {
          gblDot = lclDot;
        }
        return gblDot;
      }
    }
  } // namespace (anonymous)
2005 
2006 
2007 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier and first-parameter line
  // (presumably "MultiVector<...>::dot (const MultiVector<...>& A,")
  // appear to have been dropped by the listing extraction; the body
  // below reads 'A' -- confirm against the original source.
  const Teuchos::ArrayView<dot_type>& dots) const
  {
    // NOTE(review): a line (presumably a "typedef ... MV;") appears to
    // have been dropped here; 'MV' is used in the const_cast below.
    const char tfecfFuncName[] = "dot: ";
    ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");

    const size_t numVecs = this->getNumVectors ();
    const size_t lclNumRows = this->getLocalLength ();
    const size_t numDots = static_cast<size_t> (dots.size ());

    // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
    // not necessarily be thrown on all processes consistently. We
    // keep them for now, because MultiVector's unit tests insist on
    // them. In the future, we should instead pass along error state
    // with the inner product. We could do this by setting an extra
    // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
    // The final sum should be
    // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
      (lclNumRows != A.getLocalLength (), std::runtime_error,
       "MultiVectors do not have the same local length. "
       "this->getLocalLength() = " << lclNumRows << " != "
       "A.getLocalLength() = " << A.getLocalLength () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
      (numVecs != A.getNumVectors (), std::runtime_error,
       "MultiVectors must have the same number of columns (vectors). "
       "this->getNumVectors() = " << numVecs << " != "
       "A.getNumVectors() = " << A.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
      (numDots != numVecs, std::runtime_error,
       "The output array 'dots' must have the same number of entries as the "
       "number of columns (vectors) in *this and A. dots.extent(0) = " <<
       numDots << " != this->getNumVectors() = " << numVecs << ".");

    // Fast path: a single constant-stride column reduces to a 1-D
    // KokkosBlas dot.  Otherwise delegate to the Kokkos::View
    // overload, wrapping the output array in an unmanaged host View.
    if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
      const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
      dots[0] = gblDot;
    }
    else {
      this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
    }
  }
2054 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::") and
  // one line just inside the body (presumably "using MV = ...;") appear
  // to have been dropped by the listing extraction; 'MV' is used below.
  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
  {
    // Compute the 2-norm of each column, writing one result per column
    // to the host output array 'norms'.
    using ::Tpetra::Details::NORM_TWO;
    using ::Tpetra::Details::ProfilingRegion;
    ProfilingRegion region ("Tpetra::MV::norm2 (host output)");

    // The function needs to be able to sync X.
    MV& X = const_cast<MV&> (*this);
    multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
  }
2069 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
  {
    // Delegate to the Teuchos::ArrayView overload.  The View's memory
    // lives in HostSpace, so a nonowning ArrayView over its data is
    // valid for the duration of the call.
    Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
    this->norm2 (norms_av);
  }
2078 
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void TPETRA_DEPRECATED
  // NOTE(review): the class qualifier and first-parameter line
  // (presumably "MultiVector<...>::normWeighted (const MultiVector<...>&
  // weights,") appear to have been dropped by the listing extraction;
  // the body below reads 'weights' -- confirm against the original.
  const Teuchos::ArrayView<mag_type> &norms) const
  {
    using Kokkos::ALL;
    using Kokkos::subview;
    using Teuchos::Comm;
    using Teuchos::null;
    using Teuchos::RCP;
    using Teuchos::reduceAll;
    using Teuchos::REDUCE_SUM;
    typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
    typedef Kokkos::ArithTraits<mag_type> ATM;
    typedef Kokkos::View<mag_type*, Kokkos::HostSpace> norms_view_type;
    // NOTE(review): a line (presumably "typedef ... MV;") appears to
    // have been dropped here; 'MV' is used in the const_casts below.
    const char tfecfFuncName[] = "normWeighted: ";

    const size_t numVecs = this->getNumVectors ();
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
      "norms.size() = " << norms.size () << " != this->getNumVectors() = "
      << numVecs << ".");

    // A single weight column applies to all columns of *this.
    const bool OneW = (weights.getNumVectors () == 1);
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
      "The input MultiVector of weights must contain either one column, "
      "or must have the same number of columns as *this. "
      "weights.getNumVectors() = " << weights.getNumVectors ()
      << " and this->getNumVectors() = " << numVecs << ".");

    const bool debug = ::Tpetra::Details::Behavior::debug ();

    if (debug) {
      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
        (! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
         "MultiVectors do not have compatible Maps:" << std::endl
         << "this->getMap(): " << std::endl << *this->getMap()
         << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
    }
    else {
      // Non-debug mode only checks the cheap local-length invariant.
      const size_t lclNumRows = this->getLocalLength ();
      TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
        (lclNumRows != weights.getLocalLength (), std::runtime_error,
         "MultiVectors do not have the same local length.");
    }

    norms_view_type lclNrms ("Tpetra::MV::lclNrms", numVecs);

    // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
    const_cast<MV*> (this)->sync_device ();
    const_cast<MV&> (weights).sync_device ();

    auto X_lcl = this->getLocalViewDevice ();
    auto W_lcl = weights.getLocalViewDevice ();

    // Local kernel: squared weighted 2-norm of each column.  The
    // fused multi-column kernel requires constant stride and a full
    // set of weight columns; otherwise do one column at a time.
    if (isConstantStride () && ! OneW) {
      KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
    }
    else {
      for (size_t j = 0; j < numVecs; ++j) {
        const size_t X_col = this->isConstantStride () ? j :
          this->whichVectors_[j];
        const size_t W_col = OneW ? static_cast<size_t> (0) :
          (weights.isConstantStride () ? j : weights.whichVectors_[j]);
        KokkosBlas::nrm2w_squared (subview (lclNrms, j),
                                   subview (X_lcl, ALL (), X_col),
                                   subview (W_lcl, ALL (), W_col));
      }
    }

    // Final result scales each squared sum by 1/N (N = global number
    // of rows) before the square root.
    const mag_type OneOverN =
      ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
    RCP<const Comm<int> > comm = this->getMap ().is_null () ?
      Teuchos::null : this->getMap ()->getComm ();

    if (! comm.is_null () && this->isDistributed ()) {
      // Assume that MPI can access device memory.
      reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
                                lclNrms.data (), norms.getRawPtr ());
      for (size_t k = 0; k < numVecs; ++k) {
        norms[k] = ATM::sqrt (norms[k] * OneOverN);
      }
    }
    else {
      // Nondistributed (or null comm): local sums are the global sums.
      for (size_t k = 0; k < numVecs; ++k) {
        norms[k] = ATM::sqrt (ATS::magnitude (lclNrms(k)) * OneOverN);
      }
    }
  }
#endif // TPETRA_ENABLE_DEPRECATED_CODE
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::") and
  // one line just inside the body (presumably "using MV = ...;") appear
  // to have been dropped by the listing extraction; 'MV' is used below.
  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
  {
    // Compute the 1-norm of each column, writing one result per column
    // to the host output array 'norms'.
    using ::Tpetra::Details::NORM_ONE;
    using ::Tpetra::Details::ProfilingRegion;
    ProfilingRegion region ("Tpetra::MV::norm1 (host output)");

    // The function needs to be able to sync X.
    MV& X = const_cast<MV&> (*this);
    multiVectorNormImpl (norms.data (), X, NORM_ONE);
  }
2188 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
  {
    // Delegate to the Teuchos::ArrayView overload; the View's memory
    // is in HostSpace, so the nonowning ArrayView is safe.
    Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
    this->norm1 (norms_av);
  }
2197 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::") and
  // one line just inside the body (presumably "using MV = ...;") appear
  // to have been dropped by the listing extraction; 'MV' is used below.
  normInf (const Teuchos::ArrayView<mag_type>& norms) const
  {
    // Compute the infinity-norm (max absolute value) of each column,
    // writing one result per column to the host output array 'norms'.
    using ::Tpetra::Details::NORM_INF;
    using ::Tpetra::Details::ProfilingRegion;
    ProfilingRegion region ("Tpetra::MV::normInf (host output)");

    // The function needs to be able to sync X.
    MV& X = const_cast<MV&> (*this);
    multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
  }
2212 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
  {
    // Delegate to the Teuchos::ArrayView overload; the View's memory
    // is in HostSpace, so the nonowning ArrayView is safe.
    Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
    this->normInf (norms_av);
  }
2221 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
  {
    // Compute the arithmetic mean of each column: (sum of entries) /
    // (global row count), writing one result per column into 'means'.
    // KR FIXME Overload this method to take a View.
    using Kokkos::ALL;
    using Kokkos::subview;
    using Teuchos::Comm;
    using Teuchos::RCP;
    using Teuchos::reduceAll;
    using Teuchos::REDUCE_SUM;
    typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;

    const size_t lclNumRows = this->getLocalLength ();
    const size_t numVecs = this->getNumVectors ();
    const size_t numMeans = static_cast<size_t> (means.size ());

    TEUCHOS_TEST_FOR_EXCEPTION(
      numMeans != numVecs, std::runtime_error,
      "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
      << " != this->getNumVectors() = " << numVecs << ".");

    const std::pair<size_t, size_t> rowRng (0, lclNumRows);
    const std::pair<size_t, size_t> colRng (0, numVecs);

    // Make sure that the final output view has the same layout as the
    // temporary view's HostMirror. Left or Right doesn't matter for
    // a 1-D array anyway; this is just to placate the compiler.
    typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
    typedef Kokkos::View<impl_scalar_type*,
      typename local_view_type::HostMirror::array_layout,
      Kokkos::HostSpace,
      Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
    host_local_view_type meansOut (means.getRawPtr (), numMeans);

    RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
      this->getMap ()->getComm ();

    // If we need sync to device, then host has the most recent version.
    const bool useHostVersion = this->need_sync_device ();
    if (useHostVersion) {
      // DualView was last modified on host, so run the local kernel there.
      auto X_lcl = subview (getLocalViewHost (),
                            rowRng, Kokkos::ALL ());
      // Compute the local sum of each column.
      Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
      if (isConstantStride ()) {
        KokkosBlas::sum (lclSums, X_lcl);
      }
      else {
        // Noncontiguous columns: sum each selected column separately.
        for (size_t j = 0; j < numVecs; ++j) {
          const size_t col = whichVectors_[j];
          KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
        }
      }

      // If there are multiple MPI processes, the all-reduce reads
      // from lclSums, and writes to meansOut. Otherwise, we just
      // copy lclSums into meansOut.
      if (! comm.is_null () && this->isDistributed ()) {
        reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
                   lclSums.data (), meansOut.data ());
      }
      else {
        Kokkos::deep_copy (meansOut, lclSums);
      }
    }
    else {
      // DualView was last modified on device, so run the local kernel there.
      auto X_lcl = subview (this->getLocalViewDevice (),
                            rowRng, Kokkos::ALL ());

      // Compute the local sum of each column.
      Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
      if (isConstantStride ()) {
        KokkosBlas::sum (lclSums, X_lcl);
      }
      else {
        // Noncontiguous columns: sum each selected column separately.
        for (size_t j = 0; j < numVecs; ++j) {
          const size_t col = whichVectors_[j];
          KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
        }
      }

      // If there are multiple MPI processes, the all-reduce reads
      // from lclSums, and writes to meansOut. (We assume that MPI
      // can read device memory.) Otherwise, we just copy lclSums
      // into meansOut.
      if (! comm.is_null () && this->isDistributed ()) {
        reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
                   lclSums.data (), meansOut.data ());
      }
      else {
        Kokkos::deep_copy (meansOut, lclSums);
      }
    }

    // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
    // to the magnitude type, since operator/ (std::complex<T>, int)
    // isn't necessarily defined.
    const impl_scalar_type OneOverN =
      ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
    for (size_t k = 0; k < numMeans; ++k) {
      meansOut(k) = meansOut(k) * OneOverN;
    }
  }
2329 
2330 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  randomize ()
  {
    typedef impl_scalar_type IST;
    typedef Kokkos::Details::ArithTraits<IST> ATS;
    typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
    typedef typename pool_type::generator_type generator_type;

    // Fill with uniformly distributed random values over the widest
    // range the generator supports: [-max, max] for signed scalar
    // types, [0, max] for unsigned ones.  Delegates to the two-argument
    // overload below.
    const IST max = Kokkos::rand<generator_type, IST>::max ();
    const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();

    this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
  }
2346 
2347 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  randomize (const Scalar& minVal, const Scalar& maxVal)
  {
    // Fill every column with uniformly distributed pseudorandom values
    // in [minVal, maxVal], running the fill on device.
    typedef impl_scalar_type IST;
    typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;

    // Seed the pseudorandom number generator using the calling
    // process' rank. This helps decorrelate different process'
    // pseudorandom streams. It's not perfect but it's effective and
    // doesn't require MPI communication. The seed also includes bits
    // from the standard library's rand().
    //
    // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
    // The code below just makes a new seed each time.

    const uint64_t myRank =
      static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
    uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
    unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);

    pool_type rand_pool (seed);
    const IST max = static_cast<IST> (maxVal);
    const IST min = static_cast<IST> (minVal);

    // See #1510. In case diag has already been marked modified on
    // host or device, we need to clear those flags, since the code
    // below works on device.
    this->view_.clear_sync_state();

    this->modify_device ();
    auto thisView = this->getLocalViewDevice ();

    if (isConstantStride ()) {
      Kokkos::fill_random (thisView, rand_pool, min, max);
    }
    else {
      // Noncontiguous columns: fill each selected column separately.
      const size_t numVecs = getNumVectors ();
      for (size_t k = 0; k < numVecs; ++k) {
        const size_t col = whichVectors_[k];
        auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
        Kokkos::fill_random (X_k, rand_pool, min, max);
      }
    }
  }
2394 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  putScalar (const Scalar& alpha)
  {
    // Set every entry of every (selected) column to alpha.
    using ::Tpetra::Details::ProfilingRegion;
    using ::Tpetra::Details::Blas::fill;
    using DES = typename dual_view_type::t_dev::execution_space;
    using HES = typename dual_view_type::t_host::execution_space;
    using LO = LocalOrdinal;
    ProfilingRegion region ("Tpetra::MultiVector::putScalar");

    // We need this cast for cases like Scalar = std::complex<T> but
    // impl_scalar_type = Kokkos::complex<T>.
    const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
    const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
    const LO numVecs = static_cast<LO> (this->getNumVectors ());

    // Modify the most recently updated version of the data. This
    // avoids sync'ing, which could violate users' expectations.
    //
    // If we need sync to device, then host has the most recent version.
    const bool runOnHost = this->need_sync_device ();

    if (! runOnHost) { // last modified in device memory
      this->modify_device ();
      auto X = this->getLocalViewDevice ();
      if (this->isConstantStride ()) {
        fill (DES (), X, theAlpha, lclNumRows, numVecs);
      }
      else {
        // Pass the column indices so only the selected columns are filled.
        fill (DES (), X, theAlpha, lclNumRows, numVecs,
              this->whichVectors_.getRawPtr ());
      }
    }
    else { // last modified in host memory, so modify data there.
      this->modify_host ();
      auto X = this->getLocalViewHost ();
      if (this->isConstantStride ()) {
        fill (HES (), X, theAlpha, lclNumRows, numVecs);
      }
      else {
        // Pass the column indices so only the selected columns are filled.
        fill (HES (), X, theAlpha, lclNumRows, numVecs,
              this->whichVectors_.getRawPtr ());
      }
    }
  }
2443 
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  void
  // NOTE(review): the class qualifier line ("MultiVector<...>::")
  // appears to have been dropped by the listing extraction here.
  replaceMap (const Teuchos::RCP<const map_type>& newMap)
  {
    // Replace this MultiVector's Map with newMap, reallocating the
    // local DualView only when the calling process enters or leaves
    // the set of participating processes (Cases 2 and 3 below).
    using Teuchos::ArrayRCP;
    using Teuchos::Comm;
    using Teuchos::RCP;
    using ST = Scalar;
    using LO = LocalOrdinal;
    using GO = GlobalOrdinal;

    // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
    // it might work if the MV is a column view of another MV.
    // However, things might go wrong when restoring the original
    // Map, so we don't allow this case for now.
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! this->isConstantStride (), std::logic_error,
      "Tpetra::MultiVector::replaceMap: This method does not currently work "
      "if the MultiVector is a column view of another MultiVector (that is, if "
      "isConstantStride() == false).");

    // Case 1: current Map and new Map are both nonnull on this process.
    // Case 2: current Map is nonnull, new Map is null.
    // Case 3: current Map is null, new Map is nonnull.
    // Case 4: both Maps are null: forbidden.
    //
    // Case 1 means that we don't have to do anything on this process,
    // other than assign the new Map. (We always have to do that.)
    // It's an error for the user to supply a Map that requires
    // resizing in this case.
    //
    // Case 2 means that the calling process is in the current Map's
    // communicator, but will be excluded from the new Map's
    // communicator. We don't have to do anything on the calling
    // process; just leave whatever data it may have alone.
    //
    // Case 3 means that the calling process is excluded from the
    // current Map's communicator, but will be included in the new
    // Map's communicator. This means we need to (re)allocate the
    // local DualView if it does not have the right number of rows.
    // If the new number of rows is nonzero, we'll fill the newly
    // allocated local data with zeros, as befits a projection
    // operation.
    //
    // The typical use case for Case 3 is that the MultiVector was
    // first created with the Map with more processes, then that Map
    // was replaced with a Map with fewer processes, and finally the
    // original Map was restored on this call to replaceMap.

#ifdef HAVE_TEUCHOS_DEBUG
    // mfh 28 Mar 2013: We can't check for compatibility across the
    // whole communicator, unless we know that the current and new
    // Maps are nonnull on _all_ participating processes.
    // TEUCHOS_TEST_FOR_EXCEPTION(
    // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
    // std::invalid_argument, "Tpetra::MultiVector::project: "
    // "If the input Map's communicator is compatible (has the same number of "
    // "processes as) the current Map's communicator, then the two Maps must be "
    // "compatible. The replaceMap() method is not for data redistribution; "
    // "use Import or Export for that purpose.");

    // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
    // of the Map, in case the process counts don't match.
#endif // HAVE_TEUCHOS_DEBUG

    if (this->getMap ().is_null ()) { // current Map is null
      // If this->getMap() is null, that means that this MultiVector
      // has already had replaceMap happen to it. In that case, just
      // reallocate the DualView with the right size.

      TEUCHOS_TEST_FOR_EXCEPTION(
        newMap.is_null (), std::invalid_argument,
        "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
        "This probably means that the input Map is incorrect.");

      // Case 3: current Map is null, new Map is nonnull.
      // Reallocate the DualView with the right dimensions.
      const size_t newNumRows = newMap->getNodeNumElements ();
      const size_t origNumRows = view_.extent (0);
      const size_t numCols = this->getNumVectors ();

      if (origNumRows != newNumRows || view_.extent (1) != numCols) {
        view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
      }
    }
    else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
      // I am an excluded process. Reinitialize my data so that I
      // have 0 rows. Keep the number of columns as before.
      const size_t newNumRows = static_cast<size_t> (0);
      const size_t numCols = this->getNumVectors ();
      view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
    }

    // Case 1 (both nonnull) falls through to here: only the Map
    // assignment is needed.
    this->map_ = newMap;
  }
2540 
2541  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2542  void
2544  scale (const Scalar& alpha)
2545  {
2546  using Kokkos::ALL;
2547  using IST = impl_scalar_type;
2548 
2549  const IST theAlpha = static_cast<IST> (alpha);
2550  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2551  return; // do nothing
2552  }
2553  const size_t lclNumRows = getLocalLength ();
2554  const size_t numVecs = getNumVectors ();
2555  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2556  const std::pair<size_t, size_t> colRng (0, numVecs);
2557 
2558  // We can't substitute putScalar(0.0) for scale(0.0), because the
2559  // former will overwrite NaNs present in the MultiVector. The
2560  // semantics of this call require multiplying them by 0, which
2561  // IEEE 754 requires to be NaN.
2562 
2563  // If we need sync to device, then host has the most recent version.
2564  const bool useHostVersion = need_sync_device ();
2565  if (useHostVersion) {
2566  auto Y_lcl = Kokkos::subview (getLocalViewHost (), rowRng, ALL ());
2567  if (isConstantStride ()) {
2568  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2569  }
2570  else {
2571  for (size_t k = 0; k < numVecs; ++k) {
2572  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2573  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2574  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2575  }
2576  }
2577  }
2578  else { // work on device
2579  auto Y_lcl = Kokkos::subview (getLocalViewDevice (), rowRng, ALL ());
2580  if (isConstantStride ()) {
2581  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2582  }
2583  else {
2584  for (size_t k = 0; k < numVecs; ++k) {
2585  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2586  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2587  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2588  }
2589  }
2590  }
2591  }
2592 
2593 
2594  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2595  void
2597  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2598  {
2599  const size_t numVecs = this->getNumVectors ();
2600  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2601  TEUCHOS_TEST_FOR_EXCEPTION(
2602  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2603  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2604  << numVecs << ".");
2605 
2606  // Use a DualView to copy the scaling constants onto the device.
2607  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2608  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2609  k_alphas.modify_host ();
2610  for (size_t i = 0; i < numAlphas; ++i) {
2611  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2612  }
2613  k_alphas.sync_device ();
2614  // Invoke the scale() overload that takes a device View of coefficients.
2615  this->scale (k_alphas.view_device ());
2616  }
2617 
2618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2619  void
2621  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2622  {
2623  using Kokkos::ALL;
2624  using Kokkos::subview;
2625 
2626  const size_t lclNumRows = this->getLocalLength ();
2627  const size_t numVecs = this->getNumVectors ();
2628  TEUCHOS_TEST_FOR_EXCEPTION(
2629  static_cast<size_t> (alphas.extent (0)) != numVecs,
2630  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2631  "alphas.extent(0) = " << alphas.extent (0)
2632  << " != this->getNumVectors () = " << numVecs << ".");
2633  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2634  const std::pair<size_t, size_t> colRng (0, numVecs);
2635 
2636  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2637  // type of the return value of subview. This is because if we
2638  // switch the array layout from LayoutLeft to LayoutRight
2639  // (preferred for performance of block operations), the types
2640  // below won't be valid. (A view of a column of a LayoutRight
2641  // multivector has LayoutStride, not LayoutLeft.)
2642 
2643  // If we need sync to device, then host has the most recent version.
2644  const bool useHostVersion = this->need_sync_device ();
2645  if (useHostVersion) {
2646  // Work in host memory. This means we need to create a host
2647  // mirror of the input View of coefficients.
2648  auto alphas_h = Kokkos::create_mirror_view (alphas);
2649  Kokkos::deep_copy (alphas_h, alphas);
2650 
2651  auto Y_lcl = subview (this->getLocalViewHost (), rowRng, ALL ());
2652  if (isConstantStride ()) {
2653  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2654  }
2655  else {
2656  for (size_t k = 0; k < numVecs; ++k) {
2657  const size_t Y_col = this->isConstantStride () ? k :
2658  this->whichVectors_[k];
2659  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2660  // We don't have to use the entire 1-D View here; we can use
2661  // the version that takes a scalar coefficient.
2662  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2663  }
2664  }
2665  }
2666  else { // Work in device memory, using the input View 'alphas' directly.
2667  auto Y_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2668  if (isConstantStride ()) {
2669  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2670  }
2671  else {
2672  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2673  // as values on host, so copy them to host. Another approach
2674  // would be to fix scal() so that it takes a 0-D View as the
2675  // second argument.
2676  auto alphas_h = Kokkos::create_mirror_view (alphas);
2677  Kokkos::deep_copy (alphas_h, alphas);
2678 
2679  for (size_t k = 0; k < numVecs; ++k) {
2680  const size_t Y_col = this->isConstantStride () ? k :
2681  this->whichVectors_[k];
2682  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2683  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2684  }
2685  }
2686  }
2687  }
2688 
2689  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2690  void
2692  scale (const Scalar& alpha,
2694  {
2695  using Kokkos::ALL;
2696  using Kokkos::subview;
2698  const char tfecfFuncName[] = "scale: ";
2699 
2700  const size_t lclNumRows = getLocalLength ();
2701  const size_t numVecs = getNumVectors ();
2702 
2703  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2704  lclNumRows != A.getLocalLength (), std::invalid_argument,
2705  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2706  << A.getLocalLength () << ".");
2707  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2708  numVecs != A.getNumVectors (), std::invalid_argument,
2709  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2710  << A.getNumVectors () << ".");
2711 
2712  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2713  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2714  const std::pair<size_t, size_t> colRng (0, numVecs);
2715 
2716  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2717  if (this->need_sync_device ()) {
2718  this->sync_device ();
2719  }
2720  if (A.need_sync_device ()) {
2721  const_cast<MV&>(A).sync_device ();
2722  }
2723 
2724  this->modify_device ();
2725  auto Y_lcl_orig = this->getLocalViewDevice ();
2726  auto X_lcl_orig = A.getLocalViewDevice ();
2727  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2728  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2729 
2730  if (isConstantStride () && A.isConstantStride ()) {
2731  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2732  }
2733  else {
2734  // Make sure that Kokkos only uses the local length for add.
2735  for (size_t k = 0; k < numVecs; ++k) {
2736  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2737  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2738  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2739  auto X_k = subview (X_lcl, ALL (), X_col);
2740 
2741  KokkosBlas::scal (Y_k, theAlpha, X_k);
2742  }
2743  }
2744  }
2745 
2746 
2747 
2748  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2749  void
2752  {
2754  const char tfecfFuncName[] = "reciprocal: ";
2755 
2756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2757  getLocalLength () != A.getLocalLength (), std::runtime_error,
2758  "MultiVectors do not have the same local length. "
2759  "this->getLocalLength() = " << getLocalLength ()
2760  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2761  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2762  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2763  ": MultiVectors do not have the same number of columns (vectors). "
2764  "this->getNumVectors() = " << getNumVectors ()
2765  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2766 
2767  const size_t numVecs = getNumVectors ();
2768 
2769  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2770  if (this->need_sync_device ()) {
2771  this->sync_device ();
2772  }
2773  if (A.need_sync_device ()) {
2774  const_cast<MV&>(A).sync_device ();
2775  }
2776  this->modify_device ();
2777 
2778  auto this_view_dev = this->getLocalViewDevice ();
2779  auto A_view_dev = A.getLocalViewDevice ();
2780 
2781  if (isConstantStride () && A.isConstantStride ()) {
2782  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2783  }
2784  else {
2785  using Kokkos::ALL;
2786  using Kokkos::subview;
2787  for (size_t k = 0; k < numVecs; ++k) {
2788  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2789  auto vector_k = subview (this_view_dev, ALL (), this_col);
2790  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2791  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2792  KokkosBlas::reciprocal (vector_k, vector_Ak);
2793  }
2794  }
2795  }
2796 
2797  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2798  void
2801  {
2803  const char tfecfFuncName[] = "abs";
2804 
2805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2806  getLocalLength () != A.getLocalLength (), std::runtime_error,
2807  ": MultiVectors do not have the same local length. "
2808  "this->getLocalLength() = " << getLocalLength ()
2809  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2810  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2811  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2812  ": MultiVectors do not have the same number of columns (vectors). "
2813  "this->getNumVectors() = " << getNumVectors ()
2814  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2815  const size_t numVecs = getNumVectors ();
2816 
2817  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2818  if (this->need_sync_device ()) {
2819  this->sync_device ();
2820  }
2821  if (A.need_sync_device ()) {
2822  const_cast<MV&>(A).sync_device ();
2823  }
2824  this->modify_device ();
2825 
2826  auto this_view_dev = this->getLocalViewDevice ();
2827  auto A_view_dev = A.getLocalViewDevice ();
2828 
2829  if (isConstantStride () && A.isConstantStride ()) {
2830  KokkosBlas::abs (this_view_dev, A_view_dev);
2831  }
2832  else {
2833  using Kokkos::ALL;
2834  using Kokkos::subview;
2835 
2836  for (size_t k=0; k < numVecs; ++k) {
2837  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2838  auto vector_k = subview (this_view_dev, ALL (), this_col);
2839  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2840  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2841  KokkosBlas::abs (vector_k, vector_Ak);
2842  }
2843  }
2844  }
2845 
2846  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2847  void
2849  update (const Scalar& alpha,
2851  const Scalar& beta)
2852  {
2853  const char tfecfFuncName[] = "update: ";
2854  using Kokkos::subview;
2855  using Kokkos::ALL;
2857 
2858  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2859 
2860  const size_t lclNumRows = getLocalLength ();
2861  const size_t numVecs = getNumVectors ();
2862 
2863  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2864  lclNumRows != A.getLocalLength (), std::invalid_argument,
2865  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2866  << A.getLocalLength () << ".");
2867  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2868  numVecs != A.getNumVectors (), std::invalid_argument,
2869  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2870  << A.getNumVectors () << ".");
2871 
2872  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2873  if (this->need_sync_device ()) {
2874  this->sync_device ();
2875  }
2876  if (A.need_sync_device ()) {
2877  const_cast<MV&>(A).sync_device ();
2878  }
2879 
2880  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2881  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2882  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2883  const std::pair<size_t, size_t> colRng (0, numVecs);
2884 
2885  auto Y_lcl_orig = this->getLocalViewDevice ();
2886  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2887  auto X_lcl_orig = A.getLocalViewDevice ();
2888  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2889 
2890  // The device memory of *this is about to be modified
2891  this->modify_device ();
2892  if (isConstantStride () && A.isConstantStride ()) {
2893  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2894  }
2895  else {
2896  // Make sure that Kokkos only uses the local length for add.
2897  for (size_t k = 0; k < numVecs; ++k) {
2898  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2899  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2900  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2901  auto X_k = subview (X_lcl, ALL (), X_col);
2902 
2903  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2904  }
2905  }
2906  }
2907 
2908  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2909  void
2911  update (const Scalar& alpha,
2913  const Scalar& beta,
2915  const Scalar& gamma)
2916  {
2917  using Kokkos::ALL;
2918  using Kokkos::subview;
2920 
2921  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
2922 
2923  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
2924 
2925  const size_t lclNumRows = this->getLocalLength ();
2926  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2927  lclNumRows != A.getLocalLength (), std::invalid_argument,
2928  "The input MultiVector A has " << A.getLocalLength () << " local "
2929  "row(s), but this MultiVector has " << lclNumRows << " local "
2930  "row(s).");
2931  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2932  lclNumRows != B.getLocalLength (), std::invalid_argument,
2933  "The input MultiVector B has " << B.getLocalLength () << " local "
2934  "row(s), but this MultiVector has " << lclNumRows << " local "
2935  "row(s).");
2936  const size_t numVecs = getNumVectors ();
2937  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2938  A.getNumVectors () != numVecs, std::invalid_argument,
2939  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
2940  "but this MultiVector has " << numVecs << " column(s).");
2941  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2942  B.getNumVectors () != numVecs, std::invalid_argument,
2943  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
2944  "but this MultiVector has " << numVecs << " column(s).");
2945 
2946  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2947  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2948  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
2949 
2950  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2951  if (this->need_sync_device ()) this->sync_device ();
2952  if (A.need_sync_device ()) const_cast<MV&>(A).sync_device ();
2953  if (B.need_sync_device ()) const_cast<MV&>(B).sync_device ();
2954 
2955  // This method modifies *this.
2956  this->modify_device ();
2957 
2958  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2959  const std::pair<size_t, size_t> colRng (0, numVecs);
2960 
2961  // Prefer 'auto' over specifying the type explicitly. This avoids
2962  // issues with a subview possibly having a different type than the
2963  // original view.
2964  auto C_lcl = subview (this->getLocalViewDevice (), rowRng, ALL ());
2965  auto A_lcl = subview (A.getLocalViewDevice (), rowRng, ALL ());
2966  auto B_lcl = subview (B.getLocalViewDevice (), rowRng, ALL ());
2967 
2968  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
2969  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
2970  }
2971  else {
2972  // Some input (or *this) is not constant stride,
2973  // so perform the update one column at a time.
2974  for (size_t k = 0; k < numVecs; ++k) {
2975  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2976  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
2977  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
2978  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
2979  theBeta, subview (B_lcl, rowRng, B_col),
2980  theGamma, subview (C_lcl, rowRng, this_col));
2981  }
2982  }
2983  }
2984 
2985  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2986  Teuchos::ArrayRCP<const Scalar>
2988  getData (size_t j) const
2989  {
2990  using Kokkos::ALL;
2992  using IST = impl_scalar_type;
2993  const char tfecfFuncName[] = "getData: ";
2994 
2995  // Any MultiVector method that called the (classic) Kokkos Node's
2996  // viewBuffer or viewBufferNonConst methods always implied a
2997  // device->host synchronization. Thus, we synchronize here as
2998  // well.
2999  const_cast<MV*> (this)->sync_host ();
3000 
3001  auto hostView = getLocalViewHost ();
3002  const size_t col = isConstantStride () ? j : whichVectors_[j];
3003  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3004  Teuchos::ArrayRCP<const IST> dataAsArcp =
3005  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3006 
3007  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3008  (static_cast<size_t> (hostView_j.extent (0)) <
3009  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3010  "hostView_j.extent(0) = " << hostView_j.extent (0)
3011  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3012  "Please report this bug to the Tpetra developers.");
3013 
3014  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3015  }
3016 
3017  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3018  Teuchos::ArrayRCP<Scalar>
3020  getDataNonConst (size_t j)
3021  {
3022  using Kokkos::ALL;
3023  using Kokkos::subview;
3025  using IST = impl_scalar_type;
3026  const char tfecfFuncName[] = "getDataNonConst: ";
3027 
3028  // Any MultiVector method that called the (classic) Kokkos Node's
3029  // viewBuffer or viewBufferNonConst methods always implied a
3030  // device->host synchronization. Thus, we synchronize here as
3031  // well.
3032  const_cast<MV*> (this)->sync_host ();
3033  // Calling getDataNonConst() implies that the user plans to modify
3034  // the values in the MultiVector, so we mark the host data as modified.
3035  modify_host ();
3036 
3037  auto hostView = getLocalViewHost ();
3038  const size_t col = isConstantStride () ? j : whichVectors_[j];
3039  auto hostView_j = subview (hostView, ALL (), col);
3040  Teuchos::ArrayRCP<IST> dataAsArcp =
3041  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3042 
3043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3044  (static_cast<size_t> (hostView_j.extent (0)) <
3045  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3046  "hostView_j.extent(0) = " << hostView_j.extent (0)
3047  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3048  "Please report this bug to the Tpetra developers.");
3049 
3050  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3051  }
3052 
3053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3054  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3056  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3057  {
3058  using Teuchos::RCP;
3060 
3061  // Check whether the index set in cols is contiguous. If it is,
3062  // use the more efficient Range1D version of subCopy.
3063  bool contiguous = true;
3064  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3065  for (size_t j = 1; j < numCopyVecs; ++j) {
3066  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3067  contiguous = false;
3068  break;
3069  }
3070  }
3071  if (contiguous && numCopyVecs > 0) {
3072  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3073  }
3074  else {
3075  RCP<const MV> X_sub = this->subView (cols);
3076  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3077  Y->assign (*X_sub);
3078  return Y;
3079  }
3080  }
3081 
3082  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3083  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3085  subCopy (const Teuchos::Range1D &colRng) const
3086  {
3087  using Teuchos::RCP;
3089 
3090  RCP<const MV> X_sub = this->subView (colRng);
3091  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3092  Y->assign (*X_sub);
3093  return Y;
3094  }
3095 
3096  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3097  size_t
3099  getOrigNumLocalRows () const {
3100  return origView_.extent (0);
3101  }
3102 
3103  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3104  size_t
3106  getOrigNumLocalCols () const {
3107  return origView_.extent (1);
3108  }
3109 
3110  template <class Scalar, class LO, class GO, class Node>
3113  const Teuchos::RCP<const map_type>& subMap,
3114  const local_ordinal_type rowOffset) :
3115  base_type (subMap)
3116  {
3117  using Kokkos::ALL;
3118  using Kokkos::subview;
3119  using Teuchos::outArg;
3120  using Teuchos::RCP;
3121  using Teuchos::rcp;
3122  using Teuchos::reduceAll;
3123  using Teuchos::REDUCE_MIN;
3124  using std::endl;
3126  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3127  const char suffix[] = "Please report this bug to the Tpetra developers.";
3128  int lclGood = 1;
3129  int gblGood = 1;
3130  std::unique_ptr<std::ostringstream> errStrm;
3131  const bool debug = ::Tpetra::Details::Behavior::debug ();
3132  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3133 
3134  // Be careful to use the input Map's communicator, not X's. The
3135  // idea is that, on return, *this is a subview of X, using the
3136  // input Map.
3137  const auto comm = subMap->getComm ();
3138  TEUCHOS_ASSERT( ! comm.is_null () );
3139  const int myRank = comm->getRank ();
3140 
3141  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3142  const LO numCols = static_cast<LO> (X.getNumVectors ());
3143  const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3144  if (verbose) {
3145  std::ostringstream os;
3146  os << "Proc " << myRank << ": " << prefix
3147  << "X: {lclNumRows: " << lclNumRowsBefore
3148  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3149  << ", numCols: " << numCols << "}, "
3150  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3151  std::cerr << os.str ();
3152  }
3153  // We ask for the _original_ number of rows in X, because X could
3154  // be a shorter (fewer rows) view of a longer MV. (For example, X
3155  // could be a domain Map view of a column Map MV.)
3156  const bool tooManyElts =
3157  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3158  if (tooManyElts) {
3159  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3160  *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3161  << newNumRows << ") + rowOffset (=" << rowOffset
3162  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3163  << ")." << endl;
3164  lclGood = 0;
3165  TEUCHOS_TEST_FOR_EXCEPTION
3166  (! debug && tooManyElts, std::invalid_argument,
3167  prefix << errStrm->str () << suffix);
3168  }
3169 
3170  if (debug) {
3171  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3172  if (gblGood != 1) {
3173  std::ostringstream gblErrStrm;
3174  const std::string myErrStr =
3175  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3176  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3177  TEUCHOS_TEST_FOR_EXCEPTION
3178  (true, std::invalid_argument, gblErrStrm.str ());
3179  }
3180  }
3181 
3182  using range_type = std::pair<LO, LO>;
3183  const range_type origRowRng
3184  (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3185  const range_type rowRng
3186  (rowOffset, rowOffset + newNumRows);
3187 
3188  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3189  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3190  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3191  // ability to create domain Map views of column Map MultiVectors,
3192  // and then get the original column Map MultiVector out again).
3193  // If we just use X.origView_ here, it breaks the fix for #46.
3194  // The test for rowOffset == 0 is a hack that makes both tests
3195  // pass, but doesn't actually fix the more general issue. In
3196  // particular, the right way to fix Gauss-Seidel would be to fix
3197  // #385; that would make "getting the original column Map
3198  // MultiVector out again" unnecessary.
3199  dual_view_type newView =
3200  subview (rowOffset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3201 
3202  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3203  // handling subviews of degenerate Views quite so well. For some
3204  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3205  // 0. We work around by creating a new empty DualView of the
3206  // desired (degenerate) dimensions.
3207  if (newOrigView.extent (0) == 0 &&
3208  newOrigView.extent (1) != X.origView_.extent (1)) {
3209  newOrigView =
3210  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3211  }
3212  if (newView.extent (0) == 0 &&
3213  newView.extent (1) != X.view_.extent (1)) {
3214  newView =
3215  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3216  }
3217 
3218  MV subViewMV = X.isConstantStride () ?
3219  MV (subMap, newView, newOrigView) :
3220  MV (subMap, newView, newOrigView, X.whichVectors_ ());
3221 
3222  if (debug) {
3223  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3224  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3225  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3226  lclGood = 0;
3227  if (errStrm.get () == nullptr) {
3228  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3229  }
3230  *errStrm << " Proc " << myRank <<
3231  ": subMap.getNodeNumElements(): " << newNumRows <<
3232  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3233  ", X.getNumVectors(): " << numCols <<
3234  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3235  }
3236  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3237  if (gblGood != 1) {
3238  std::ostringstream gblErrStrm;
3239  if (myRank == 0) {
3240  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3241  "dimensions on one or more processes:" << endl;
3242  }
3243  const std::string myErrStr =
3244  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3245  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3246  gblErrStrm << suffix << endl;
3247  TEUCHOS_TEST_FOR_EXCEPTION
3248  (true, std::invalid_argument, gblErrStrm.str ());
3249  }
3250  }
3251 
3252  if (verbose) {
3253  std::ostringstream os;
3254  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3255  std::cerr << os.str ();
3256  }
3257 
3258  *this = subViewMV; // shallow copy
3259 
3260  if (verbose) {
3261  std::ostringstream os;
3262  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3263  std::cerr << os.str ();
3264  }
3265  }
3266 
3267  template <class Scalar, class LO, class GO, class Node>
3269  MultiVector (const MultiVector<Scalar, LO, GO, Node>& X,
3270  const map_type& subMap,
3271  const size_t rowOffset) :
3272  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3273  static_cast<local_ordinal_type> (rowOffset))
3274  {}
3275 
3276  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3279  offsetView (const Teuchos::RCP<const map_type>& subMap,
3280  const size_t offset) const
3281  {
3283  return Teuchos::rcp (new MV (*this, *subMap, offset));
3284  }
3285 
3286  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3287  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3289  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3290  const size_t offset)
3291  {
3293  return Teuchos::rcp (new MV (*this, *subMap, offset));
3294  }
3295 
3296  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3297  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3299  subView (const Teuchos::ArrayView<const size_t>& cols) const
3300  {
3301  using Teuchos::Array;
3302  using Teuchos::rcp;
3304 
3305  const size_t numViewCols = static_cast<size_t> (cols.size ());
3306  TEUCHOS_TEST_FOR_EXCEPTION(
3307  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3308  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3309  "contain at least one entry, but cols.size() = " << cols.size ()
3310  << " == 0.");
3311 
3312  // Check whether the index set in cols is contiguous. If it is,
3313  // use the more efficient Range1D version of subView.
3314  bool contiguous = true;
3315  for (size_t j = 1; j < numViewCols; ++j) {
3316  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3317  contiguous = false;
3318  break;
3319  }
3320  }
3321  if (contiguous) {
3322  if (numViewCols == 0) {
3323  // The output MV has no columns, so there is nothing to view.
3324  return rcp (new MV (this->getMap (), numViewCols));
3325  } else {
3326  // Use the more efficient contiguous-index-range version.
3327  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3328  }
3329  }
3330 
3331  if (isConstantStride ()) {
3332  return rcp (new MV (this->getMap (), view_, origView_, cols));
3333  }
3334  else {
3335  Array<size_t> newcols (cols.size ());
3336  for (size_t j = 0; j < numViewCols; ++j) {
3337  newcols[j] = whichVectors_[cols[j]];
3338  }
3339  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3340  }
3341  }
3342 
3343 
// Return a const view of the contiguous column range colRng of this
// MultiVector. Handles four cases: empty range, constant-stride
// source, single-column view of a non-constant-stride source (result
// is constant stride), and a general non-constant-stride sub-range.
// In debug mode, a collective check verifies the result on all
// processes.
3344  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3345  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3347  subView (const Teuchos::Range1D& colRng) const
3348  {
3349  using ::Tpetra::Details::Behavior;
3350  using Kokkos::ALL;
3351  using Kokkos::subview;
3352  using Teuchos::Array;
3353  using Teuchos::RCP;
3354  using Teuchos::rcp;
3356  const char tfecfFuncName[] = "subView(Range1D): ";
3357 
3358  const size_t lclNumRows = this->getLocalLength ();
3359  const size_t numVecs = this->getNumVectors ();
3360  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3361  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3362  // "at least one vector.");
3363  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3364  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3365  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3366  << numVecs << ".");
3367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3368  numVecs != 0 && colRng.size () != 0 &&
3369  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3370  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3371  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3372  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3373  "[0, " << numVecs << "].");
3374 
3375  RCP<const MV> X_ret; // the MultiVector subview to return
3376 
3377  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3378  // broken for the case of views with zero rows. I will brutally
3379  // enforce that the subview has the correct dimensions. In
3380  // particular, in the case of zero rows, I will, if necessary,
3381  // create a new dual_view_type with zero rows and the correct
3382  // number of columns. In a debug build, I will use an all-reduce
3383  // to ensure that it has the correct dimensions on all processes.
3384 
3385  const std::pair<size_t, size_t> rows (0, lclNumRows);
3386  if (colRng.size () == 0) {
// Empty column range: view zero columns of the stored data.
3387  const std::pair<size_t, size_t> cols (0, 0); // empty range
3388  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3389  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3390  }
3391  else {
3392  // Returned MultiVector is constant stride only if *this is.
3393  if (isConstantStride ()) {
3394  const std::pair<size_t, size_t> cols (colRng.lbound (),
3395  colRng.ubound () + 1);
3396  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3397  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3398  }
3399  else {
3400  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3401  // We're only asking for one column, so the result does have
3402  // constant stride, even though this MultiVector does not.
3403  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3404  whichVectors_[0] + colRng.ubound () + 1);
3405  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3406  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3407  }
3408  else {
// General non-constant-stride case: restrict the whichVectors_
// indirection array to the requested sub-range.
3409  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3410  whichVectors_.begin () + colRng.ubound () + 1);
3411  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3412  }
3413  }
3414  }
3415 
3416  const bool debug = Behavior::debug ();
3417  if (debug) {
3418  using Teuchos::Comm;
3419  using Teuchos::outArg;
3420  using Teuchos::REDUCE_MIN;
3421  using Teuchos::reduceAll;
3422 
3423  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3424  Teuchos::null : this->getMap ()->getComm ();
3425  if (! comm.is_null ()) {
3426  int lclSuccess = 1;
3427  int gblSuccess = 1;
3428 
3429  if (X_ret.is_null ()) {
3430  lclSuccess = 0;
3431  }
3432  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
// NOTE(review): both exception tests below check lclSuccess rather
// than gblSuccess. That means only the processes that failed locally
// throw, even though the all-reduce just computed a global verdict;
// processes that pass locally will continue (and may hang at the next
// collective). Presumably gblSuccess != 1 was intended — confirm with
// the Tpetra developers.
3433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3434  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3435  "MultiVector; the return value of this method) is null on some MPI "
3436  "process in this MultiVector's communicator. This should never "
3437  "happen. Please report this bug to the Tpetra developers.");
3438  if (! X_ret.is_null () &&
3439  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3440  lclSuccess = 0;
3441  }
3442  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3443  outArg (gblSuccess));
3444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3445  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3446  "colRng.size(), on at least one MPI process in this MultiVector's "
3447  "communicator. This should never happen. "
3448  "Please report this bug to the Tpetra developers.");
3449  }
3450  }
3451  return X_ret;
3452  }
3453 
3454 
// Nonconst variant of subView(ArrayView). Reuses the const overload
// and casts away const on the result; this is safe because *this is
// nonconst here, so the underlying data is legitimately writable.
3455  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3456  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3458  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3459  {
3461  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3462  }
3463 
3464 
// Nonconst variant of subView(Range1D). Same const_cast rationale as
// the ArrayView overload above: *this is nonconst, so exposing
// writable access to the viewed columns is legitimate.
3465  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3466  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3468  subViewNonConst (const Teuchos::Range1D &colRng)
3469  {
3471  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3472  }
3473 
3474 
// Single-column view constructor: create a (Multi)Vector that views
// column j of X, sharing X's Map and data. Throws
// std::invalid_argument if j is out of range.
3475  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3478  const size_t j)
3479  : base_type (X.getMap ())
3480  {
3481  using Kokkos::subview;
3482  typedef std::pair<size_t, size_t> range_type;
3483  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3484 
3485  const size_t numCols = X.getNumVectors ();
3486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3487  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3488  << ") exceeds valid column index range [0, " << numCols << " - 1].");
// Map the logical column index j to the physical column jj of the
// stored view; they differ when X does not have constant stride.
3489  const size_t jj = X.isConstantStride () ?
3490  static_cast<size_t> (j) :
3491  static_cast<size_t> (X.whichVectors_[j]);
3492  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3493  this->origView_ = X.origView_;
3494 
3495  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3496  // Export or Import operations with different subviews of a
3497  // MultiVector. Thus, it is safe to reuse communication buffers.
3498  // See #1560 discussion.
3499  //
3500  // We only need one column's worth of buffer for imports_ and
3501  // exports_. Taking subviews now ensures that their lengths will
3502  // be exactly what we need, so we won't have to resize them later.
3503  {
3504  const size_t newSize = X.imports_.extent (0) / numCols;
3505  auto newImports = X.imports_;
3506  newImports.d_view = subview (X.imports_.d_view, range_type (0, newSize));
// NOTE(review): newImports is a local; no assignment of it back to
// this->imports_ is visible in this scope — TODO confirm the
// assignment exists (it may be in code omitted from this excerpt),
// otherwise this block has no effect.
3507  newImports.h_view = subview (X.imports_.h_view, range_type (0, newSize));
3508  }
3509  {
3510  const size_t newSize = X.exports_.extent (0) / numCols;
3511  auto newExports = X.exports_;
3512  newExports.d_view = subview (X.exports_.d_view, range_type (0, newSize));
// NOTE(review): same concern as for newImports above — confirm
// newExports is actually stored into this->exports_.
3513  newExports.h_view = subview (X.exports_.h_view, range_type (0, newSize));
3514  }
3515  // These two DualViews already either have the right number of
3516  // entries, or zero entries. This means that we don't need to
3517  // resize them.
3520  }
3521 
3522 
// Return a const Vector that views column j of this MultiVector.
// Delegates to the single-column view constructor, which validates j.
3523  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3526  getVector (const size_t j) const
3527  {
3529  return Teuchos::rcp (new V (*this, j));
3530  }
3531 
3532 
// Nonconst variant of getVector: return a writable Vector viewing
// column j of this MultiVector.
3533  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3534  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3536  getVectorNonConst (const size_t j)
3537  {
3539  return Teuchos::rcp (new V (*this, j));
3540  }
3541 
3542 
// Copy this MultiVector's local data into the caller-provided array A
// in column-major (LayoutLeft) order with leading dimension LDA.
// Throws std::runtime_error if LDA < local row count or if A is too
// small. Copies from whichever side (host or device) holds the most
// recent data, to avoid a sync that callers might not expect.
3543  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3544  void
3546  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3547  {
3548  using dev_view_type = typename dual_view_type::t_dev;
3549  using host_view_type = typename dual_view_type::t_host;
3550  using IST = impl_scalar_type;
3551  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3552  Kokkos::HostSpace,
3553  Kokkos::MemoryUnmanaged>;
3554  const char tfecfFuncName[] = "get1dCopy: ";
3555 
3556  const size_t numRows = this->getLocalLength ();
3557  const size_t numCols = this->getNumVectors ();
3558 
3559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3560  (LDA < numRows, std::runtime_error,
3561  "LDA = " << LDA << " < numRows = " << numRows << ".");
// A must hold numCols columns of numRows entries at stride LDA; the
// last column only needs numRows entries, hence LDA*(numCols-1)+numRows.
3562  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3563  (numRows > size_t (0) && numCols > size_t (0) &&
3564  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3565  std::runtime_error,
3566  "A.size() = " << A.size () << ", but its size must be at least "
3567  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3568 
3569  const std::pair<size_t, size_t> rowRange (0, numRows);
3570  const std::pair<size_t, size_t> colRange (0, numCols);
3571 
// Wrap the caller's raw storage in an unmanaged host View (LDA x
// numCols), then restrict it to the numRows x numCols destination.
3572  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3573  LDA, numCols);
3574  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3575 
3576  // Use the most recently updated version of this MultiVector's
3577  // data. This avoids sync'ing, which could violate users'
3578  // expectations.
3579  //
3580  // If we need sync to device, then host has the most recent version.
3581  const bool useHostVersion = this->need_sync_device ();
3582 
3583  dev_view_type srcView_dev;
3584  host_view_type srcView_host;
3585  if (useHostVersion) {
3586  srcView_host = this->getLocalViewHost ();
3587  }
3588  else {
3589  srcView_dev = this->getLocalViewDevice ();
3590  }
3591 
3592  if (this->isConstantStride ()) {
// Constant stride: one deep_copy moves everything at once (may cross
// memory spaces when copying from the device view).
3593  if (useHostVersion) {
3594  Kokkos::deep_copy (A_view, srcView_host);
3595  }
3596  else {
3597  Kokkos::deep_copy (A_view, srcView_dev);
3598  }
3599  }
3600  else {
// Non-constant stride: copy column by column, mapping each logical
// column j through whichVectors_ to the physical source column.
3601  for (size_t j = 0; j < numCols; ++j) {
3602  const size_t srcCol = this->whichVectors_[j];
3603  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3604 
3605  if (useHostVersion) {
3606  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3607  Kokkos::deep_copy (dstColView, srcColView_host);
3608  }
3609  else {
3610  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
3611  Kokkos::deep_copy (dstColView, srcColView_dev);
3612  }
3613  }
3614  }
3615  }
3616 
3617 
// Copy each column of this MultiVector into the corresponding array
// of ArrayOfPtrs (one array per column). Validates all array lengths
// before copying anything, so a failed call has no side effects.
3618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3619  void
3621  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3622  {
3624  const char tfecfFuncName[] = "get2dCopy: ";
3625  const size_t numRows = this->getLocalLength ();
3626  const size_t numCols = this->getNumVectors ();
3627 
3628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3629  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3630  std::runtime_error, "Input array of pointers must contain as many "
3631  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3632  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3633 
3634  if (numRows != 0 && numCols != 0) {
3635  // No side effects until we've validated the input.
3636  for (size_t j = 0; j < numCols; ++j) {
3637  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3639  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3640  "the input array of arrays is not long enough to fit all entries in "
3641  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3642  << " < getLocalLength() = " << numRows << ".");
3643  }
3644 
3645  // We've validated the input, so it's safe to start copying.
// Each column is copied via a single-column view and get1dCopy; the
// destination array's own length serves as the leading dimension.
3646  for (size_t j = 0; j < numCols; ++j) {
3647  Teuchos::RCP<const V> X_j = this->getVector (j);
3648  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3649  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3650  }
3651  }
3652  }
3653 
// File-local helper: bring X's data up to date on host (if the device
// copy is newer), optionally mark the host copy as modified, and
// return the host view. Used by the get?dView* accessors below.
3654  namespace { // (anonymous)
3655  template <class SC, class LO, class GO, class NT>
3657  syncMVToHostIfNeededAndGetHostView (MultiVector<SC, LO, GO, NT>& X,
3658  const bool markModified)
3659  {
3660  // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
3661  // (replace ? with 1 or 2) have always been device->host
3662  // synchronization points, since <= 2012. We retain this
3663  // behavior for backwards compatibility.
3664  if (X.need_sync_host ()) {
3665  X.sync_host ();
3666  }
// markModified is true for the nonconst accessors, which hand out
// writable references and so must flag the host copy as changed.
3667  if (markModified) {
3668  X.modify_host ();
3669  }
3670  return X.getLocalViewHost ();
3671  }
3672  } // namespace (anonymous)
3673 
3674 
// Return a persisting host view of this MultiVector's local data as a
// single flat array. Requires constant stride; returns null for a
// MultiVector with no local rows or no columns. Syncs device->host
// if needed (historical behavior of this accessor).
3675  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3676  Teuchos::ArrayRCP<const Scalar>
3678  get1dView () const
3679  {
3680  if (getLocalLength () == 0 || getNumVectors () == 0) {
3681  return Teuchos::null;
3682  } else {
3683  TEUCHOS_TEST_FOR_EXCEPTION(
3684  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3685  "get1dView: This MultiVector does not have constant stride, so it is "
3686  "not possible to view its data as a single array. You may check "
3687  "whether a MultiVector has constant stride by calling "
3688  "isConstantStride().");
3689  // Since get1dView() is and was always marked const, I have to
3690  // cast away const here in order not to break backwards
3691  // compatibility.
3692  constexpr bool markModified = false;
3694  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3695  markModified);
// persistingView keeps the Kokkos::View alive for the lifetime of the
// returned ArrayRCP; the reinterpret cast converts impl_scalar_type
// back to the user-facing Scalar type.
3696  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3697  Kokkos::Compat::persistingView (X_lcl);
3698  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3699  }
3700  }
3701 
// Nonconst variant of get1dView: returns a writable flat host view of
// the local data. Requires constant stride; returns null when there
// are no local rows or no columns. Marks the host copy as modified,
// since callers may write through the returned array.
3702  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3703  Teuchos::ArrayRCP<Scalar>
3706  {
3707  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3708  return Teuchos::null;
3709  }
3710  else {
3711  TEUCHOS_TEST_FOR_EXCEPTION
3712  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3713  "get1dViewNonConst: This MultiVector does not have constant stride, "
3714  "so it is not possible to view its data as a single array. You may "
3715  "check whether a MultiVector has constant stride by calling "
3716  "isConstantStride().");
3717  constexpr bool markModified = true;
3718  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
// The returned ArrayRCP keeps the host view alive (persistingView)
// and reinterprets impl_scalar_type as the user-facing Scalar.
3719  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3720  Kokkos::Compat::persistingView (X_lcl);
3721  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3722  }
3723  }
3724 
// Return one writable persisting host array per column of this
// MultiVector. Works for both constant- and non-constant-stride
// storage (the latter via whichVectors_). Marks the host copy as
// modified, since callers may write through the returned arrays.
3725  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3726  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3729  {
3730  constexpr bool markModified = true;
3731  auto X_lcl = syncMVToHostIfNeededAndGetHostView (*this, markModified);
3732 
3733  // Don't use the row range here on the outside, in order to avoid
3734  // a strided return type (in case Kokkos::subview is conservative
3735  // about that). Instead, use the row range for the column views
3736  // in the loop.
3737  const size_t myNumRows = this->getLocalLength ();
3738  const size_t numCols = this->getNumVectors ();
3739  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3740 
3741  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3742  for (size_t j = 0; j < numCols; ++j) {
// Map the logical column j to its physical column, then wrap a
// persisting view of that column for the caller.
3743  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3744  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3745  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3746  Kokkos::Compat::persistingView (X_lcl_j);
3747  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3748  }
3749  return views;
3750  }
3751 
// Const variant of get2dViewNonConst: one read-only persisting host
// array per column. Does not mark the host copy modified, but still
// syncs device->host if needed (historical accessor behavior).
3752  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3753  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3755  get2dView () const
3756  {
3757  // Since get2dView() is and was always marked const, I have to
3758  // cast away const here in order not to break backwards
3759  // compatibility.
3760  constexpr bool markModified = false;
3762  auto X_lcl = syncMVToHostIfNeededAndGetHostView (const_cast<MV&> (*this),
3763  markModified);
3764  // Don't use the row range here on the outside, in order to avoid
3765  // a strided return type (in case Kokkos::subview is conservative
3766  // about that). Instead, use the row range for the column views
3767  // in the loop.
3768  const size_t myNumRows = this->getLocalLength ();
3769  const size_t numCols = this->getNumVectors ();
3770  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3771 
3772  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3773  for (size_t j = 0; j < numCols; ++j) {
// Map the logical column j through whichVectors_ when the storage is
// not constant stride.
3774  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3775  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3776  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3777  Kokkos::Compat::persistingView (X_lcl_j);
3778  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3779  }
3780  return views;
3781  }
3782 
// GEMM-style matrix-matrix multiply, treating MultiVectors as dense
// 2-D matrices: *this (C) = beta*C + alpha*op(A)*op(B), where op is
// identity, transpose, or conjugate transpose per transA/transB.
// Only three distribution cases are supported (see the table below);
// anything else throws std::runtime_error. Non-constant-stride
// arguments are deep-copied to constant-stride temporaries so the
// local computation can use KokkosBlas::gemm.
3783  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3784  void
3786  multiply (Teuchos::ETransp transA,
3787  Teuchos::ETransp transB,
3788  const Scalar& alpha,
3791  const Scalar& beta)
3792  {
3793  using ::Tpetra::Details::ProfilingRegion;
3794  using Teuchos::CONJ_TRANS;
3795  using Teuchos::NO_TRANS;
3796  using Teuchos::TRANS;
3797  using Teuchos::RCP;
3798  using Teuchos::rcp;
3799  using Teuchos::rcpFromRef;
3800  using std::endl;
3801  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3802  using LO = local_ordinal_type;
3803  using STS = Teuchos::ScalarTraits<Scalar>;
3805  const char tfecfFuncName[] = "multiply: ";
3806  ProfilingRegion region ("Tpetra::MV::multiply");
3807 
3808  // This routine performs a variety of matrix-matrix multiply
3809  // operations, interpreting the MultiVector (this-aka C , A and B)
3810  // as 2D matrices. Variations are due to the fact that A, B and C
3811  // can be locally replicated or globally distributed MultiVectors
3812  // and that we may or may not operate with the transpose of A and
3813  // B. Possible cases are:
3814  //
3815  // Operations # Cases Notes
3816  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3817  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3818  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3819  //
3820  // The following operations are not meaningful for 1-D
3821  // distributions:
3822  //
3823  // u1) C(local) = A^T(distr) * B^T(distr) 1
3824  // u2) C(local) = A (distr) * B^X(distr) 2
3825  // u3) C(distr) = A^X(local) * B^X(local) 4
3826  // u4) C(distr) = A^X(local) * B^X(distr) 4
3827  // u5) C(distr) = A^T(distr) * B^X(local) 2
3828  // u6) C(local) = A^X(distr) * B^X(local) 4
3829  // u7) C(distr) = A^X(distr) * B^X(local) 4
3830  // u8) C(local) = A^X(local) * B^X(distr) 4
3831  //
3832  // Total number of cases: 32 (= 2^5).
3833 
3834  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
3835 
3836  const bool A_is_local = ! A.isDistributed ();
3837  const bool B_is_local = ! B.isDistributed ();
3838  const bool C_is_local = ! this->isDistributed ();
3839 
3840  // In debug mode, check compatibility of local dimensions. We
3841  // only do this in debug mode, since it requires an all-reduce
3842  // to ensure correctness on all processses. It's entirely
3843  // possible that only some processes may have incompatible local
3844  // dimensions. Throwing an exception only on those processes
3845  // could cause this method to hang.
3846  const bool debug = ::Tpetra::Details::Behavior::debug ();
3847  if (debug) {
3848  auto myMap = this->getMap ();
3849  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
3850  using Teuchos::REDUCE_MIN;
3851  using Teuchos::reduceAll;
3852  using Teuchos::outArg;
3853 
// Effective (post-op) local dimensions of A and B: a transpose swaps
// the roles of local length and number of vectors.
3854  auto comm = myMap->getComm ();
3855  const size_t A_nrows =
3856  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
3857  const size_t A_ncols =
3858  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
3859  const size_t B_nrows =
3860  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
3861  const size_t B_ncols =
3862  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
3863 
3864  const bool lclBad = this->getLocalLength () != A_nrows ||
3865  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
3866 
3867  const int myRank = comm->getRank ();
3868  std::ostringstream errStrm;
3869  if (this->getLocalLength () != A_nrows) {
3870  errStrm << "Proc " << myRank << ": this->getLocalLength()="
3871  << this->getLocalLength () << " != A_nrows=" << A_nrows
3872  << "." << std::endl;
3873  }
3874  if (this->getNumVectors () != B_ncols) {
3875  errStrm << "Proc " << myRank << ": this->getNumVectors()="
3876  << this->getNumVectors () << " != B_ncols=" << B_ncols
3877  << "." << std::endl;
3878  }
3879  if (A_ncols != B_nrows) {
3880  errStrm << "Proc " << myRank << ": A_ncols="
3881  << A_ncols << " != B_nrows=" << B_nrows
3882  << "." << std::endl;
3883  }
3884 
// All processes agree on failure before throwing, so this collective
// error path cannot hang.
3885  const int lclGood = lclBad ? 0 : 1;
3886  int gblGood = 0;
3887  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3888  if (gblGood != 1) {
3889  std::ostringstream os;
3890  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
3891 
3892  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3893  (true, std::runtime_error, "Inconsistent local dimensions on at "
3894  "least one process in this object's communicator." << std::endl
3895  << "Operation: "
3896  << "C(" << (C_is_local ? "local" : "distr") << ") = "
3897  << alpha << "*A"
3898  << (transA == Teuchos::TRANS ? "^T" :
3899  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3900  << "(" << (A_is_local ? "local" : "distr") << ") * "
3901  << beta << "*B"
// NOTE(review): the "^T"/"^H" marker printed for B just below tests
// transA, not transB — looks like a copy-paste slip in this
// diagnostic message; presumably it should use transB. Also, printing
// beta in front of "*B" misstates the operation, which is
// C = alpha*op(A)*op(B) + beta*C. Diagnostic text only; confirm and
// fix.
3902  << (transA == Teuchos::TRANS ? "^T" :
3903  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
3904  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
3905  << "Global dimensions: C(" << this->getGlobalLength () << ", "
3906  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
3907  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
3908  << ", " << B.getNumVectors () << ")." << std::endl
3909  << os.str ());
3910  }
3911  }
3912  }
3913 
3914  // Case 1: C(local) = A^X(local) * B^X(local)
3915  const bool Case1 = C_is_local && A_is_local && B_is_local;
3916  // Case 2: C(local) = A^T(distr) * B (distr)
3917  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
3918  transA != NO_TRANS &&
3919  transB == NO_TRANS;
3920  // Case 3: C(distr) = A (distr) * B^X(local)
3921  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
3922  transA == NO_TRANS;
3923 
3924  // Test that we are considering a meaningful case
3925  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3926  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
3927  "Multiplication of op(A) and op(B) into *this is not a "
3928  "supported use case.");
3929 
3930  if (beta != STS::zero () && Case2) {
3931  // If Case2, then C is local and contributions must be summed
3932  // across all processes. However, if beta != 0, then accumulate
3933  // beta*C into the sum. When summing across all processes, we
3934  // only want to accumulate this once, so set beta == 0 on all
3935  // processes except Process 0.
3936  const int myRank = this->getMap ()->getComm ()->getRank ();
3937  if (myRank != 0) {
3938  beta_local = ATS::zero ();
3939  }
3940  }
3941 
3942  // We only know how to do matrix-matrix multiplies if all the
3943  // MultiVectors have constant stride. If not, we have to make
3944  // temporary copies of those MultiVectors (including possibly
3945  // *this) that don't have constant stride.
3946  RCP<MV> C_tmp;
3947  if (! isConstantStride ()) {
3948  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
3949  } else {
// Non-owning RCP: C_tmp aliases *this, no copy needed.
3950  C_tmp = rcp (this, false);
3951  }
3952 
3953  RCP<const MV> A_tmp;
3954  if (! A.isConstantStride ()) {
3955  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
3956  } else {
3957  A_tmp = rcpFromRef (A);
3958  }
3959 
3960  RCP<const MV> B_tmp;
3961  if (! B.isConstantStride ()) {
3962  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
3963  } else {
3964  B_tmp = rcpFromRef (B);
3965  }
3966 
3967  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3968  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
3969  ! A_tmp->isConstantStride (), std::logic_error,
3970  "Failed to make temporary constant-stride copies of MultiVectors.");
3971 
// Local GEMM: sync all three operands to device, take exact-sized
// subviews, and call KokkosBlas::gemm on the device views.
3972  {
3973  if (A_tmp->need_sync_device ()) {
3974  const_cast<MV&> (*A_tmp).sync_device (); // const is a lie
3975  }
3976  const LO A_lclNumRows = A_tmp->getLocalLength ();
3977  const LO A_numVecs = A_tmp->getNumVectors ();
3978  auto A_lcl = A_tmp->getLocalViewDevice ();
3979  auto A_sub = Kokkos::subview (A_lcl,
3980  std::make_pair (LO (0), A_lclNumRows),
3981  std::make_pair (LO (0), A_numVecs));
3982 
3983  if (B_tmp->need_sync_device ()) {
3984  const_cast<MV&> (*B_tmp).sync_device (); // const is a lie
3985  }
3986  const LO B_lclNumRows = B_tmp->getLocalLength ();
3987  const LO B_numVecs = B_tmp->getNumVectors ();
3988  auto B_lcl = B_tmp->getLocalViewDevice ();
3989  auto B_sub = Kokkos::subview (B_lcl,
3990  std::make_pair (LO (0), B_lclNumRows),
3991  std::make_pair (LO (0), B_numVecs));
3992 
3993  if (C_tmp->need_sync_device ()) {
3994  const_cast<MV&> (*C_tmp).sync_device (); // const is a lie
3995  }
3996  const LO C_lclNumRows = C_tmp->getLocalLength ();
3997  const LO C_numVecs = C_tmp->getNumVectors ();
3998  auto C_lcl = C_tmp->getLocalViewDevice ();
3999  auto C_sub = Kokkos::subview (C_lcl,
4000  std::make_pair (LO (0), C_lclNumRows),
4001  std::make_pair (LO (0), C_numVecs));
4002  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4003  (transA == Teuchos::TRANS ? 'T' : 'C'));
4004  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4005  (transB == Teuchos::TRANS ? 'T' : 'C'));
4006  const impl_scalar_type alpha_IST (alpha);
4007 
4008  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4009  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4010  beta_local, C_sub);
4011  }
4012 
4013  if (! isConstantStride ()) {
4014  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4015  }
4016 
4017  // Dispose of (possibly) extra copies of A and B.
4018  A_tmp = Teuchos::null;
4019  B_tmp = Teuchos::null;
4020 
4021  // If Case 2 then sum up *this and distribute it to all processes.
4022  if (Case2) {
4023  this->reduce ();
4024  }
4025  }
4026 
// Element-wise multiply: this(i,j) = scalarThis*this(i,j) +
// scalarAB * A(i) * B(i,j), where A is a single-column Vector scaled
// against every column of B. Runs on device (syncing operands there
// first, per Tpetra policy for non-unary kernels) via
// KokkosBlas::mult.
4027  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4028  void
4030  elementWiseMultiply (Scalar scalarAB,
4033  Scalar scalarThis)
4034  {
4035  using Kokkos::ALL;
4036  using Kokkos::subview;
4039  const char tfecfFuncName[] = "elementWiseMultiply: ";
4040 
4041  const size_t lclNumRows = this->getLocalLength ();
4042  const size_t numVecs = this->getNumVectors ();
4043 
4044  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4045  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4046  std::runtime_error, "MultiVectors do not have the same local length.");
4047  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4048  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4049  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4050  << ".");
4051 
4052  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
4053  if (this->need_sync_device ()) {
4054  this->sync_device ();
4055  }
4056  if (A.need_sync_device ()) {
4057  const_cast<V&>(A).sync_device ();
4058  }
4059  if (B.need_sync_device ()) {
4060  const_cast<MV&>(B).sync_device ();
4061  }
4062  this->modify_device ();
4063 
4064  auto this_view = this->getLocalViewDevice ();
4065  auto A_view = A.getLocalViewDevice ();
4066  auto B_view = B.getLocalViewDevice ();
4067 
4068  if (isConstantStride () && B.isConstantStride ()) {
4069  // A is just a Vector; it only has one column, so it always has
4070  // constant stride.
4071  //
4072  // If both *this and B have constant stride, we can do an
4073  // element-wise multiply on all columns at once.
4074  KokkosBlas::mult (scalarThis,
4075  this_view,
4076  scalarAB,
4077  subview (A_view, ALL (), 0),
4078  B_view);
4079  }
4080  else {
// Non-constant stride on either side: process one column at a time,
// mapping logical column j through whichVectors_ where needed.
4081  for (size_t j = 0; j < numVecs; ++j) {
4082  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4083  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4084  KokkosBlas::mult (scalarThis,
4085  subview (this_view, ALL (), C_col),
4086  scalarAB,
4087  subview (A_view, ALL (), 0),
4088  subview (B_view, ALL (), B_col));
4089  }
4090  }
4091  }
4092 
// Sum this MultiVector's local data over all processes in its Map's
// communicator (in place, via an all-reduce), so every process ends
// up with the same values. No-op when the Map or its communicator is
// null. Operates on whichever side (host or device) holds the most
// recent data, to avoid handing device buffers to MPI unnecessarily.
4093  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4094  void
4096  reduce ()
4097  {
4099  using ::Tpetra::Details::ProfilingRegion;
4100  ProfilingRegion region ("Tpetra::MV::reduce");
4101 
4102  const auto map = this->getMap ();
4103  if (map.get () == nullptr) {
4104  return;
4105  }
4106  const auto comm = map->getComm ();
4107  if (comm.get () == nullptr) {
4108  return;
4109  }
4110 
4111  // Avoid giving device buffers to MPI. Even if MPI handles them
4112  // correctly, doing so may not perform well.
4113  const bool changed_on_device = this->need_sync_host ();
4114  if (changed_on_device) {
4115  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4116  // and host will be separate allocations. In that case, it may
4117  // pay to do the all-reduce from device to host.
4118  this->modify_device ();
4119  auto X_lcl = this->getLocalViewDevice ();
4120  allReduceView (X_lcl, X_lcl, *comm);
4121  }
4122  else {
4123  this->modify_host ();
4124  auto X_lcl = this->getLocalViewHost ();
4125  allReduceView (X_lcl, X_lcl, *comm);
4126  }
4127  }
4128 
// Overwrite entry (lclRow, col) of this MultiVector with ScalarValue.
// Writes directly into the host view (view_.h_view); the caller is
// presumably responsible for having the host copy up to date and for
// marking it modified afterward — TODO confirm against the class
// documentation. Index checks run only in debug builds.
4129  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4130  void
4132  replaceLocalValue (const LocalOrdinal lclRow,
4133  const size_t col,
4134  const impl_scalar_type& ScalarValue) const
4135  {
4136 #ifdef HAVE_TPETRA_DEBUG
4137  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4138  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4139  TEUCHOS_TEST_FOR_EXCEPTION(
4140  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4141  std::runtime_error,
4142  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4143  << " is invalid. The range of valid row indices on this process "
4144  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4145  << ", " << maxLocalIndex << "].");
4146  TEUCHOS_TEST_FOR_EXCEPTION(
4147  vectorIndexOutOfRange(col),
4148  std::runtime_error,
4149  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4150  << " of the multivector is invalid.");
4151 #endif
// Map the logical column to the physical column for
// non-constant-stride storage.
4152  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4153  view_.h_view (lclRow, colInd) = ScalarValue;
4154  }
4155 
4156 
// Add value into entry (lclRow, col) of this MultiVector, optionally
// atomically (for concurrent host-side updates). Writes directly
// into the host view; same host-sync caveat as replaceLocalValue —
// TODO confirm against the class documentation. Index checks run
// only in debug builds.
4157  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4158  void
4160  sumIntoLocalValue (const LocalOrdinal lclRow,
4161  const size_t col,
4162  const impl_scalar_type& value,
4163  const bool atomic) const
4164  {
4165 #ifdef HAVE_TPETRA_DEBUG
4166  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4167  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4168  TEUCHOS_TEST_FOR_EXCEPTION(
4169  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4170  std::runtime_error,
4171  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4172  << " is invalid. The range of valid row indices on this process "
4173  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4174  << ", " << maxLocalIndex << "].");
4175  TEUCHOS_TEST_FOR_EXCEPTION(
4176  vectorIndexOutOfRange(col),
4177  std::runtime_error,
4178  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4179  << " of the multivector is invalid.");
4180 #endif
4181  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4182  if (atomic) {
// Atomic accumulate for thread-concurrent updates to the host view.
4183  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4184  }
4185  else {
4186  view_.h_view (lclRow, colInd) += value;
4187  }
4188  }
4189 
4190 
4191  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4192  void
4194  replaceGlobalValue (const GlobalOrdinal gblRow,
4195  const size_t col,
4196  const impl_scalar_type& ScalarValue) const
4197  {
4198  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4199  // touches the RCP's reference count, which isn't thread safe.
4200  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4201 #ifdef HAVE_TPETRA_DEBUG
4202  const char tfecfFuncName[] = "replaceGlobalValue: ";
4203  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4204  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4205  std::runtime_error,
4206  "Global row index " << gblRow << " is not present on this process "
4207  << this->getMap ()->getComm ()->getRank () << ".");
4208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4209  (this->vectorIndexOutOfRange (col), std::runtime_error,
4210  "Vector index " << col << " of the MultiVector is invalid.");
4211 #endif // HAVE_TPETRA_DEBUG
4212  this->replaceLocalValue (lclRow, col, ScalarValue);
4213  }
4214 
4215  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4216  void
4218  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4219  const size_t col,
4220  const impl_scalar_type& value,
4221  const bool atomic) const
4222  {
4223  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4224  // touches the RCP's reference count, which isn't thread safe.
4225  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4226 #ifdef HAVE_TEUCHOS_DEBUG
4227  TEUCHOS_TEST_FOR_EXCEPTION(
4228  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4229  std::runtime_error,
4230  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4231  << " is not present on this process "
4232  << this->getMap ()->getComm ()->getRank () << ".");
4233  TEUCHOS_TEST_FOR_EXCEPTION(
4234  vectorIndexOutOfRange(col),
4235  std::runtime_error,
4236  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4237  << " of the multivector is invalid.");
4238 #endif
4239  this->sumIntoLocalValue (lclRow, col, value, atomic);
4240  }
4241 
4242  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4243  template <class T>
4244  Teuchos::ArrayRCP<T>
4246  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4247  size_t j) const
4248  {
4249  typedef Kokkos::DualView<impl_scalar_type*,
4250  typename dual_view_type::array_layout,
4251  execution_space> col_dual_view_type;
4252  const size_t col = isConstantStride () ? j : whichVectors_[j];
4253  col_dual_view_type X_col =
4254  Kokkos::subview (view_, Kokkos::ALL (), col);
4255  return Kokkos::Compat::persistingView (X_col.d_view);
4256  }
4257 
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
  typename MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::dual_view_type
  TPETRA_DEPRECATED
  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
  getDualView () const {
    // Deprecated: return the DualView holding this MultiVector's data.
    return view_;
  }
#endif // TPETRA_ENABLE_DEPRECATED_CODE
4267 
4268  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4269  void
4271  clear_sync_state () {
4272  view_.clear_sync_state ();
4273  }
4274 
4275  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4276  void
4278  sync_host () {
4279  view_.sync_host ();
4280  }
4281 
4282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4283  void
4285  sync_device () {
4286  view_.sync_device ();
4287  }
4288 
4289  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4290  bool
4292  need_sync_host () const {
4293  return view_.need_sync_host ();
4294  }
4295 
4296  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4297  bool
4299  need_sync_device () const {
4300  return view_.need_sync_device ();
4301  }
4302 
4303  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4304  void
4306  modify_device () {
4307  view_.modify_device ();
4308  }
4309 
4310  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4311  void
4313  modify_host () {
4314  view_.modify_host ();
4315  }
4316 
4317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4320  getLocalViewDevice () const {
4321  return view_.view_device ();
4322  }
4323 
4324  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4327  getLocalViewHost () const {
4328  return view_.view_host ();
4329  }
4330 
4331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4332  std::string
4334  descriptionImpl (const std::string& className) const
4335  {
4336  using Teuchos::TypeNameTraits;
4337 
4338  std::ostringstream out;
4339  out << "\"" << className << "\": {";
4340  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4341  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4342  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4343  << ", Node" << Node::name ()
4344  << "}, ";
4345  if (this->getObjectLabel () != "") {
4346  out << "Label: \"" << this->getObjectLabel () << "\", ";
4347  }
4348  out << ", numRows: " << this->getGlobalLength ();
4349  if (className != "Tpetra::Vector") {
4350  out << ", numCols: " << this->getNumVectors ()
4351  << ", isConstantStride: " << this->isConstantStride ();
4352  }
4353  if (this->isConstantStride ()) {
4354  out << ", columnStride: " << this->getStride ();
4355  }
4356  out << "}";
4357 
4358  return out.str ();
4359  }
4360 
4361  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4362  std::string
4364  description () const
4365  {
4366  return this->descriptionImpl ("Tpetra::MultiVector");
4367  }
4368 
4369  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4370  std::string
4372  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4373  {
4374  typedef LocalOrdinal LO;
4375  using std::endl;
4376 
4377  if (vl <= Teuchos::VERB_LOW) {
4378  return std::string ();
4379  }
4380  auto map = this->getMap ();
4381  if (map.is_null ()) {
4382  return std::string ();
4383  }
4384  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4385  auto outp = Teuchos::getFancyOStream (outStringP);
4386  Teuchos::FancyOStream& out = *outp;
4387  auto comm = map->getComm ();
4388  const int myRank = comm->getRank ();
4389  const int numProcs = comm->getSize ();
4390 
4391  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4392  Teuchos::OSTab tab1 (out);
4393 
4394  // VERB_MEDIUM and higher prints getLocalLength()
4395  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4396  out << "Local number of rows: " << lclNumRows << endl;
4397 
4398  if (vl > Teuchos::VERB_MEDIUM) {
4399  // VERB_HIGH and higher prints isConstantStride() and getStride().
4400  // The first is only relevant if the Vector has multiple columns.
4401  if (this->getNumVectors () != static_cast<size_t> (1)) {
4402  out << "isConstantStride: " << this->isConstantStride () << endl;
4403  }
4404  if (this->isConstantStride ()) {
4405  out << "Column stride: " << this->getStride () << endl;
4406  }
4407 
4408  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4409  // VERB_EXTREME prints values. Get a host View of the
4410  // Vector's local data, so we can print it. (Don't assume
4411  // that we can access device data directly in host code.)
4412  typename dual_view_type::t_host X_host;
4413  if (need_sync_host ()) {
4414  // Device memory has the latest version. Don't actually
4415  // sync to host; that changes the Vector's state, and may
4416  // change future computations (that use the data's current
4417  // place to decide where to compute). Instead, create a
4418  // temporary host copy and print that.
4419  auto X_dev = getLocalViewDevice ();
4420  auto X_host_copy = Kokkos::create_mirror_view (X_dev);
4421  Kokkos::deep_copy (X_host_copy, X_dev);
4422  X_host = X_host_copy;
4423  }
4424  else {
4425  // Either host and device are in sync, or host has the
4426  // latest version of the Vector's data. Thus, we can use
4427  // the host version directly.
4428  X_host = getLocalViewHost ();
4429  }
4430  // The square braces [] and their contents are in Matlab
4431  // format, so users may copy and paste directly into Matlab.
4432  out << "Values: " << endl
4433  << "[";
4434  const LO numCols = static_cast<LO> (this->getNumVectors ());
4435  if (numCols == 1) {
4436  for (LO i = 0; i < lclNumRows; ++i) {
4437  out << X_host(i,0);
4438  if (i + 1 < lclNumRows) {
4439  out << "; ";
4440  }
4441  }
4442  }
4443  else {
4444  for (LO i = 0; i < lclNumRows; ++i) {
4445  for (LO j = 0; j < numCols; ++j) {
4446  out << X_host(i,j);
4447  if (j + 1 < numCols) {
4448  out << ", ";
4449  }
4450  }
4451  if (i + 1 < lclNumRows) {
4452  out << "; ";
4453  }
4454  }
4455  }
4456  out << "]" << endl;
4457  }
4458  }
4459 
4460  out.flush (); // make sure the ostringstream got everything
4461  return outStringP->str ();
4462  }
4463 
4464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4465  void
4467  describeImpl (Teuchos::FancyOStream& out,
4468  const std::string& className,
4469  const Teuchos::EVerbosityLevel verbLevel) const
4470  {
4471  using Teuchos::TypeNameTraits;
4472  using Teuchos::VERB_DEFAULT;
4473  using Teuchos::VERB_NONE;
4474  using Teuchos::VERB_LOW;
4475  using std::endl;
4476  const Teuchos::EVerbosityLevel vl =
4477  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4478 
4479  if (vl == VERB_NONE) {
4480  return; // don't print anything
4481  }
4482  // If this Vector's Comm is null, then the Vector does not
4483  // participate in collective operations with the other processes.
4484  // In that case, it is not even legal to call this method. The
4485  // reasonable thing to do in that case is nothing.
4486  auto map = this->getMap ();
4487  if (map.is_null ()) {
4488  return;
4489  }
4490  auto comm = map->getComm ();
4491  if (comm.is_null ()) {
4492  return;
4493  }
4494 
4495  const int myRank = comm->getRank ();
4496 
4497  // Only Process 0 should touch the output stream, but this method
4498  // in general may need to do communication. Thus, we may need to
4499  // preserve the current tab level across multiple "if (myRank ==
4500  // 0) { ... }" inner scopes. This is why we sometimes create
4501  // OSTab instances by pointer, instead of by value. We only need
4502  // to create them by pointer if the tab level must persist through
4503  // multiple inner scopes.
4504  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4505 
4506  // VERB_LOW and higher prints the equivalent of description().
4507  if (myRank == 0) {
4508  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4509  out << "\"" << className << "\":" << endl;
4510  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4511  {
4512  out << "Template parameters:" << endl;
4513  Teuchos::OSTab tab2 (out);
4514  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4515  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4516  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4517  << "Node: " << Node::name () << endl;
4518  }
4519  if (this->getObjectLabel () != "") {
4520  out << "Label: \"" << this->getObjectLabel () << "\", ";
4521  }
4522  out << "Global number of rows: " << this->getGlobalLength () << endl;
4523  if (className != "Tpetra::Vector") {
4524  out << "Number of columns: " << this->getNumVectors () << endl;
4525  }
4526  // getStride() may differ on different processes, so it (and
4527  // isConstantStride()) properly belong to per-process data.
4528  }
4529 
4530  // This is collective over the Map's communicator.
4531  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4532  const std::string lclStr = this->localDescribeToString (vl);
4533  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4534  }
4535  }
4536 
4537  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4538  void
4540  describe (Teuchos::FancyOStream &out,
4541  const Teuchos::EVerbosityLevel verbLevel) const
4542  {
4543  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4544  }
4545 
4546  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4547  void
4549  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4550  {
4551  replaceMap (newMap);
4552  }
4553 
4554  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4555  void
4558  {
4559  using ::Tpetra::Details::localDeepCopy;
4560  const char prefix[] = "Tpetra::MultiVector::assign: ";
4561 
4562  TEUCHOS_TEST_FOR_EXCEPTION
4563  (this->getGlobalLength () != src.getGlobalLength () ||
4564  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4565  prefix << "Global dimensions of the two Tpetra::MultiVector "
4566  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4567  << "," << src.getNumVectors () << "], and *this has dimensions ["
4568  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4569  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4570  TEUCHOS_TEST_FOR_EXCEPTION
4571  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4572  prefix << "The local row counts of the two Tpetra::MultiVector "
4573  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4574  << " and *this has " << this->getLocalLength () << " row(s).");
4575 
4576  // See #1510. We're writing to *this, so we don't care about its
4577  // contents in either memory space, and we don't want
4578  // DualView::modify to complain about "concurrent modification" of
4579  // host and device Views.
4580  this->clear_sync_state();
4581  this->modify_device ();
4582 
4583  // If need sync to device, then host has most recent version.
4584  const bool src_last_updated_on_host = src.need_sync_device ();
4585 
4586  if (src_last_updated_on_host) {
4587  localDeepCopy (this->getLocalViewDevice (),
4588  src.getLocalViewHost (),
4589  this->isConstantStride (),
4590  src.isConstantStride (),
4591  this->whichVectors_ (),
4592  src.whichVectors_ ());
4593  }
4594  else {
4595  localDeepCopy (this->getLocalViewDevice (),
4596  src.getLocalViewDevice (),
4597  this->isConstantStride (),
4598  src.isConstantStride (),
4599  this->whichVectors_ (),
4600  src.whichVectors_ ());
4601  }
4602  }
4603 
4604  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4605  bool
4608  using ::Tpetra::Details::PackTraits;
4609  using ST = impl_scalar_type;
4610  using HES =
4611  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
4612 
4613  const size_t l1 = this->getLocalLength();
4614  const size_t l2 = vec.getLocalLength();
4615  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4616  return false;
4617  }
4618  if (l1==0) {
4619  return true;
4620  }
4621 
4622  auto v1 = this->getLocalViewHost ();
4623  auto v2 = vec.getLocalViewHost ();
4624  if (PackTraits<ST, HES>::packValueCount (v1(0,0)) !=
4625  PackTraits<ST, HES>::packValueCount (v2(0,0))) {
4626  return false;
4627  }
4628 
4629  return true;
4630  }
4631 
4632  template <class ST, class LO, class GO, class NT>
4635  std::swap(mv.map_, this->map_);
4636  std::swap(mv.view_, this->view_);
4637  std::swap(mv.origView_, this->origView_);
4638  std::swap(mv.whichVectors_, this->whichVectors_);
4639  }
4640 
4641 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4642  template <class ST, class LO, class GO, class NT>
4643  void
4645  const Teuchos::SerialDenseMatrix<int, ST>& src)
4646  {
4647  using ::Tpetra::Details::localDeepCopy;
4648  using MV = MultiVector<ST, LO, GO, NT>;
4649  using IST = typename MV::impl_scalar_type;
4650  using input_view_type =
4651  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4652  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4653  using pair_type = std::pair<LO, LO>;
4654 
4655  const LO numRows = static_cast<LO> (src.numRows ());
4656  const LO numCols = static_cast<LO> (src.numCols ());
4657 
4658  TEUCHOS_TEST_FOR_EXCEPTION
4659  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4660  numCols != static_cast<LO> (dst.getNumVectors ()),
4661  std::invalid_argument, "Tpetra::deep_copy: On Process "
4662  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4663  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4664  << ", but src is " << numRows << " x " << numCols << ".");
4665 
4666  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4667  input_view_type src_orig (src_raw, src.stride (), numCols);
4668  input_view_type src_view =
4669  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4670 
4671  dst.clear_sync_state ();
4672  dst.modify_device ();
4673  constexpr bool src_isConstantStride = true;
4674  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4675  localDeepCopy (dst.getLocalViewDevice (),
4676  src_view,
4677  dst.isConstantStride (),
4678  src_isConstantStride,
4679  getMultiVectorWhichVectors (dst),
4680  srcWhichVectors);
4681  }
4682 
4683  template <class ST, class LO, class GO, class NT>
4684  void
4685  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4686  const MultiVector<ST, LO, GO, NT>& src)
4687  {
4688  using ::Tpetra::Details::localDeepCopy;
4689  using MV = MultiVector<ST, LO, GO, NT>;
4690  using IST = typename MV::impl_scalar_type;
4691  using output_view_type =
4692  Kokkos::View<IST**, Kokkos::LayoutLeft,
4693  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4694  using pair_type = std::pair<LO, LO>;
4695 
4696  const LO numRows = static_cast<LO> (dst.numRows ());
4697  const LO numCols = static_cast<LO> (dst.numCols ());
4698 
4699  TEUCHOS_TEST_FOR_EXCEPTION
4700  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4701  numCols != static_cast<LO> (src.getNumVectors ()),
4702  std::invalid_argument, "Tpetra::deep_copy: On Process "
4703  << src.getMap ()->getComm ()->getRank () << ", src is "
4704  << src.getLocalLength () << " x " << src.getNumVectors ()
4705  << ", but dst is " << numRows << " x " << numCols << ".");
4706 
4707  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4708  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4709  auto dst_view =
4710  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4711 
4712  constexpr bool dst_isConstantStride = true;
4713  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4714 
4715  // Prefer the host version of src's data.
4716  if (src.need_sync_host ()) { // last modified on device
4717  localDeepCopy (dst_view,
4718  src.getLocalViewDevice (),
4719  dst_isConstantStride,
4720  src.isConstantStride (),
4721  dstWhichVectors,
4722  getMultiVectorWhichVectors (src));
4723  }
4724  else {
4725  localDeepCopy (dst_view,
4726  src.getLocalViewHost (),
4727  dst_isConstantStride,
4728  src.isConstantStride (),
4729  dstWhichVectors,
4730  getMultiVectorWhichVectors (src));
4731  }
4732  }
4733 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4734 
4735  template <class Scalar, class LO, class GO, class NT>
4736  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4737  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4738  size_t numVectors)
4739  {
4740  typedef MultiVector<Scalar, LO, GO, NT> MV;
4741  return Teuchos::rcp (new MV (map, numVectors));
4742  }
4743 
4744  template <class ST, class LO, class GO, class NT>
4745  MultiVector<ST, LO, GO, NT>
4747  {
4748  typedef MultiVector<ST, LO, GO, NT> MV;
4749  MV cpy (src.getMap (), src.getNumVectors (), false);
4750  cpy.assign (src);
4751  return cpy;
4752  }
4753 
4754 } // namespace Tpetra
4755 
4756 //
4757 // Explicit instantiation macro
4758 //
4759 // Must be expanded from within the Tpetra namespace!
4760 //
4761 
4762 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4763 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4764  template class MultiVector< SCALAR , LO , GO , NODE >; \
4765  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4766  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4767  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4768  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4769 
4770 #else
4771 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4772  template class MultiVector< SCALAR , LO , GO , NODE >; \
4773  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4774 
4775 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4776 
4777 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void fill(const ExecutionSpace &execSpace, const ViewType &X, const ValueType &alpha, const IndexType numRows, const IndexType numCols)
Fill the entries of the given 1-D or 2-D Kokkos::View with the given scalar value alpha.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
Base class for distributed Tpetra objects that support data redistribution.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
Sets up and executes a communication plan for a Tpetra DistObject.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
dual_view_type origView_
The "original view" of the MultiVector's data.
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual std::string description() const
A simple one-line description of this object.
void reduce()
Sum values of a locally replicated multivector across all processes.
dual_view_type::t_host getLocalViewHost() const
A local Kokkos::View of host memory.
void randomize()
Set all values in the multivector to pseudorandom numbers.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using local (row) index.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
void clear_sync_state()
Clear "modified" flags on both host and device sides.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap the contents of this MultiVector with those of mv (the Map, data views, and column indices are exchanged).
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void modify_device()
Mark data as modified on the device side.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using global row index.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
typename Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault) const
Update (+=) a value in host memory, using local row index.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value) const
Replace value in host memory, using global row index.
size_t getNumVectors() const
Number of columns in the multivector.
void sync_host()
Synchronize to Host.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
void sync_device()
Synchronize to Device.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void modify_host()
Mark data as modified on the host side.
dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
dual_view_type::t_dev getLocalViewDevice() const
A local Kokkos::View of device memory.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
int local_ordinal_type
Default value of the LocalOrdinal template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
EWhichNorm
Input argument for normImpl() (which see).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void localDeepCopy(const DstViewType &dst, const SrcViewType &src, const bool dstConstStride, const bool srcConstStride, const DstWhichVecsType &dstWhichVecs, const SrcWhichVecsType &srcWhichVecs)
Implementation of Tpetra::MultiVector deep copy of local data.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values into existing values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.