Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
58 
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
62 #include <new> // ::operator new[]
63 #include <algorithm> // std::sort
64 #include <Zoltan2_Util.hpp>
65 #include <vector>
66 
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
69 #else
70 #include <Teuchos_Hashtable.hpp>
71 #endif // C++11 is enabled
72 
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h" // for error codes
78 #endif
79 #endif
80 
81 #ifdef HAVE_ZOLTAN2_OMP
82 #include <omp.h>
83 #endif
84 
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
87 
88 //if the (last dimension reduce all count) x the mpi world size
89 //estimated to be bigger than this number then migration will be forced
90 //in earlier iterations.
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
92 //if parts right before last dimension are estimated to have less than
93 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
94 #define MIN_WORK_LAST_DIM 1000
95 
96 
97 
98 
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
//imbalance calculation. Wreal / Wexpected - 1
//The expansions are wrapped in parentheses so that each macro behaves as a
//single operand inside a larger expression (e.g. 2 * imbalanceOf2(a, b)).
#define imbalanceOf(Wachieved, totalW, expectedRatio) \
        ((Wachieved) / ((totalW) * (expectedRatio)) - 1)
#define imbalanceOf2(Wachieved, wExpected) \
        ((Wachieved) / (wExpected) - 1)
105 
106 
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
108 
109 
110 namespace Teuchos{
111 
116 template <typename Ordinal, typename T>
117 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
118 {
119 private:
120  Ordinal size;
121  T _EPSILON;
122 
123 public:
126  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
127 
134  Zoltan2_BoxBoundaries (Ordinal s_):
135  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
136 
139  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
140  {
141  for (Ordinal i=0; i < count; i++){
142  if (Z2_ABS(inBuffer[i]) > _EPSILON){
143  inoutBuffer[i] = inBuffer[i];
144  }
145  }
146  }
147 };
148 } // namespace Teuchos
149 
150 namespace Zoltan2{
151 
/*! \brief Allocates an array of \c size elements of type T.
 *  \param size number of elements requested.
 *  \return pointer to the new array, or NULL when \c size is 0.
 *
 *  Allocation failure is reported by operator new[] itself, which throws
 *  std::bad_alloc; a plain new-expression never yields a null pointer, so no
 *  null check is needed after it. Release the array with freeArray().
 */
template <typename T>
T *allocMemory(size_t size){
  if (size == 0){
    // a zero-length request yields NULL rather than a zero-length array
    return NULL;
  }
  return new T[size];
}
168 
/*! \brief Releases an array obtained with new[] and resets the pointer.
 *  \param array reference to the pointer; set to NULL after the array is
 *   deleted. A NULL pointer is left as is.
 */
template <typename T>
void freeArray(T *&array){
  if (array == NULL){
    return;
  }
  delete [] array;
  array = NULL;
}
179 
180 
188 template <typename IT, typename CT, typename WT>
190 {
191 public:
192  //TODO: Why volatile?
193  //no idea, another intel compiler failure.
194  volatile IT index;
195  volatile CT count;
196  //unsigned int val;
197  volatile WT *val;
198  volatile WT _EPSILON;
199 
201  this->index = 0;
202  this->count = 0;
203  this->val = NULL;
204  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
205  }
206 
207 
208  uMultiSortItem(IT index_ ,CT count_, WT *vals_){
209  this->index = index_;
210  this->count = count_;
211  this->val = vals_;
212  this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
213  }
214 
216  this->index = other.index;
217  this->count = other.count;
218  this->val = other.val;
219  this->_EPSILON = other._EPSILON;
220  }
221 
223  //freeArray<WT>(this->val);
224  }
225 
226  void set(IT index_ ,CT count_, WT *vals_){
227  this->index = index_;
228  this->count = count_;
229  this->val = vals_;
230  }
231 
232 
234  this->index = other.index;
235  this->count = other.count;
236  this->val = other.val;
237  return *(this);
238  }
239 
240  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
241  assert (this->count == other.count);
242  for(CT i = 0; i < this->count; ++i){
243  //if the values are equal go to next one.
244  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
245  continue;
246  }
247  //if next value is smaller return true;
248  if(this->val[i] < other.val[i]){
249  return true;
250  }
251  //if next value is bigger return false;
252  else {
253  return false;
254  }
255  }
256  //if they are totally equal.
257  return this->index < other.index;
258  }
259  bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
260  assert (this->count == other.count);
261  for(CT i = 0; i < this->count; ++i){
262  //if the values are equal go to next one.
263  if (ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
264  continue;
265  }
266  //if next value is bigger return true;
267  if(this->val[i] > other.val[i]){
268  return true;
269  }
270  //if next value is smaller return false;
271  else //(this->val[i] > other.val[i])
272  {
273  return false;
274  }
275  }
276  //if they are totally equal.
277  return this->index > other.index;
278  }
279 };// uSortItem;
280 
/*! \brief Sort item for the quick sort function: an id together with the
 *  value the items are ordered by.
 */
template <class IT, class WT>
struct uSortItem
{
  IT id;
  WT val;
};// uSortItem;

/*! \brief Exchanges the contents of two sort items (helper of uqsort).
 */
template <class IT, class WT>
inline void uqsortSwapItems(uSortItem<IT, WT> &x, uSortItem<IT, WT> &y)
{
  const uSortItem<IT, WT> held = x;
  x = y;
  y = held;
}

/*! \brief Quick sort function. Sorts arr[0..n-1] in place, in
 *  non-decreasing order of the \c val field. The sort is not stable.
 *  \param n number of items in \c arr; 0 is accepted and leaves \c arr
 *   untouched, also when IT is an unsigned type.
 *  \param arr the items to sort.
 *
 *  Iterative quicksort with median-of-three pivot selection; ranges shorter
 *  than M items are finished with insertion sort. The larger side of every
 *  partition is pushed on an explicit stack and the smaller side is handled
 *  next. If the stack would overflow, a message is printed and the process
 *  exits with status 1.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr)
{
  const int NSTACK = 50;   // number of usable stack slots
  const IT M = 7;          // ranges shorter than this use insertion sort
  IT i, ir=n, j, k, l=1;
  IT jstack=0;
  // Slots 1..NSTACK are used (slot 0 is never touched), hence the extra
  // element.
  IT istack[NSTACK + 1];
  WT aval;
  uSortItem<IT,WT> a;

  // The algorithm below is written with 1-based indices.
  --arr;
  for (;;)
  {
    // Same test as "ir - l < M", phrased without a subtraction so that an
    // empty input (ir == 0, l == 1) does not wrap for unsigned IT.
    if (ir < l + M)
    {
      // insertion sort of the short range [l, ir]
      for (j=l+1;j<=ir;j++)
      {
        a=arr[j];
        aval = a.val;
        for (i=j-1;i>=1;i--)
        {
          if (arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if (jstack == 0)
        break;
      // pop the next pending range
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else
    {
      // Median of three: afterwards arr[l+1] <= arr[l] <= arr[ir], so
      // arr[l] is the pivot and the two neighbours act as sentinels.
      k=(l+ir) >> 1;
      uqsortSwapItems(arr[k], arr[l+1]);
      if (arr[l+1].val > arr[ir].val)
      {
        uqsortSwapItems(arr[l+1], arr[ir]);
      }
      if (arr[l].val > arr[ir].val)
      {
        uqsortSwapItems(arr[l], arr[ir]);
      }
      if (arr[l+1].val > arr[l].val)
      {
        uqsortSwapItems(arr[l+1], arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for (;;)
      {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if (j < i) break;
        uqsortSwapItems(arr[i], arr[j]);
      }
      // put the pivot in its final position
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if (jstack > NSTACK){
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        exit(1);
      }
      // push the larger side, continue with the smaller one
      if (ir-i+1 >= j-l)
      {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else
      {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
378 
379 template <class IT, class WT, class SIGN>
381 {
382  IT id;
383  //unsigned int val;
384  WT val;
385  SIGN signbit; // 1 means positive, 0 means negative.
386  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
387  /*if I am negative, the other is positive*/
388  if (this->signbit < rhs.signbit){
389  return true;
390  }
391  /*if both has the same sign*/
392  else if (this->signbit == rhs.signbit){
393 
394  if (this->val < rhs.val){//if my value is smaller,
395  return this->signbit;//then if we both are positive return true.
396  //if we both are negative, return false.
397  }
398  else if (this->val > rhs.val){//if my value is larger,
399  return !this->signbit; //then if we both are positive return false.
400  //if we both are negative, return true.
401  }
402  else { //if both are equal.
403  return false;
404  }
405  }
406  else {
407  /*if I am positive, the other is negative*/
408  return false;
409  }
410 
411  }
412  bool operator>(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
413  /*if I am positive, the other is negative*/
414  if (this->signbit > rhs.signbit){
415  return true;
416  }
417  /*if both has the same sign*/
418  else if (this->signbit == rhs.signbit){
419 
420  if (this->val < rhs.val){//if my value is smaller,
421  return !this->signbit;//then if we both are positive return false.
422  //if we both are negative, return true.
423  }
424  else if (this->val > rhs.val){//if my value is larger,
425  return this->signbit; //then if we both are positive return true.
426  //if we both are negative, return false.
427  }
428  else { // if they are equal
429  return false;
430  }
431  }
432  else {
433  /*if I am negative, the other is positive*/
434  return false;
435  }
436  }
438  return !(*this > rhs);}
440  return !(*this < rhs);}
441 };
442 
446 template <class IT, class WT, class SIGN>
448 
449  IT NSTACK = 50;
450  IT M = 7;
451  IT i, ir=n, j, k, l=1;
452  IT jstack=0, istack[50];
454 
455  --arr;
456  for (;;)
457  {
458  if (ir < M + l)
459  {
460  for (j=l+1;j<=ir;j++)
461  {
462  a=arr[j];
463  for (i=j-1;i>=1;i--)
464  {
465  if (arr[i] <= a)
466  {
467  break;
468  }
469  arr[i+1] = arr[i];
470  }
471  arr[i+1]=a;
472  }
473  if (jstack == 0)
474  break;
475  ir=istack[jstack--];
476  l=istack[jstack--];
477  }
478  else
479  {
480  k=(l+ir) >> 1;
481  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[k],arr[l+1], temp)
482  if (arr[l+1] > arr[ir])
483  {
484  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[ir],temp)
485  }
486  if (arr[l] > arr[ir])
487  {
488  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l],arr[ir],temp)
489  }
490  if (arr[l+1] > arr[l])
491  {
492  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[l+1],arr[l],temp)
493  }
494  i=l+1;
495  j=ir;
496  a=arr[l];
497  for (;;)
498  {
499  do i++; while (arr[i] < a);
500  do j--; while (arr[j] > a);
501  if (j < i) break;
502  ZOLTAN2_ALGMULTIJAGGED_SWAP(arr[i],arr[j],temp);
503  }
504  arr[l]=arr[j];
505  arr[j]=a;
506  jstack += 2;
507  if (jstack > NSTACK){
508  std::cout << "uqsort: NSTACK too small in sort." << std::endl;
509  exit(1);
510  }
511  if (ir+l+1 >= j+i)
512  {
513  istack[jstack]=ir;
514  istack[jstack-1]=i;
515  ir=j-1;
516  }
517  else
518  {
519  istack[jstack]=j-1;
520  istack[jstack-1]=l;
521  l=i;
522  }
523  }
524  }
525 }
526 
530 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
531  typename mj_part_t>
532 class AlgMJ
533 {
534 private:
536  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
537 
538  RCP<const Environment> mj_env; //the environment object
539  RCP<const Comm<int> > mj_problemComm; //initial comm object
540 
541  double imbalance_tolerance; //input imbalance tolerance.
542  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
543  int recursion_depth; //the number of steps that partitioning will be solved in.
544  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
545 
546  size_t initial_num_loc_coords; //initial num local coords.
547  global_size_t initial_num_glob_coords; //initial num global coords.
548 
549  mj_lno_t num_local_coords; //number of local coords.
550  mj_gno_t num_global_coords; //number of global coords.
551 
552  mj_scalar_t **mj_coordinates; //two dimension coordinate array
553  mj_scalar_t **mj_weights; //two dimension weight array
554  bool *mj_uniform_parts; //if the target parts are uniform
555  mj_scalar_t **mj_part_sizes; //target part weight sizes.
556  bool *mj_uniform_weights; //if the coordinates have uniform weights.
557 
558  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
559  size_t num_global_parts; //the targeted number of parts
560 
561  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
562  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
563  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
564 
565  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
566  mj_lno_t *new_coordinate_permutations; //permutation work array.
567  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
568 
569  mj_lno_t *part_xadj; //beginning and end of each part.
570  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
571 
572  //get mj specific parameters.
573  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
574  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
575 
576  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
577  int mj_user_recursion_depth; //the recursion depth value provided by user.
578  bool mj_keep_part_boxes; //if the boxes need to be kept.
579 
580  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
581  int migration_type; // when doing the migration, 0 will aim for perfect load-imbalance,
582  //1 - will aim for minimized number of messages with possibly bad load-imbalance
583  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
584  int num_threads; //num threads
585 
586  // Nonuniform first level partitioning (Currently available only for sequential_task_partitioning):
587  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
588  // machine coordinates and application coordinates.
589  // An optimization that completely partitions the most important machine dimension
590  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
591  // MJ alg follows after the nonuniform first level partitioning.
592  //
593  // Ex. (first level partitioning): If we have 120 elements,
594  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
595  // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
596  // continues for all subsequent levels.
597  mj_part_t num_first_level_parts; // If used, number of parts requested for a nonuniform first level partitioning
598  const mj_part_t *first_level_distribution; // If used, the requested distribution of parts for the nonuniform first level partitioning
599 
600  mj_part_t total_num_cut ; //how many cuts will be totally
601  mj_part_t total_num_part; //how many parts will be totally
602 
603  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
604  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
605  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
606 
607  mj_part_t total_dim_num_reduce_all; //estimate on #reduceAlls can be done.
608  mj_part_t last_dim_num_part; //max no of parts that might occur
609  //during the partition before the
610  //last partitioning dimension.
611 
612  RCP<Comm<int> > comm; //comm object than can be altered during execution
613  float fEpsilon; //epsilon for float
614  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
615 
616  mj_scalar_t maxScalar_t; //max possible scalar
617  mj_scalar_t minScalar_t; //min scalar
618 
619  mj_scalar_t *all_cut_coordinates;
620  mj_scalar_t *max_min_coords;
621  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline
622  mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each outline
623 
624  // work array to manipulate coordinate of cutlines in different iterations.
625  //necessary because previous cut line information is used for determining
626  //the next cutline information. therefore, cannot update the cut work array
627  //until all cutlines are determined.
628  mj_scalar_t *cut_coordinates_work_array;
629 
630  //cumulative part weight array.
631  mj_scalar_t *target_part_weights;
632 
633  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
634  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
635  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
636  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
637 
638  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
639  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
640 
641  //isDone is used to determine if a cutline is determined already.
642  //If a cut line is already determined, the next iterations will skip this cut line.
643  bool *is_cut_line_determined;
644  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
645  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
646  mj_part_t *my_incomplete_cut_count;
647  //local part weights of each thread.
648  double **thread_part_weights;
649  //the work manipulation array for part weights.
650  double **thread_part_weight_work;
651 
652  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
653  mj_scalar_t **thread_cut_left_closest_point;
654  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
655  mj_scalar_t **thread_cut_right_closest_point;
656 
657  //to store how many points in each part a thread has.
658  mj_lno_t **thread_point_counts;
659 
660  mj_scalar_t *process_rectilinear_cut_weight;
661  mj_scalar_t *global_rectilinear_cut_weight;
662 
663  //for faster communication, concatenation of
664  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
665  //leftClosest distances sized P-1, since P-1 cut lines
666  //rightClosest distances size P-1, since P-1 cut lines.
667  mj_scalar_t *total_part_weight_left_right_closests ;
668  mj_scalar_t *global_total_part_weight_left_right_closests;
669 
670  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
671  // constructed only if
672  // mj_keep_part_boxes == true
673  RCP<mj_partBox_t> global_box;
674  int myRank, myActualRank; //processor rank, and initial rank
675 
676  bool divide_to_prime_first;
677 
678  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
679  * the input. part_no_array takes
680  * precedence if both are provided.
681  * Depending on these parameters, total cut/part number,
682  * maximum part/cut number along a dimension, estimated number of reduceAlls,
683  * and the number of parts before the last dimension is calculated.
684  * */
685  void set_part_specifications();
686 
687  /* \brief Tries to determine the part number for current dimension,
688  * by trying to make the partitioning as square as possible.
689  * \param num_total_future how many more partitionings are required.
690  * \param root how many more recursion depth is left.
691  */
692  inline mj_part_t get_part_count(
693  mj_part_t num_total_future,
694  double root);
695 
696  /* \brief Allocates the all required memory for the mj partitioning algorithm.
697  *
698  */
699  void allocate_set_work_memory();
700 
701  /* \brief for part communication we keep track of the box boundaries.
702  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
703  * This function initializes a single box with all global min and max coordinates.
704  * \param initial_partitioning_boxes the input and output vector for boxes.
705  */
706  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
707 
708  /* \brief compute global bounding box: min/max coords of global domain */
709  void compute_global_box();
710 
711  /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
712  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
713  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
714  * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor.
715  *
716  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
717  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
718  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
719  * \param future_num_parts: output, max number of future parts that will be obtained from a single
720  * \param current_num_parts: input, how many parts are there currently.
721  * \param current_iteration: input, current dimension iteration number.
722  * \param input_part_boxes: input, if boxes are kept, current boxes.
723  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
724  */
725  mj_part_t update_part_num_arrays(
726  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
727  std::vector<mj_part_t> *future_num_part_in_parts,
728  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
729  mj_part_t &future_num_parts,
730  mj_part_t current_num_parts,
731  int current_iteration,
732  RCP<mj_partBoxVector_t> input_part_boxes,
733  RCP<mj_partBoxVector_t> output_part_boxes,
734  mj_part_t atomic_part_count);
735 
747  void mj_get_local_min_max_coord_totW(
748  mj_lno_t coordinate_begin_index,
749  mj_lno_t coordinate_end_index,
750  mj_lno_t *mj_current_coordinate_permutations,
751  mj_scalar_t *mj_current_dim_coords,
752  mj_scalar_t &min_coordinate,
753  mj_scalar_t &max_coordinate,
754  mj_scalar_t &total_weight);
755 
763  void mj_get_global_min_max_coord_totW(
764  mj_part_t current_concurrent_num_parts,
765  mj_scalar_t *local_min_max_total,
766  mj_scalar_t *global_min_max_total);
767 
795  void mj_get_initial_cut_coords_target_weights(
796  mj_scalar_t min_coord,
797  mj_scalar_t max_coord,
798  mj_part_t num_cuts/*p-1*/ ,
799  mj_scalar_t global_weight,
800  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
801  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
802 
803  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
804  std::vector <mj_part_t> *next_future_num_parts_in_parts,
805  mj_part_t concurrent_current_part,
806  mj_part_t obtained_part_index,
807  mj_part_t num_target_first_level_parts = 1,
808  const mj_part_t *target_first_level_dist = NULL);
809 
822  void set_initial_coordinate_parts(
823  mj_scalar_t &max_coordinate,
824  mj_scalar_t &min_coordinate,
825  mj_part_t &concurrent_current_part_index,
826  mj_lno_t coordinate_begin_index,
827  mj_lno_t coordinate_end_index,
828  mj_lno_t *mj_current_coordinate_permutations,
829  mj_scalar_t *mj_current_dim_coords,
830  mj_part_t *mj_part_ids,
831  mj_part_t &partition_count);
832 
843  void mj_1D_part(
844  mj_scalar_t *mj_current_dim_coords,
845  double imbalanceTolerance,
846  mj_part_t current_work_part,
847  mj_part_t current_concurrent_num_parts,
848  mj_scalar_t *current_cut_coordinates,
849  mj_part_t total_incomplete_cut_count,
850  std::vector <mj_part_t> &num_partitioning_in_current_dim);
851 
871  void mj_1D_part_get_thread_part_weights(
872  size_t total_part_count,
873  mj_part_t num_cuts,
874  mj_scalar_t max_coord,
875  mj_scalar_t min_coord,
876  mj_lno_t coordinate_begin_index,
877  mj_lno_t coordinate_end_index,
878  mj_scalar_t *mj_current_dim_coords,
879  mj_scalar_t *temp_current_cut_coords,
880  bool *current_cut_status,
881  double *my_current_part_weights,
882  mj_scalar_t *my_current_left_closest,
883  mj_scalar_t *my_current_right_closest);
884 
892  void mj_accumulate_thread_results(
893  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
894  mj_part_t current_work_part,
895  mj_part_t current_concurrent_num_parts);
896 
927  void mj_get_new_cut_coordinates(
928  const size_t &num_total_part,
929  const mj_part_t &num_cuts,
930  const mj_scalar_t &max_coordinate,
931  const mj_scalar_t &min_coordinate,
932  const mj_scalar_t &global_total_weight,
933  const double &used_imbalance_tolerance,
934  mj_scalar_t * current_global_part_weights,
935  const mj_scalar_t * current_local_part_weights,
936  const mj_scalar_t *current_part_target_weights,
937  bool *current_cut_line_determined,
938  mj_scalar_t *current_cut_coordinates,
939  mj_scalar_t *current_cut_upper_bounds,
940  mj_scalar_t *current_cut_lower_bounds,
941  mj_scalar_t *current_global_left_closest_points,
942  mj_scalar_t *current_global_right_closest_points,
943  mj_scalar_t * current_cut_lower_bound_weights,
944  mj_scalar_t * current_cut_upper_weights,
945  mj_scalar_t *new_current_cut_coordinates,
946  mj_scalar_t *current_part_cut_line_weight_to_put_left,
947  mj_part_t *rectilinear_cut_count,
948  mj_part_t &my_num_incomplete_cut);
949 
959  void mj_calculate_new_cut_position (
960  mj_scalar_t cut_upper_bound,
961  mj_scalar_t cut_lower_bound,
962  mj_scalar_t cut_upper_weight,
963  mj_scalar_t cut_lower_weight,
964  mj_scalar_t expected_weight,
965  mj_scalar_t &new_cut_position);
966 
977  void mj_create_new_partitions(
978  mj_part_t num_parts,
979  mj_scalar_t *mj_current_dim_coords,
980  mj_scalar_t *current_concurrent_cut_coordinate,
981  mj_lno_t coordinate_begin,
982  mj_lno_t coordinate_end,
983  mj_scalar_t *used_local_cut_line_weight_to_left,
984  double **used_thread_part_weight_work,
985  mj_lno_t *out_part_xadj);
986 
1009  bool mj_perform_migration(
1010  mj_part_t in_num_parts, //current umb parts
1011  mj_part_t &out_num_parts, //output umb parts.
1012  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1013  mj_part_t &output_part_begin_index,
1014  size_t migration_reduce_all_population,
1015  mj_lno_t num_coords_for_last_dim_part,
1016  std::string iteration,
1017  RCP<mj_partBoxVector_t> &input_part_boxes,
1018  RCP<mj_partBoxVector_t> &output_part_boxes);
1019 
1029  void get_processor_num_points_in_parts(
1030  mj_part_t num_procs,
1031  mj_part_t num_parts,
1032  mj_gno_t *&num_points_in_all_processor_parts);
1033 
1046  bool mj_check_to_migrate(
1047  size_t migration_reduce_all_population,
1048  mj_lno_t num_coords_for_last_dim_part,
1049  mj_part_t num_procs,
1050  mj_part_t num_parts,
1051  mj_gno_t *num_points_in_all_processor_parts);
1052 
1053 
1071  void mj_migration_part_proc_assignment(
1072  mj_gno_t * num_points_in_all_processor_parts,
1073  mj_part_t num_parts,
1074  mj_part_t num_procs,
1075  mj_lno_t *send_count_to_each_proc,
1076  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1077  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1078  mj_part_t &out_num_part,
1079  std::vector<mj_part_t> &out_part_indices,
1080  mj_part_t &output_part_numbering_begin_index,
1081  int *coordinate_destinations);
1082 
1099  void mj_assign_proc_to_parts(
1100  mj_gno_t * num_points_in_all_processor_parts,
1101  mj_part_t num_parts,
1102  mj_part_t num_procs,
1103  mj_lno_t *send_count_to_each_proc,
1104  std::vector<mj_part_t> &processor_ranks_for_subcomm,
1105  std::vector<mj_part_t> *next_future_num_parts_in_parts,
1106  mj_part_t &out_part_index,
1107  mj_part_t &output_part_numbering_begin_index,
1108  int *coordinate_destinations);
1109 
1120  void assign_send_destinations(
1121  mj_part_t num_parts,
1122  mj_part_t *part_assignment_proc_begin_indices,
1123  mj_part_t *processor_chains_in_parts,
1124  mj_lno_t *send_count_to_each_proc,
1125  int *coordinate_destinations);
1126 
1139  void assign_send_destinations2(
1140  mj_part_t num_parts,
1141  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
1142  int *coordinate_destinations,
1143  mj_part_t &output_part_numbering_begin_index,
1144  std::vector<mj_part_t> *next_future_num_parts_in_parts);
1145 
1162  void mj_assign_parts_to_procs(
1163  mj_gno_t * num_points_in_all_processor_parts,
1164  mj_part_t num_parts,
1165  mj_part_t num_procs,
1166  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
1167  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
1168  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
1169  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
1170  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1171  int *coordinate_destinations);
1172 
1185  void mj_migrate_coords(
1186  mj_part_t num_procs,
1187  mj_lno_t &num_new_local_points,
1188  std::string iteration,
1189  int *coordinate_destinations,
1190  mj_part_t num_parts);
1191 
1198  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1199 
1200 
1206  void fill_permutation_array(
1207  mj_part_t output_num_parts,
1208  mj_part_t num_parts);
1209 
1218  void set_final_parts(
1219  mj_part_t current_num_parts,
1220  mj_part_t output_part_begin_index,
1221  RCP<mj_partBoxVector_t> &output_part_boxes,
1222  bool is_data_ever_migrated);
1225  void free_work_memory();
// Declaration only; the definition is outside this chunk.
// At its single call site visible in this file, the arguments are: the number
// of sub-parts, the coordinates of the current dimension, the cut coordinates
// for the part, the begin/end offsets of the part in the permutation array,
// the per-cut "weight to put left" array, the output slice of new_part_xadj,
// the chosen coordinate index, the longest-dimension flag, and the sorted
// dimension-range array.
1239  void create_consistent_chunks(
1240  mj_part_t num_parts,
1241  mj_scalar_t *mj_current_dim_coords,
1242  mj_scalar_t *current_concurrent_cut_coordinate,
1243  mj_lno_t coordinate_begin,
1244  mj_lno_t coordinate_end,
1245  mj_scalar_t *used_local_cut_line_weight_to_left,
1246  mj_lno_t *out_part_xadj,
1247  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1248 
1253  mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1254  mj_part_t largest_factor = 1;
1255  mj_part_t n = num_parts;
1256  mj_part_t divisor = 2;
1257  while (n > 1){
1258  while (n % divisor == 0){
1259  n = n / divisor;
1260  largest_factor = divisor;
1261  }
1262  ++divisor;
1263  if (divisor * divisor > n){
1264  if (n > 1){
1265  largest_factor = n;
1266  }
1267  break;
1268  }
1269  }
1270  return largest_factor;
1271  }
1272 public:
// Default constructor. The out-of-class definition later in this file
// zero/NULL-initializes the members and then sets fEpsilon, sEpsilon,
// maxScalar_t and minScalar_t from std::numeric_limits.
1273  AlgMJ();
1274 
// Declaration only; the definition is outside this chunk.
// The blank lines in the parameter list group the arguments into: environment
// and communicator; partitioning controls (tolerance, part count, per-level
// part array, recursion depth); coordinate data; weight and part-size data;
// and two reference-to-pointer parameters named result_*.
// NOTE(review): the result_* parameters are presumably outputs allocated or
// rebound by the definition -- confirm ownership there.
1302  void multi_jagged_part(
1303  const RCP<const Environment> &env,
1304  RCP<const Comm<int> > &problemComm,
1305 
1306  double imbalance_tolerance,
1307  size_t num_global_parts,
1308  mj_part_t *part_no_array,
1309  int recursion_depth,
1310 
1311  int coord_dim,
1312  mj_lno_t num_local_coords,
1313  mj_gno_t num_global_coords,
1314  const mj_gno_t *initial_mj_gnos,
1315  mj_scalar_t **mj_coordinates,
1316 
1317  int num_weights_per_coord,
1318  bool *mj_uniform_weights,
1319  mj_scalar_t **mj_weights,
1320  bool *mj_uniform_parts,
1321  mj_scalar_t **mj_part_sizes,
1322 
1323  mj_part_t *&result_assigned_part_ids,
1324  mj_gno_t *&result_mj_gnos);
1325 
1326 
// NOTE(review): the line that names this member is missing from this extract
// (the listing jumps from 1326 to 1336); only the parameter list survives.
// The parameter names mirror the members distribute_points_on_cut_lines,
// max_concurrent_part_calculation, check_migrate_avoid_migration_option,
// minimum_migration_imbalance and migration_type that the constructor
// initializes, so this is presumably their setter -- confirm against the
// original header. migration_type_ defaults to 0.
1336  bool distribute_points_on_cut_lines_,
1337  int max_concurrent_part_calculation_,
1338  int check_migrate_avoid_migration_option_,
1339  double minimum_migration_imbalance_, int migration_type_ = 0);
1340 
// Part-box related members (declarations only).
// set_to_keep_part_boxes: its definition later in this file sets
// mj_keep_part_boxes to true.
1344  void set_to_keep_part_boxes();
1345 
// get_global_box: its definition later in this file returns this->global_box.
1348  RCP<mj_partBox_t> get_global_box() const;
1349 
// get_kept_boxes: definition is outside this chunk; presumably returns the
// kept_boxes member -- confirm.
1350  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1351 
// compute_global_box_boundaries: definition is outside this chunk.
1352  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1353  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1354 
// NOTE(review): the line that names this member is missing from this extract
// (the listing jumps from 1354 to 1398); only the parameter list survives.
// The parameter types and order match the out-of-class definition that
// follows the class, which partitions sequentially on a serial communicator.
// num_first_level_parts_ defaults to 1 and first_level_distribution_ to NULL;
// the definition only uses them when the distribution is non-NULL and the
// count is greater than 1.
1398  const RCP<const Environment> &env,
1399  mj_lno_t num_total_coords,
1400  mj_lno_t num_selected_coords,
1401  size_t num_target_part,
1402  int coord_dim,
1403  mj_scalar_t **mj_coordinates,
1404  mj_lno_t *initial_selected_coords_output_permutation,
1405  mj_lno_t *output_xadj,
1406  int recursion_depth,
1407  const mj_part_t *part_no_array,
1408  bool partition_along_longest_dim,
1409  int num_ranks_per_node,
1410  bool divide_to_prime_first_,
1411  mj_part_t num_first_level_parts_ = 1,
1412  const mj_part_t *first_level_distribution_ = NULL);
1413 
1414 };
1415 
/*
 * Sequential (single-process) multi-jagged partitioning of a coordinate set.
 *
 * NOTE(review): the line carrying the qualified function name (listing line
 * 1460) is missing from this extract. The parameter list matches the last
 * member declared in the class above; confirm the name against the original
 * header.
 *
 * What the body does:
 *  - runs on the default serial communicator with num_threads = 1,
 *    imbalance_tolerance = 0, and the uniform weight / uniform part flags set;
 *  - stores the caller's mj_coordinates_ pointer (no copy is made in this
 *    function) and later negates some coordinate values in place (see the
 *    "FLIPPED ZORDER" marker), so the caller's coordinate arrays are modified;
 *  - copies inital_adjList_output_adjlist into the work permutation on entry
 *    and copies the final permutation back into it on exit;
 *  - writes output_xadj as output_xadj[0] = 0 followed by num_global_parts
 *    entries taken from part_xadj, so it must hold num_global_parts + 1 values.
 */
1458 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1459  typename mj_part_t>
1461  const RCP<const Environment> &env,
1462  mj_lno_t num_total_coords,
1463  mj_lno_t num_selected_coords,
1464  size_t num_target_part,
1465  int coord_dim_,
1466  mj_scalar_t **mj_coordinates_,
1467  mj_lno_t *inital_adjList_output_adjlist,
1468  mj_lno_t *output_xadj,
1469  int rd,
1470  const mj_part_t *part_no_array_,
1471  bool partition_along_longest_dim,
1472  int num_ranks_per_node,
1473  bool divide_to_prime_first_,
1474  mj_part_t num_first_level_parts_,
1475  const mj_part_t *first_level_distribution_) {
1476 
      // Everything below runs on a serial communicator; commN is a null RCP
      // that is only passed to getDefaultSerialComm.
1477  this->mj_env = env;
1478  const RCP<Comm<int> > commN;
1479  this->mj_problemComm =
1480  Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1481  this->comm =
1482  Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
      // NOTE(review): both rank members are set to 1 here, not 0 -- confirm
      // this is intentional.
1483  this->myActualRank = this->myRank = 1;
1484 
1485 #ifdef HAVE_ZOLTAN2_OMP
1486  //int actual_num_threads = omp_get_num_threads();
1487  //omp_set_num_threads(1);
1488 #endif
1489 
1490  this->divide_to_prime_first = divide_to_prime_first_;
1491  //weights are uniform for task mapping
1492 
1493  //parts are uniform for task mapping
1494  //as input indices.
1495  this->imbalance_tolerance = 0;
1496  this->num_global_parts = num_target_part;
      // The const qualifier of the caller's array is cast away here.
1497  this->part_no_array = (mj_part_t *)part_no_array_;
1498  this->recursion_depth = rd;
1499 
1500  // If nonuniform first level partitioning, the requested num of parts and the requested distribution of
1501  // elements for each part
1502  this->num_first_level_parts = num_first_level_parts_;
1503  this->first_level_distribution = (mj_part_t *)first_level_distribution_;
1504 
1505  this->coord_dim = coord_dim_;
1506  this->num_local_coords = num_total_coords;
1507  this->num_global_coords = num_total_coords;
      // NOTE(review): this statement stores the caller's pointer; no copy is
      // made in this function, despite the trailing comment.
1508  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
1509 
1512  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1513 
      // Placeholder weight / part-size descriptors: one uniform-weight flag,
      // one uniform-part flag, and one-element pointer arrays. All four are
      // released at the end of this function.
1514  this->num_weights_per_coord = 0;
1515  bool *tmp_mj_uniform_weights = new bool[1];
1516  this->mj_uniform_weights = tmp_mj_uniform_weights;
1517  this->mj_uniform_weights[0] = true;
1518 
      // NOTE(review): tmp_mj_weights[0] is never assigned in this function.
1519  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1520  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights
1521 
1522  bool *tmp_mj_uniform_parts = new bool[1];
1523  this->mj_uniform_parts = tmp_mj_uniform_parts;
1524  this->mj_uniform_parts[0] = true;
1525 
1526  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1527  this->mj_part_sizes = tmp_mj_part_sizes;
1528  this->mj_part_sizes[0] = NULL;
1529 
1530  this->num_threads = 1;
1531  this->set_part_specifications();
1532 
1533  this->allocate_set_work_memory();
1534  //the end of the initial partition is the end of coordinates.
      // The single initial part ends at num_selected_coords, while the copy
      // loop below transfers all num_total_coords permutation entries.
1535  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1536  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1537  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1538  }
1539 
1540  mj_part_t current_num_parts = 1;
1541 
1542  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1543 
1544  mj_part_t future_num_parts = this->total_num_part;
1545 
      // Two heap-allocated vectors that are swapped at the start of every
      // level; both are deleted at the end of this function.
1546  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1547  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1548  next_future_num_parts_in_parts->push_back(this->num_global_parts);
      // t1 and t2 are null RCPs that are only passed to update_part_num_arrays.
1549  RCP<mj_partBoxVector_t> t1;
1550  RCP<mj_partBoxVector_t> t2;
1551 
1552 
      // Scratch storage used only when partition_along_longest_dim is set.
1553  std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1554  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted = &(coord_dimension_range_sorted[0]);
1555  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1556  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1557 
      // One iteration per recursion level.
1558  for (int i = 0; i < this->recursion_depth; ++i) {
1559 
1560  //partitioning array. size will be as the number of current partitions and this
1561  //holds how many parts that each part will be in the current dimension partitioning.
1562  std::vector <mj_part_t> num_partitioning_in_current_dim;
1563 
1564  //number of parts that will be obtained at the end of this partitioning.
1565  //future_num_part_in_parts is as the size of current number of parts.
1566  //holds how many more parts each should be divided in the further
1567  //iterations. this will be used to calculate num_partitioning_in_current_dim,
1568  //as the number of parts that the part will be partitioned
1569  //in the current dimension partitioning.
1570 
1571  //next_future_num_parts_in_parts will be as the size of outnumParts,
1572  //and this will hold how many more parts that each output part
1573  //should be divided. this array will also be used to determine the weight ratios
1574  //of the parts.
1575  //swap the arrays to use iteratively..
1576  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1577  future_num_part_in_parts = next_future_num_parts_in_parts;
1578  next_future_num_parts_in_parts = tmpPartVect;
1579 
1580  //clear next_future_num_parts_in_parts array as
1581  //getPartitionArrays expects it to be empty.
1582  //it also expects num_partitioning_in_current_dim to be empty as well.
1583  next_future_num_parts_in_parts->clear();
1584 
1585 
1586  //returns the total number of output parts for this dimension partitioning.
1587  mj_part_t output_part_count_in_dimension =
1588  this->update_part_num_arrays(
1589  num_partitioning_in_current_dim,
1590  future_num_part_in_parts,
1591  next_future_num_parts_in_parts,
1592  future_num_parts,
1593  current_num_parts,
1594  i,
1595  t1,
1596  t2, num_ranks_per_node);
1597 
1598  //if the number of obtained parts equal to current number of parts,
1599  //skip this dimension. For example, this happens when 1 is given in the input
1600  //part array is given. P=4,5,1,2
1601  if(output_part_count_in_dimension == current_num_parts) {
      // Undo the swap done at the top of this iteration before skipping.
1602  tmpPartVect= future_num_part_in_parts;
1603  future_num_part_in_parts = next_future_num_parts_in_parts;
1604  next_future_num_parts_in_parts = tmpPartVect;
1605  continue;
1606  }
1607 
1608  //convert i to string to be used for debugging purposes.
1609  std::string istring = Teuchos::toString<int>(i);
1610 
1611  //alloc Memory to point the indices
1612  //of the parts in the permutation array.
1613  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1614 
1615  //the index where in the outtotalCounts will be written.
1616  mj_part_t output_part_index = 0;
1617  //whatever is written to outTotalCounts will be added with previousEnd
1618  //so that the points will be shifted.
      // NOTE(review): declared as mj_part_t but later assigned from
      // new_part_xadj, which holds mj_lno_t values.
1619  mj_part_t output_coordinate_end_index = 0;
1620 
1621  mj_part_t current_work_part = 0;
      // Parts are processed one at a time here; the std::min against
      // max_concurrent_part_calculation further down is commented out.
1622  mj_part_t current_concurrent_num_parts = 1;
1623 
1624  mj_part_t obtained_part_index = 0;
1625 
1626  //get the coordinate axis along which the partitioning will be done.
1627  int coordInd = i % this->coord_dim;
1628  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1629 
1630 
1631  //run for all available parts.
1632  for (; current_work_part < current_num_parts;
1633  current_work_part += current_concurrent_num_parts) {
1634 
1635 
1636  //current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1637  //this->max_concurrent_part_calculation);
1638 
1639  mj_part_t actual_work_part_count = 0;
1640  //initialization for 1D partitioning.
1641  //get the min and max coordinates of each part
1642  //together with the part weights of each part.
1643  for (int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1644  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1645 
1646  //if this part wont be partitioned any further
1647  //dont do any work for this part.
1648  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1649  continue;
1650  }
1651  ++actual_work_part_count;
      // part_xadj stores the end offset of each part; a part begins where
      // the previous one ends (0 for the first part).
1652  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1653  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
1654  0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
1655 
1656 /*
1657  std::cout << "\n\ni:" << i << " j:" << current_work_part + kk
1658  << " coordinate_begin_index:" << coordinate_begin_index
1659  << " coordinate_end_index:" << coordinate_end_index
1660  << " total:" << coordinate_end_index - coordinate_begin_index << "\n\n";
1661 */
1662 
1663 
      // Longest-dimension mode: compute the min/max of this part in every
      // dimension, sort the dimensions by range, and take the last entry of
      // the sorted array as the axis to cut. This overrides the
      // i % coord_dim choice made above.
1664  if (partition_along_longest_dim) {
1665 
1666  mj_scalar_t best_weight_coord = 0;
1667  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1668  mj_scalar_t best_min_coord = 0;
1669  mj_scalar_t best_max_coord = 0;
1670  //MD:same for all coordinates, but I will still use this for now.
1671 
1672  this->mj_get_local_min_max_coord_totW(
1673  coordinate_begin_index,
1674  coordinate_end_index,
1675  this->coordinate_permutations,
1676  this->mj_coordinates[coord_traverse_ind],
1677  best_min_coord, //min coordinate
1678  best_max_coord, //max coordinate
1679  best_weight_coord //total weight);
1680  );
1681 
1682  coord_dim_mins[coord_traverse_ind] = best_min_coord;
1683  coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1684  mj_scalar_t best_range = best_max_coord - best_min_coord;
1685  coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1686  coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1687  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1688  }
1689 
1690 
1691  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1692  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1693 
1694 /*
1695  std::cout << "\n\n";
1696  for (int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1697  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1698  << " range:" << p_coord_dimension_range_sorted[coord_traverse_ind].val << std::endl;
1699  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1700  << " coord_dim_mins:" << coord_dim_mins[p_coord_dimension_range_sorted[coord_traverse_ind].id]<< std::endl;
1701  std::cout << "i:" << p_coord_dimension_range_sorted[coord_traverse_ind].id
1702  << " coord_dim_maxs:" << coord_dim_maxs[p_coord_dimension_range_sorted[coord_traverse_ind].id] << std::endl;
1703  }
1704  std::cout << "\n\n";
1705 */
1706 
1707  mj_current_dim_coords = this->mj_coordinates[coordInd];
1708 
      // The work array is laid out as [min..., max..., total weight...],
      // each section current_concurrent_num_parts long.
1709  this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1710  this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1711  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1712 
1713  }
1714  else{
1715  this->mj_get_local_min_max_coord_totW(
1716  coordinate_begin_index,
1717  coordinate_end_index,
1718  this->coordinate_permutations,
1719  mj_current_dim_coords,
1720  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1721  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1722  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
1723  );
1724  }
1725  }
1726 
1727  //1D partitioning
1728  if (actual_work_part_count > 0) {
1729  //obtain global Min max of the part.
1730  this->mj_get_global_min_max_coord_totW(
1731  current_concurrent_num_parts,
1732  this->process_local_min_max_coord_total_weight,
1733  this->global_min_max_coord_total_weight);
1734 
1735  //represents the total number of cutlines
1736  //whose coordinate should be determined.
1737  mj_part_t total_incomplete_cut_count = 0;
1738 
1739  //Compute weight ratios for parts & cuts:
1740  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1741  // part0 cut0 part1 cut1 part2 cut2 part3
1742  mj_part_t concurrent_part_cut_shift = 0;
1743  mj_part_t concurrent_part_part_shift = 0;
1744 
1745 
1746  for (int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1747  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1748  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1749  current_concurrent_num_parts];
1750  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
1751  2 * current_concurrent_num_parts];
1752 
1753  mj_part_t concurrent_current_part_index = current_work_part + kk;
1754 
1755  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1756 
1757  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1758  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1759  concurrent_part_part_shift;
1760  //shift the usedCutCoordinate array as noCuts.
1761  concurrent_part_cut_shift += partition_count - 1;
1762  //shift the partRatio array as noParts.
1763  concurrent_part_part_shift += partition_count;
1764 
1765  //calculate only if part is not empty,
1766  //and part will be further partitioend.
1767  if(partition_count > 1 && min_coordinate <= max_coordinate){
1768 
1769  //increase allDone by the number of cuts of the current
1770  //part's cut line number.
1771  total_incomplete_cut_count += partition_count - 1;
1772  //set the number of cut lines that should be determined
1773  //for this part.
1774  this->my_incomplete_cut_count[kk] = partition_count - 1;
1775 
1776  // Nonuniform partitioning on the first level, providing
1777  // requested number of parts (num_first_level_parts) and
1778  // requested distribution in parts (first_level_distribution)
1779  if (i == 0 &&
1780  first_level_distribution != NULL &&
1781  num_first_level_parts > 1) {
1782  // Get the target part weights given a desired distribution
1783  this->mj_get_initial_cut_coords_target_weights(
1784  min_coordinate,
1785  max_coordinate,
1786  partition_count - 1,
1787  global_total_weight,
1788  usedCutCoordinate,
1789  current_target_part_weights,
1790  future_num_part_in_parts,
1791  next_future_num_parts_in_parts,
1792  concurrent_current_part_index,
1793  obtained_part_index,
1794  this->num_first_level_parts,
1795  this->first_level_distribution);
1796  }
1797  // Uniform partitioning
1798  else {
1799 
1800  //get the target weights of the parts.
1801  this->mj_get_initial_cut_coords_target_weights(
1802  min_coordinate,
1803  max_coordinate,
1804  partition_count - 1,
1805  global_total_weight,
1806  usedCutCoordinate,
1807  current_target_part_weights,
1808  future_num_part_in_parts,
1809  next_future_num_parts_in_parts,
1810  concurrent_current_part_index,
1811  obtained_part_index);
1812  }
1813 
1814  mj_lno_t coordinate_end_index = this->part_xadj[concurrent_current_part_index];
1815  mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
1816  0 : this->part_xadj[concurrent_current_part_index - 1];
1817 
1818  //get the initial estimated part assignments of the coordinates.
1819  this->set_initial_coordinate_parts(
1820  max_coordinate,
1821  min_coordinate,
1822  concurrent_current_part_index,
1823  coordinate_begin_index, coordinate_end_index,
1824  this->coordinate_permutations,
1825  mj_current_dim_coords,
1826  this->assigned_part_ids,
1827  partition_count);
1828 
1829  }
1830  else {
1831  // e.g., if have fewer coordinates than parts, don't need to do next dim.
1832  this->my_incomplete_cut_count[kk] = 0;
1833  }
1834  obtained_part_index += partition_count;
1835  }
1836 
1837  //used imbalance, it is always 0, as it is difficult to estimate a range.
1838  double used_imbalance = 0;
1839 
1840 
1841  // Determine cut lines for k parts here.
1842  this->mj_1D_part(
1843  mj_current_dim_coords,
1844  used_imbalance,
1845  current_work_part,
1846  current_concurrent_num_parts,
1847  current_cut_coordinates,
1848  total_incomplete_cut_count,
1849  num_partitioning_in_current_dim);
1850  }
1851  else {
1852  obtained_part_index += current_concurrent_num_parts;
1853  }
1854 
1855  //create part chunks
1856  {
1857 
1858  mj_part_t output_array_shift = 0;
1859  mj_part_t cut_shift = 0;
      // NOTE(review): tlr_shift is only incremented in this function and
      // never read.
1860  size_t tlr_shift = 0;
1861  size_t partweight_array_shift = 0;
1862 
1863  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1864  mj_part_t current_concurrent_work_part = current_work_part + kk;
1865  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1866 
1867  //if the part is empty, skip the part.
      // A part counts as empty when its global min exceeds its global max;
      // every sub-part then gets a size of 0.
1868  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1869  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1870 
1871  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1872  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1873  }
1874  cut_shift += num_parts - 1;
1875  tlr_shift += (4 *(num_parts - 1) + 1);
1876  output_array_shift += num_parts;
1877  partweight_array_shift += (2 * (num_parts - 1) + 1);
1878  continue;
1879  }
1880 
1881  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1882  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1883  -1];
1884  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1885  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1886  cut_shift;
1887 
1888  for(int ii = 0; ii < this->num_threads; ++ii){
1889  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1890  }
1891 
1892  if(num_parts > 1){
1893  // Rewrite the indices based on the computed cuts.
1894  this->create_consistent_chunks(
1895  num_parts,
1896  mj_current_dim_coords,
1897  current_concurrent_cut_coordinate,
1898  coordinate_begin,
1899  coordinate_end,
1900  used_local_cut_line_weight_to_left,
1901  this->new_part_xadj + output_part_index + output_array_shift,
1902  coordInd,
1903  partition_along_longest_dim,
1904  p_coord_dimension_range_sorted);
1905  }
1906  else {
1907  //if this part is partitioned into 1 then just copy
1908  //the old values.
1909  mj_lno_t part_size = coordinate_end - coordinate_begin;
1910  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1911  memcpy(this->new_coordinate_permutations + coordinate_begin,
1912  this->coordinate_permutations + coordinate_begin,
1913  part_size * sizeof(mj_lno_t));
1914  }
1915 
1916 
1917 
1918  cut_shift += num_parts - 1;
1919  tlr_shift += (4 *(num_parts - 1) + 1);
1920  output_array_shift += num_parts;
1921  partweight_array_shift += (2 * (num_parts - 1) + 1);
1922  }
1923 
1924  //shift cut coordinates so that all cut coordinates are stored.
1925  //current_cut_coordinates += cutShift;
1926 
1927  //getChunks from coordinates partitioned the parts and
1928  //wrote the indices as if there were a single part.
1929  //now we need to shift the beginning indices.
1930  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1931  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1932  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1933  //shift it by previousCount
1934  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
      // For every odd ii, negate (in place, in the caller's coordinate
      // array) the current-dimension coordinate of the points located
      // between new_part_xadj[output_part_index] and
      // new_part_xadj[output_part_index+ii] in the new permutation.
1935  if (ii % 2 == 1){
1936  mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1937  mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1938 
1939  for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1940  mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1941  //MARKER: FLIPPED ZORDER BELOW
1942  mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1943  }
1944  }
1945  }
1946  //increase the previous count by current end.
1947  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1948  //increase the current out.
1949  output_part_index += num_parts ;
1950  }
1951  }
1952  }
1953  // end of this partitioning dimension
1954 
1955  //set the current num parts for next dim partitioning
1956  current_num_parts = output_part_count_in_dimension;
1957 
1958  //swap the coordinate permutations for the next dimension.
1959  mj_lno_t * tmp = this->coordinate_permutations;
1960  this->coordinate_permutations = this->new_coordinate_permutations;
1961  this->new_coordinate_permutations = tmp;
1962 
      // The offsets computed for this level replace the previous level's.
1963  freeArray<mj_lno_t>(this->part_xadj);
1964  this->part_xadj = this->new_part_xadj;
1965  this->new_part_xadj = NULL;
1966  }
1967 
      // Hand the final permutation back through the in/out argument.
1968  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1969  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1970  }
1971 
1972  // Return output_xadj in CSR format
1973  output_xadj[0] = 0;
1974  for(size_t i = 0; i < this->num_global_parts ; ++i){
1975  output_xadj[i+1] = this->part_xadj[i];
1976  }
1977 
1978  delete future_num_part_in_parts;
1979  delete next_future_num_parts_in_parts;
1980 
1981  //free the extra memory that we allocated.
      // NOTE(review): the four tmp_* arrays were created with new[] above;
      // this assumes freeArray releases with delete[] -- confirm.
1982  freeArray<mj_part_t>(this->assigned_part_ids);
1983  freeArray<mj_gno_t>(this->initial_mj_gnos);
1984  freeArray<mj_gno_t>(this->current_mj_gnos);
1985  freeArray<bool>(tmp_mj_uniform_weights);
1986  freeArray<bool>(tmp_mj_uniform_parts);
1987  freeArray<mj_scalar_t *>(tmp_mj_weights);
1988  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1989 
1990  this->free_work_memory();
1991 
1992 #ifdef HAVE_ZOLTAN2_OMP
1993  //omp_set_num_threads(actual_num_threads);
1994 #endif
1995 }
1996 
2000 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2001  typename mj_part_t>
2003  mj_env(), mj_problemComm(), imbalance_tolerance(0),
2004  part_no_array(NULL), recursion_depth(0), coord_dim(0),
2005  num_weights_per_coord(0), initial_num_loc_coords(0),
2006  initial_num_glob_coords(0),
2007  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
2008  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
2009  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
2010  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
2011  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
2012  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
2013  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
2014  mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
2015  check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
2016  num_threads(1), num_first_level_parts(1), first_level_distribution(NULL),
2017  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
2018  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
2019  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
2020  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
2021  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
2022  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
2023  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
2024  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
2025  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
2026  thread_part_weights(NULL), thread_part_weight_work(NULL),
2027  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
2028  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
2029  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
2030  global_total_part_weight_left_right_closests(NULL),
2031  kept_boxes(),global_box(),
2032  myRank(0), myActualRank(0), divide_to_prime_first(false)
2033 {
2034  this->fEpsilon = std::numeric_limits<float>::epsilon();
2035  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
2036 
2037  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
2038  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
2039 
2040 }
2041 
2042 
2046 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2047  typename mj_part_t>
2048 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
2050 {
2051  return this->global_box;
2052 }
2053 
2057 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2058  typename mj_part_t>
2060  this->mj_keep_part_boxes = true;
2061 }
2062 
2063 
2064 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
2065  * the input. part_no_array takes
2066  * precedence if both are provided.
2067  * Depending on these parameters, total cut/part number,
2068  * maximum part/cut number along a dimension, estimated number of reduceAlls,
2069  * and the number of parts before the last dimension is calculated.
2070  * */
2071 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2072  typename mj_part_t>
2074 
2075  this->total_num_cut = 0; //how many cuts will be totally
2076  this->total_num_part = 1; //how many parts will be totally
2077  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
2078  this->total_dim_num_reduce_all = 0; //estimate on #reduceAlls can be done.
2079  this->last_dim_num_part = 1; //max no of parts that might occur
2080  //during the partition before the
2081  //last partitioning dimension.
2082  this->max_num_cut_along_dim = 0;
2083  this->max_num_total_part_along_dim = 0;
2084 
2085  if (this->part_no_array) {
2086  //if user provided part array, traverse the array and set variables.
2087  for (int i = 0; i < this->recursion_depth; ++i){
2088  this->total_dim_num_reduce_all += this->total_num_part;
2089  this->total_num_part *= this->part_no_array[i];
2090  if(this->part_no_array[i] > this->max_num_part_along_dim) {
2091  this->max_num_part_along_dim = this->part_no_array[i];
2092  }
2093  }
2094  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
2095  this->num_global_parts = this->total_num_part;
2096  }
2097  else {
2098  mj_part_t future_num_parts = this->num_global_parts;
2099 
2100  // If using nonuniform first level partitioning.
2101  // initial value max_num_part_along_dim == num_first_level_parts
2102  if (this->first_level_distribution != NULL &&
2103  this->num_first_level_parts > 1) {
2104  this->max_num_part_along_dim = this->num_first_level_parts;
2105  }
2106 
2107  // We need to calculate the part numbers now, to determine the maximum along the dimensions.
2108  for (int rd = 0; rd < this->recursion_depth; ++rd){
2109 
2110  mj_part_t maxNoPartAlongI = 0;
2111  mj_part_t nfutureNumParts = 0;
2112 
2113  // Nonuniform first level partitioning sets part specificiations for rd == 0 only,
2114  // given requested num of parts and distribution in parts for the first level.
2115  if (rd == 0 &&
2116  this->first_level_distribution != NULL &&
2117  this->num_first_level_parts > 1) {
2118 
2119  maxNoPartAlongI = this->num_first_level_parts;
2120  this->max_num_part_along_dim = this->num_first_level_parts;
2121 
2122  mj_part_t sum_first_level_dist = 0;
2123  mj_part_t max_part = 0;
2124 
2125  // Cumulative sum of distribution of parts and size of largest part
2126  for (int i = 0; i < this->num_first_level_parts; ++i) {
2127 
2128  sum_first_level_dist += this->first_level_distribution[i];
2129 
2130  if (this->first_level_distribution[i] > max_part)
2131  max_part = this->first_level_distribution[i];
2132  }
2133 
2134  // Total parts in largest nonuniform superpart from first level partitioning
2135  nfutureNumParts = this->num_global_parts * max_part / sum_first_level_dist;
2136 
2137  }
2138  // Standard uniform partitioning this level
2139  else {
2140  maxNoPartAlongI = this->get_part_count(future_num_parts,
2141  1.0f / (this->recursion_depth - rd));
2142 
2143  if (maxNoPartAlongI > this->max_num_part_along_dim)
2144  this->max_num_part_along_dim = maxNoPartAlongI;
2145 
2146 
2147  nfutureNumParts = future_num_parts / maxNoPartAlongI;
2148  if (future_num_parts % maxNoPartAlongI){
2149  ++nfutureNumParts;
2150  }
2151  }
2152 
2153  future_num_parts = nfutureNumParts;
2154  }
2155  this->total_num_part = this->num_global_parts;
2156 
2157  if (this->divide_to_prime_first){
2158  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2159  this->last_dim_num_part = this->num_global_parts;
2160  }
2161  else {
2162  //this is the lower bound.
2163 
2164  //estimate reduceAll Count here.
2165  //we find the upperbound instead.
2166  size_t p = 1;
2167 
2168  for (int i = 0; i < this->recursion_depth; ++i){
2169  this->total_dim_num_reduce_all += p;
2170  p *= this->max_num_part_along_dim;
2171  }
2172 
2173  if (p / this->max_num_part_along_dim > this->num_global_parts){
2174  this->last_dim_num_part = this->num_global_parts;
2175  }
2176  else {
2177  this->last_dim_num_part = p / this->max_num_part_along_dim;
2178  }
2179 
2180  }
2181  }
2182 
2183  this->total_num_cut = this->total_num_part - 1;
2184  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2185  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2186  //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2187 
2188  //refine the concurrent part count, if it is given bigger than the maximum possible part count.
2189  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2190  if(this->mj_problemComm->getRank() == 0){
2191  std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2192  ") has been set bigger than maximum amount that can be used." <<
2193  " Setting to:" << this->last_dim_num_part << "." << std::endl;
2194  }
2195  this->max_concurrent_part_calculation = this->last_dim_num_part;
2196  }
2197 
2198 }
2199 /* \brief Tries to determine the part number for current dimension,
2200  * by trying to make the partitioning as square as possible.
2201  * \param num_total_future how many more partitionings are required.
2202  * \param root how many more recursion depth is left.
2203  */
2204 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2205  typename mj_part_t>
2206 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2207  mj_part_t num_total_future,
2208  double root)
2209 {
2210  double fp = pow(num_total_future, root);
2211  mj_part_t ip = mj_part_t (fp);
2212  if (fp - ip < this->fEpsilon * 100){
2213  return ip;
2214  }
2215  else {
2216  return ip + 1;
2217  }
2218 }
2219 
2220 /* \brief Computes how many parts exist after the partitioning along the current dimension.
2221  * It stores into how many parts each current part is split in this dimension in the num_partitioning_in_current_dim vector,
2222  * stores how many future parts each obtained part still has to be split into in the next_future_num_parts_in_parts vector, and,
2223  * if part boxes are kept, initializes the box of every obtained part in output_part_boxes as a copy of its ancestor's box.
2224  *
      * Three strategies are used, depending on the object state:
      *  - part_no_array is set: every current part is split into part_no_array[current_iteration] parts.
      *  - first iteration with a first level distribution (first_level_distribution set and num_first_level_parts > 1):
      *    the single initial part is split into num_first_level_parts parts whose future part counts follow the distribution.
      *  - otherwise: the part count of each part comes from get_part_count(), with the future parts distributed either
      *    by the largest prime factor (divide_to_prime_first) or as uniformly as possible.
      * Invalid configurations print a message and terminate the program with exit(1).
      *
2225  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into. Expected to be empty on entry.
2226  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into. Not read when part_no_array is set.
2227  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into. Expected to be empty on entry.
2228  * \param future_num_parts: input/output. With part_no_array it is divided by the part count of this level;
      *        otherwise it is recomputed as the largest future part count assigned to a single obtained part.
2229  * \param current_num_parts: input, how many parts are there currently.
2230  * \param current_iteration: input, current dimension iteration number.
2231  * \param input_part_boxes: input, if boxes are kept, current boxes.
2232  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
      * \param atomic_part_count: input, granularity of the future part counts: the future part count of a part is divided by it
      *        before being distributed and the results are multiplied by it again. It is reset to 1 (for the rest of the call)
      *        as soon as a part's future part count equals it or is not divisible by it. Not used when part_no_array is set
      *        or in the first level distribution branch.
      * \return the number of parts after this dimension. When part_no_array requests a single part for this level,
      *         current_num_parts is returned and the output vectors are left untouched.
2233  */
2234 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2235  typename mj_part_t>
2236 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2237  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
2238  std::vector<mj_part_t> *future_num_part_in_parts,
2239  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
2240  mj_part_t &future_num_parts,
2241  mj_part_t current_num_parts,
2242  int current_iteration,
2243  RCP<mj_partBoxVector_t> input_part_boxes,
2244  RCP<mj_partBoxVector_t> output_part_boxes,
2245  mj_part_t atomic_part_count) {
2246 
2247  //number of parts that exist once this dimension has been partitioned.
2248  mj_part_t output_num_parts = 0;
2249 
2250  if(this->part_no_array){
2251  //when the part_no_array is provided as input,
2252  //every current part is partitioned into the same number of parts.
2253  //the future_num_part_in_parts vector is not needed in this case.
2254 
2255  mj_part_t p = this->part_no_array[current_iteration];
      //a part count below 1 is invalid: report it and terminate.
2256  if (p < 1){
2257  std::cout << "Current recursive iteration: " << current_iteration
2258  << " part_no_array[" << current_iteration << "] is given as:" << p << std::endl;
2259  exit(1);
2260  }
      //nothing is split at this level, so the part count does not change.
      //note that none of the output vectors is filled on this path.
2261  if (p == 1){
2262  return current_num_parts;
2263  }
2264  // If using part_no_array, ensure compatibility with num_first_level_parts.
2265  if (this->first_level_distribution != NULL &&
2266  current_iteration == 0 &&
2267  p != this->num_first_level_parts)
2268  {
2269  std::cout << "Current recursive iteration: " << current_iteration
2270  << " part_no_array[" << current_iteration << "] is given as: " << p
2271  << " and contradicts num_first_level_parts: " << this->num_first_level_parts << std::endl;
2272  exit(1);
2273  }
2274 
      //every current part is split into p parts.
2275  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2276  num_partitioning_in_current_dim.push_back(p);
2277  }
2278 
2279 /*
2280  std::cout << "\n\nme: " << this->myRank << " current_iteration: " << current_iteration
2281  << " current_num_parts: " << current_num_parts << "\n\n";
2282 
2283  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " << num_partitioning_in_current_dim[0] << "\n\n";
2284 
2285  //set the new value of future_num_parts.
2286 
2287  std::cout << "\n\nfuture_num_parts: " << future_num_parts
2288  << " num_partitioning_in_current_dim[0]: " << num_partitioning_in_current_dim[0]
2289  << " " << future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2290 */
2291 
      //set the new value of future_num_parts (integer division by p) and
      //the number of parts after this level.
2292  future_num_parts /= num_partitioning_in_current_dim[0];
2293  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2294 
2295  if (this->mj_keep_part_boxes){
2296  for (mj_part_t k = 0; k < current_num_parts; ++k){
2297  //initialize each output box as a copy of its ancestor's box.
2298  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2299  output_part_boxes->push_back((*input_part_boxes)[k]);
2300  }
2301  }
2302  }
2303 
2304  //record how many more parts each obtained part will be divided into.
2305  //the value is the same for all parts when part_no_array is provided,
2306  //but the vector is filled anyway so that weights can be calculated from it.
2307  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2308  next_future_num_parts_in_parts->push_back(future_num_parts);
2309  }
2310  }
2311  else {
2312  //if part_no_array is not provided as input,
2313  //future_num_part_in_parts holds how many parts each part should be divided into.
2314  //initially it holds a single number equal to the total number of global parts.
2315 
2316  //recalculate future_num_parts from scratch (as a running maximum),
2317  //since each part might be divided into a different number of parts.
2318  future_num_parts = 1;
2319 
2320  //std::cout << "i:" << i << std::endl;
2321 
2322  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2323  //get how many parts part ii should be divided into in total.
2324  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2325 
2326  //get the ideal number of parts, which is close to the
2327  //(recursion_depth - current_iteration)-th root of future_num_parts_of_part_ii.
2328  mj_part_t num_partitions_in_current_dim =
2329  this->get_part_count(future_num_parts_of_part_ii,
2330  1.0 / (this->recursion_depth - current_iteration) );
2331 
      //sanity check against the maximum part count along a dimension stored in max_num_part_along_dim.
2332  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2333  std::cerr << "ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2334  << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2335  << this->max_num_part_along_dim <<
2336  " this->recursion_depth: " << this->recursion_depth <<
2337  " current_iteration: " << current_iteration <<
2338  " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2339  " might need to fix max part no calculation for largest_prime_first partitioning." <<
2340  std::endl;
2341  exit(1);
2342  }
2343  //add this number to num_partitioning_in_current_dim vector.
2344 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2345 
2346 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2347 
2348  // Update part num arrays when on current_iteration == 0 and
2349  // using nonuniform first level partitioning
2350  // with requested num parts (num_first_level_parts) and
2351  // a requested distribution in parts (first_level_distribution).
      // NOTE(review): unlike the other branches, this one never appends to
      // output_part_boxes, even when mj_keep_part_boxes is set - confirm that
      // this is intended.
2352  if (current_iteration == 0 &&
2353  this->first_level_distribution != NULL &&
2354  this->num_first_level_parts > 1) {
2355 
2356  // Only 1 current part to begin and partitions into
2357  // num_first_level_parts many parts
2358  num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2359 
2360  // The output number of parts from first level partitioning
2361  output_num_parts = this->num_first_level_parts;
2362 
2363  // Remaining parts left to partition for all future levels
      // NOTE(review): this value is overwritten further below, before it is
      // read again.
2364  future_num_parts /= this->num_first_level_parts;
2365 
2366  mj_part_t max_part = 0;
2367  mj_part_t sum_first_level_dist = 0;
2368 
2369  // Cumulative sum of distribution of first level parts
2370  // and size of largest first level part
2371  for (int i = 0; i < this->num_first_level_parts; ++i) {
2372  sum_first_level_dist += this->first_level_distribution[i];
2373 
2374  if (this->first_level_distribution[i] > max_part)
2375  max_part = this->first_level_distribution[i];
2376  }
2377 
2378  // Maximum # of remaining parts left to partition for all future levels
      // (integer arithmetic: the product is formed first, then divided).
2379  future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2380 
2381  // Number of parts remaining left to partition for each future_part
2382  // The sum must exactly equal global_num_parts
      // NOTE(review): the integer division below only yields that sum when
      // every quotient is exact - presumably guaranteed by the caller.
2383  for (int i = 0; i < this->num_first_level_parts; ++i) {
2384 
2385  next_future_num_parts_in_parts->push_back(this->first_level_distribution[i] *
2386  this->num_global_parts / sum_first_level_dist);
2387  }
2388  }
2389  else if (this->divide_to_prime_first) {
2390 
2391  // Add this number to num_partitioning_in_current_dim vector.
2392  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2393 
2394  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2395 
2396  //increase the output number of parts.
2397  output_num_parts += num_partitions_in_current_dim;
2398 
      //atomic_part_count is only applied when it properly divides the
      //future part count of this part; otherwise it is reset to 1. It is a
      //by-value parameter, so the reset stays in effect for the remaining
      //parts processed in this call.
2399  if (future_num_parts_of_part_ii == atomic_part_count ||
2400  future_num_parts_of_part_ii % atomic_part_count != 0) {
2401  atomic_part_count = 1;
2402  }
2403 
2404  largest_prime_factor =
2405  this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2406 
2407  // We divide to num_partitions_in_current_dim. But we adjust the weights
2408  // based on largest prime/ if num_partitions_in_current_dim = 2,
2409  // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2410  // if the largest prime is less than part count, we use the part count
2411  // so that we divide uniformly.
2412  if (largest_prime_factor < num_partitions_in_current_dim) {
2413  largest_prime_factor = num_partitions_in_current_dim;
2414  }
2415 
2416  //ideal number of future partitions per unit of the prime factor ('x' in the example above).
2417  mj_part_t ideal_num_future_parts_in_part =
2418  (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2419  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2420  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2421 
2422 /*
2423  std::cout << "\ncurrent num part: " << ii
2424  << " largest_prime_factor: " << largest_prime_factor
2425  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2426 */
2427 
2428  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2429  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2430  mj_part_t my_ideal_primescale = ideal_prime_scale;
2431  //leftover weights. Left side is adjusted to be 3x, right side stays as 2x
2432  if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2433  ++my_ideal_primescale;
2434  }
2435  //scale with 'x';
2436  mj_part_t num_future_parts_for_part_iii =
2437  ideal_num_future_parts_in_part * my_ideal_primescale;
2438 
2439  //if there is a remainder in the part increase the part weight.
2440  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2441  //if not uniform, add 1 for the extra parts.
2442  ++num_future_parts_for_part_iii;
2443  }
2444 
      //the stored future part count is scaled back by atomic_part_count.
2445  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2446 
2447  //if part boxes are stored, initialize the box of the parts as the ancestor.
2448  if (this->mj_keep_part_boxes) {
2449  output_part_boxes->push_back((*input_part_boxes)[ii]);
2450  }
2451 
2452  //keep future_num_parts as the maximum over the obtained parts
      //(compared before the scaling by atomic_part_count).
2453  if (num_future_parts_for_part_iii > future_num_parts)
2454  future_num_parts = num_future_parts_for_part_iii;
2455 
2456  }
2457  }
2458  else {
2459 
2460  // Add this number to num_partitioning_in_current_dim vector.
2461  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2462 
2463  //increase the output number of parts.
2464  output_num_parts += num_partitions_in_current_dim;
2465 
      //same atomic_part_count handling as in the divide_to_prime_first branch.
2466  if (future_num_parts_of_part_ii == atomic_part_count ||
2467  future_num_parts_of_part_ii % atomic_part_count != 0) {
2468  atomic_part_count = 1;
2469  }
2470  //ideal number of future partitions for each part.
2471  mj_part_t ideal_num_future_parts_in_part =
2472  (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2473 
2474  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2475  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2476 
2477  //if there is a remainder in the part increase the part weight.
2478  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2479  //if not uniform, add 1 for the extra parts.
2480  ++num_future_parts_for_part_iii;
2481  }
2482 
      //the stored future part count is scaled back by atomic_part_count.
2483  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2484 
2485  //if part boxes are stored, initialize the box of the parts as the ancestor.
2486  if (this->mj_keep_part_boxes){
2487  output_part_boxes->push_back((*input_part_boxes)[ii]);
2488  }
2489 
2490  //keep future_num_parts as the maximum over the obtained parts.
2491  if (num_future_parts_for_part_iii > future_num_parts)
2492  future_num_parts = num_future_parts_for_part_iii;
2493  }
2494  }
2495  }
2496  }
2497  return output_num_parts;
2498 }
2499 
2500 
2501 /* \brief Allocates and initializes the work memory that will be used by MJ.
2502  *
      * Besides the work arrays, this function replaces the mj_coordinates and
      * mj_weights pointers with freshly allocated copies of the data they point
      * to, copies initial_mj_gnos into current_mj_gnos and sets the owner of every
      * local coordinate to myActualRank.
      * Most array sizes are derived from max_num_cut_along_dim,
      * max_num_part_along_dim, max_num_total_part_along_dim,
      * max_concurrent_part_calculation and num_threads, so these members have to
      * be set before this function is called.
2503  * */
2504 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2505  typename mj_part_t>
2506 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2507 
2508  //points to the process that initially owns the coordinate (allocated at the end of this function).
2509  this->owner_of_coordinate = NULL;
2510 
2511  //Throughout the partitioning execution,
2512  //instead of moving the coordinates, a permutation array is kept for the parts.
2513  //coordinate_permutations holds the current permutation.
2514  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2515  //initial configuration: the identity permutation, entry i is set to i.
2516 #ifdef HAVE_ZOLTAN2_OMP
2517 #pragma omp parallel for
2518 #endif
2519  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2520  this->coordinate_permutations[i] = i;
2521  }
2522 
2523  //new_coordinate_permutations is a second permutation array of the same size; it is left uninitialized here.
2524  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2525 
      //the part ids assigned to the coordinates; only allocated when there are local coordinates.
2526  this->assigned_part_ids = NULL;
2527  if(this->num_local_coords > 0){
2528  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2529  }
2530 
2531  //the single initial part starts at index 0 and ends at num_local_coords.
2532  //part_xadj holds the end points in the coordinate_permutations array
2533  //for each part. Initially sized 1, and the single element is set to num_local_coords.
2534  this->part_xadj = allocMemory<mj_lno_t>(1);
2535  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
2536  //the end points of the output; this is allocated later.
2537  this->new_part_xadj = NULL;
2538 
2539  // only store this much if cuts are needed to be stored.
2540  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
2541 
2542 
      //cut coordinates: one entry per cut line of each concurrently processed part.
2543  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2544 
      //two entries per thread; filled in mj_get_local_min_max_coord_totW with the
      //per-thread minimum (first num_threads entries) and maximum (last num_threads entries) coordinates.
2545  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2546 
2547  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage a process should put to the left side of each cut line
2548  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage each thread of the process should put to the left side of each cut line
2549  //distribute_points_on_cut_lines = false;
      //these arrays (and the two rectilinear cut weight arrays) are only allocated
      //when points on a cut line may be distributed to both sides of it.
2550  if(this->distribute_points_on_cut_lines){
2551  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2552  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2553  for(int i = 0; i < this->num_threads; ++i){
2554  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2555  }
2556  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2557  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2558  }
2559 
2560 
2561  // work array to manipulate the coordinates of the cut lines in different iterations.
2562  //necessary because the previous cut line information is used for determining
2563  //the next cut line information. therefore, the cut work array cannot be updated
2564  //until all cut lines are determined.
2565  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2566  this->max_concurrent_part_calculation);
2567 
2568 
2569  //cumulative part weight array:
2570  this->target_part_weights = allocMemory<mj_scalar_t>(
2571  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2572  // the weights are accumulated from left to right.
2573 
2574  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2575  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2576  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2577  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2578 
2579  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2580  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2581 
2582  //is_cut_line_determined records whether a cut line has been determined already.
2583  //If a cut line is already determined, the next iterations will skip this cut line.
2584  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2585  //my_incomplete_cut_count holds the number of cut lines that have not been finalized for each part.
2586  //when more than one part is processed concurrently, my_incomplete_cut_count[x]==0 means that no work is done for part x.
2587  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2588  //local part weights of each thread.
2589  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2590  //the work (manipulation) array for the part weights; only the array of
      //per-thread pointers is allocated here, its entries are not set in this function.
2591  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2592 
2593  //thread_cut_left_closest_point holds the closest coordinate to a cut line from the left (for each thread).
2594  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2595  //thread_cut_right_closest_point holds the closest coordinate to a cut line from the right (for each thread).
2596  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2597 
2598  //to store how many points in each part a thread has.
2599  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2600 
      //allocate the per-thread arrays.
2601  for(int i = 0; i < this->num_threads; ++i){
2602  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2603  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2604  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2605  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2606  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2607  }
2608  //for faster communication, a concatenation of
2609  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2610  //leftClosest distances sized P-1, since P-1 cut lines
2611  //rightClosest distances sized P-1, since P-1 cut lines.
2612  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2613  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2614 
2615 
      //copy the coordinates into newly allocated arrays and let mj_coordinates
      //point to the copy from now on.
2616  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2617  for (int i=0; i < this->coord_dim; i++){
2618  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2619 #ifdef HAVE_ZOLTAN2_OMP
2620 #pragma omp parallel for
2621 #endif
2622  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2623  coord[i][j] = this->mj_coordinates[i][j];
2624  }
2625  this->mj_coordinates = coord;
2626 
2627 
      //same for the weights. At least one pointer slot is allocated even when
      //num_weights_per_coord is 0; the slot then stays NULL.
2628  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2629  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2630 
2631  for (int i=0; i < criteria_dim; i++){
2632  weights[i] = NULL;
2633  }
2634  for (int i=0; i < this->num_weights_per_coord; i++){
2635  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2636 #ifdef HAVE_ZOLTAN2_OMP
2637 #pragma omp parallel for
2638 #endif
2639  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2640  weights[i][j] = this->mj_weights[i][j];
2641 
2642  }
2643  this->mj_weights = weights;
      //working copy of the global ids of the local coordinates.
2644  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2645 #ifdef HAVE_ZOLTAN2_OMP
2646 #pragma omp parallel for
2647 #endif
2648  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2649  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2650 
      //initially every local coordinate is owned by this process.
2651  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2652 
2653 #ifdef HAVE_ZOLTAN2_OMP
2654 #pragma omp parallel for
2655 #endif
2656  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2657  this->owner_of_coordinate[j] = this->myActualRank;
2658 }
2659 
2660 /* \brief compute the global bounding box
2661  */
2662 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2663  typename mj_part_t>
2664 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2665 {
2666  //local min coords
2667  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2668  //global min coords
2669  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2670  //local max coords
2671  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2672  //global max coords
2673  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2674 
2675  for (int i = 0; i < this->coord_dim; ++i){
2676  mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2677  mj_scalar_t localMax = -localMin;
2678  if (localMax > 0) localMax = 0;
2679 
2680 
2681  for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2682  if (this->mj_coordinates[i][j] < localMin){
2683  localMin = this->mj_coordinates[i][j];
2684  }
2685  if (this->mj_coordinates[i][j] > localMax){
2686  localMax = this->mj_coordinates[i][j];
2687  }
2688  }
2689  //std::cout << " localMin:" << localMin << std::endl;
2690  //std::cout << " localMax:" << localMax << std::endl;
2691  mins[i] = localMin;
2692  maxs[i] = localMax;
2693 
2694  }
2695  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2696  this->coord_dim, mins, gmins
2697  );
2698 
2699 
2700  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2701  this->coord_dim, maxs, gmaxs
2702  );
2703 
2704 
2705 
2706  //create single box with all areas.
2707  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2708  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2709  freeArray<mj_scalar_t>(mins);
2710  freeArray<mj_scalar_t>(gmins);
2711  freeArray<mj_scalar_t>(maxs);
2712  freeArray<mj_scalar_t>(gmaxs);
2713 }
2714 
2715 /* \brief for part communication we keep track of the box boundaries.
2716  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2717  * This function initializes a single box with all global min and max coordinates.
2718  * \param initial_partitioning_boxes the input and output vector for boxes.
2719  */
2720 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2721  typename mj_part_t>
2722 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2723  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2724 )
2725 {
2726  mj_partBox_t tmp_box(*global_box);
2727  initial_partitioning_boxes->push_back(tmp_box);
2728 }
2729 
2740 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2741  typename mj_part_t>
2742 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2743  mj_lno_t coordinate_begin_index,
2744  mj_lno_t coordinate_end_index,
2745  mj_lno_t *mj_current_coordinate_permutations,
2746  mj_scalar_t *mj_current_dim_coords,
2747  mj_scalar_t &min_coordinate,
2748  mj_scalar_t &max_coordinate,
2749  mj_scalar_t &total_weight){
2750 
2751  //if the part is empty.
2752  //set the min and max coordinates as reverse.
2753  if(coordinate_begin_index >= coordinate_end_index)
2754  {
2755  min_coordinate = this->maxScalar_t;
2756  max_coordinate = this->minScalar_t;
2757  total_weight = 0;
2758  }
2759  else {
2760  mj_scalar_t my_total_weight = 0;
2761 #ifdef HAVE_ZOLTAN2_OMP
2762 #pragma omp parallel num_threads(this->num_threads)
2763 #endif
2764  {
2765  //if uniform weights are used, then weight is equal to count.
2766  if (this->mj_uniform_weights[0]) {
2767 #ifdef HAVE_ZOLTAN2_OMP
2768 #pragma omp single
2769 #endif
2770  {
2771  my_total_weight = coordinate_end_index - coordinate_begin_index;
2772  }
2773 
2774  }
2775  else {
2776  //if not uniform, then weights are reducted from threads.
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp for reduction(+:my_total_weight)
2779 #endif
2780  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781  int i = mj_current_coordinate_permutations[ii];
2782  my_total_weight += this->mj_weights[0][i];
2783  }
2784  }
2785 
2786  int my_thread_id = 0;
2787 #ifdef HAVE_ZOLTAN2_OMP
2788  my_thread_id = omp_get_thread_num();
2789 #endif
2790  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2791  my_thread_min_coord=my_thread_max_coord
2792  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2793 
2794 
2795 #ifdef HAVE_ZOLTAN2_OMP
2796 #pragma omp for
2797 #endif
2798  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2799  int i = mj_current_coordinate_permutations[j];
2800  if(mj_current_dim_coords[i] > my_thread_max_coord)
2801  my_thread_max_coord = mj_current_dim_coords[i];
2802  if(mj_current_dim_coords[i] < my_thread_min_coord)
2803  my_thread_min_coord = mj_current_dim_coords[i];
2804  }
2805  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2806  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2807 
2808 #ifdef HAVE_ZOLTAN2_OMP
2809 //we need a barrier here, because max_min_array might not be filled by some of the threads.
2810 #pragma omp barrier
2811 #pragma omp single nowait
2812 #endif
2813  {
2814  min_coordinate = this->max_min_coords[0];
2815  for(int i = 1; i < this->num_threads; ++i){
2816  if(this->max_min_coords[i] < min_coordinate)
2817  min_coordinate = this->max_min_coords[i];
2818  }
2819  }
2820 
2821 #ifdef HAVE_ZOLTAN2_OMP
2822 #pragma omp single nowait
2823 #endif
2824  {
2825  max_coordinate = this->max_min_coords[this->num_threads];
2826  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2827  if(this->max_min_coords[i] > max_coordinate)
2828  max_coordinate = this->max_min_coords[i];
2829  }
2830  }
2831  }
2832  total_weight = my_total_weight;
2833  }
2834 }
2835 
2836 
/* \brief Combines the local min coordinate, max coordinate and total weight
 * values of the concurrently processed parts over all processes of this->comm.
 *
 * Both arrays hold 3 * current_concurrent_num_parts entries. According to the
 * comment in the body, the first third holds the minima, the second third the
 * maxima and the last third the total weights.
 * When the communicator has a single process, the local values are simply
 * copied to the global array.
 *
 * \param current_concurrent_num_parts number of parts that are processed concurrently.
 * \param local_min_max_total input, the local values.
 * \param global_min_max_total output, the values combined over all processes.
 *
 * NOTE(review): the line that declares reductionOp (i.e. its type) is not
 * visible in this listing - the embedded line numbers jump from 2854 to 2856.
 * Restore it from the original file before compiling.
 */
2844 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2845  typename mj_part_t>
2846 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2847  mj_part_t current_concurrent_num_parts,
2848  mj_scalar_t *local_min_max_total,
2849  mj_scalar_t *global_min_max_total){
2850 
2851  //reduce min for the first current_concurrent_num_parts elements, reduce max for the next
2852  //current_concurrent_num_parts elements,
2853  //reduce sum for the last current_concurrent_num_parts elements.
2854  if(this->comm->getSize() > 1){
      //the three constructor arguments are the lengths of the three sections.
2856  reductionOp(
2857  current_concurrent_num_parts,
2858  current_concurrent_num_parts,
2859  current_concurrent_num_parts);
2860  try{
2861  reduceAll<int, mj_scalar_t>(
2862  *(this->comm),
2863  reductionOp,
2864  3 * current_concurrent_num_parts,
2865  local_min_max_total,
2866  global_min_max_total);
2867  }
2868  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2869  }
2870  else {
      //single process: the local values already are the global values.
2871  mj_part_t s = 3 * current_concurrent_num_parts;
2872  for (mj_part_t i = 0; i < s; ++i){
2873  global_min_max_total[i] = local_min_max_total[i];
2874  }
2875  }
2876 }
2877 
2878 
2879 
2907 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2908  typename mj_part_t>
2909 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2910  mj_scalar_t min_coord,
2911  mj_scalar_t max_coord,
2912  mj_part_t num_cuts/*p-1*/ ,
2913  mj_scalar_t global_weight,
2914  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2915  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2916 
2917  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
2918  std::vector <mj_part_t> *next_future_num_parts_in_parts,
2919  mj_part_t concurrent_current_part,
2920  mj_part_t obtained_part_index,
2921  mj_part_t num_target_first_level_parts,
2922  const mj_part_t *target_first_level_dist) {
2923 
2924  mj_scalar_t coord_range = max_coord - min_coord;
2925 
2926  // Uniform target weights
2927  if (num_target_first_level_parts <= 1 &&
2928  this->mj_uniform_parts[0]) {
2929  {
2930  mj_part_t cumulative = 0;
2931 
2932  // How many total future parts the part will be partitioned into.
2933  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2934 
2935  // How much each part should weigh in ideal case.
2936  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2937 
2938  for (mj_part_t i = 0; i < num_cuts; ++i) {
2939  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2940 
2941  // Set target part weight.
2942  current_target_part_weights[i] = cumulative * unit_part_weight;
2943 
2944  // Set initial cut coordinate.
2945  initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2946  }
2947 
2948  current_target_part_weights[num_cuts] = global_weight;
2949  }
2950 
2951  // Round the target part weights.
2952  if (this->mj_uniform_weights[0]) { // Repeated if???
2953  for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2954  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2955  }
2956  }
2957  }
2958  // Nonuniform target weights for first level of partitioning
2959  else if(num_target_first_level_parts > 1 &&
2960  target_first_level_dist != NULL) {
2961  {
2962  // Running sum of the total weight
2963  mj_part_t cumulative = 0.0;
2964 
2965  // Sum of entries in the first level partition distribution vector
2966  mj_scalar_t sum_target_first_level_dist = 0.0;
2967 
2968  for (int i = 0; i < num_target_first_level_parts; ++i) {
2969  sum_target_first_level_dist += target_first_level_dist[i];
2970  }
2971 
2972  for (mj_part_t i = 0; i < num_cuts; ++i) {
2973  cumulative += global_weight * target_first_level_dist[i] / sum_target_first_level_dist;
2974 
2975  // Set target part weight.
2976  current_target_part_weights[i] = cumulative;
2977 
2978  // Set initial cut coordinate.
2979  initial_cut_coords[i] = min_coord + (coord_range * cumulative) / global_weight;
2980  }
2981 
2982  current_target_part_weights[num_cuts] = global_weight;
2983  }
2984 
2985  //round the target part weights.
2986  for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2987  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2988  }
2989  }
2990  else {
2991  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
2992  exit(1);
2993  }
2994 }
2995 
2996 
3009 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3010  typename mj_part_t>
3011 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
3012  mj_scalar_t &max_coordinate,
3013  mj_scalar_t &min_coordinate,
3014  mj_part_t &/* concurrent_current_part_index */,
3015  mj_lno_t coordinate_begin_index,
3016  mj_lno_t coordinate_end_index,
3017  mj_lno_t *mj_current_coordinate_permutations,
3018  mj_scalar_t *mj_current_dim_coords,
3019  mj_part_t *mj_part_ids,
3020  mj_part_t &partition_count
3021 ){
3022  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3023 
3024  //if there is single point, or if all points are along a line.
3025  //set initial part to 0 for all.
3026  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
3027 #ifdef HAVE_ZOLTAN2_OMP
3028 #pragma omp parallel for
3029 #endif
3030  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3031  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
3032  }
3033  }
3034  else{
3035 
3036  //otherwise estimate an initial part for each coordinate.
3037  //assuming uniform distribution of points.
3038  mj_scalar_t slice = coordinate_range / partition_count;
3039 
3040 #ifdef HAVE_ZOLTAN2_OMP
3041 #pragma omp parallel for
3042 #endif
3043  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3044 
3045  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3046  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3047  mj_part_ids[iii] = 2 * pp;
3048  }
3049  }
3050 }
3051 
3052 
3063 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3064  typename mj_part_t>
3065 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
3066  mj_scalar_t *mj_current_dim_coords,
3067  double used_imbalance_tolerance,
3068  mj_part_t current_work_part,
3069  mj_part_t current_concurrent_num_parts,
3070  mj_scalar_t *current_cut_coordinates,
3071  mj_part_t total_incomplete_cut_count,
3072  std::vector <mj_part_t> &num_partitioning_in_current_dim
3073 ){
3074 
3075 
3076  mj_part_t rectilinear_cut_count = 0;
3077  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
3078 
3080  *reductionOp = NULL;
3081  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3082  <mj_part_t, mj_scalar_t>(
3083  &num_partitioning_in_current_dim ,
3084  current_work_part ,
3085  current_concurrent_num_parts);
3086 
3087  size_t total_reduction_size = 0;
3088 #ifdef HAVE_ZOLTAN2_OMP
3089 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
3090 #endif
3091  {
3092  int me = 0;
3093 #ifdef HAVE_ZOLTAN2_OMP
3094  me = omp_get_thread_num();
3095 #endif
3096  double *my_thread_part_weights = this->thread_part_weights[me];
3097  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
3098  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
3099 
3100 #ifdef HAVE_ZOLTAN2_OMP
3101 #pragma omp single
3102 #endif
3103  {
3104  //initialize the lower and upper bounds of the cuts.
3105  mj_part_t next = 0;
3106  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3107 
3108  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
3109  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3110  total_reduction_size += (4 * num_cut_in_dim + 1);
3111 
3112  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
3113  this->is_cut_line_determined[next] = false;
3114  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
3115  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
3116 
3117  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
3118  this->cut_lower_bound_weights[next] = 0;
3119 
3120  if(this->distribute_points_on_cut_lines){
3121  this->process_cut_line_weight_to_put_left[next] = 0;
3122  }
3123  ++next;
3124  }
3125  }
3126  }
3127 
3128  //no need to have barrier here.
3129  //pragma omp single have implicit barrier.
3130 
3131  int iteration = 0;
3132  while (total_incomplete_cut_count != 0){
3133  iteration += 1;
3134  mj_part_t concurrent_cut_shifts = 0;
3135  size_t total_part_shift = 0;
3136 
3137  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3138  mj_part_t num_parts = -1;
3139  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3140 
3141  mj_part_t num_cuts = num_parts - 1;
3142  size_t total_part_count = num_parts + size_t (num_cuts) ;
3143  if (this->my_incomplete_cut_count[kk] > 0){
3144 
3145  //although isDone shared, currentDone is private and same for all.
3146  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
3147  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
3148  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
3149  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
3150 
3151  mj_part_t conccurent_current_part = current_work_part + kk;
3152  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
3153  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
3154  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
3155 
3156  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
3157  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3158 
3159  // compute part weights using existing cuts
3160  this->mj_1D_part_get_thread_part_weights(
3161  total_part_count,
3162  num_cuts,
3163  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
3164  min_coord,//globalMinMaxTotal[kk]//minScalar,
3165  coordinate_begin_index,
3166  coordinate_end_index,
3167  mj_current_dim_coords,
3168  temp_current_cut_coords,
3169  current_cut_status,
3170  my_current_part_weights,
3171  my_current_left_closest,
3172  my_current_right_closest);
3173 
3174  }
3175 
3176  concurrent_cut_shifts += num_cuts;
3177  total_part_shift += total_part_count;
3178  }
3179 
3180  //sum up the results of threads
3181  this->mj_accumulate_thread_results(
3182  num_partitioning_in_current_dim,
3183  current_work_part,
3184  current_concurrent_num_parts);
3185 
3186  //now sum up the results of mpi processors.
3187 #ifdef HAVE_ZOLTAN2_OMP
3188 #pragma omp single
3189 #endif
3190  {
3191  if(this->comm->getSize() > 1){
3192  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3193  total_reduction_size,
3194  this->total_part_weight_left_right_closests,
3195  this->global_total_part_weight_left_right_closests);
3196 
3197  }
3198  else {
3199  memcpy(
3200  this->global_total_part_weight_left_right_closests,
3201  this->total_part_weight_left_right_closests,
3202  total_reduction_size * sizeof(mj_scalar_t));
3203  }
3204  }
3205 
3206  //how much cut will be shifted for the next part in the concurrent part calculation.
3207  mj_part_t cut_shift = 0;
3208 
3209  //how much the concantaneted array will be shifted for the next part in concurrent part calculation.
3210  size_t tlr_shift = 0;
3211  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3212  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3213  mj_part_t num_cuts = num_parts - 1;
3214  size_t num_total_part = num_parts + size_t (num_cuts) ;
3215 
3216  //if the cuts of this cut has already been completed.
3217  //nothing to do for this part.
3218  //just update the shift amount and proceed.
3219  if (this->my_incomplete_cut_count[kk] == 0) {
3220  cut_shift += num_cuts;
3221  tlr_shift += (num_total_part + 2 * num_cuts);
3222  continue;
3223  }
3224 
3225  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
3226  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
3227  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
3228  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
3229  mj_scalar_t *current_global_part_weights = current_global_tlr;
3230  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
3231 
3232  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
3233  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
3234 
3235  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
3236  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3237  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
3238  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
3239  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
3240  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
3241  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
3242 
3243  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
3244 
3245  // Now compute the new cut coordinates.
3246  this->mj_get_new_cut_coordinates(
3247  num_total_part,
3248  num_cuts,
3249  max_coordinate,
3250  min_coordinate,
3251  global_total_weight,
3252  used_imbalance_tolerance,
3253  current_global_part_weights,
3254  current_local_part_weights,
3255  current_part_target_weights,
3256  current_cut_line_determined,
3257  temp_cut_coords + cut_shift,
3258  current_cut_upper_bounds,
3259  current_cut_lower_bounds,
3260  current_global_left_closest_points,
3261  current_global_right_closest_points,
3262  current_cut_lower_bound_weights,
3263  current_cut_upper_weights,
3264  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
3265  current_part_cut_line_weight_to_put_left,
3266  &rectilinear_cut_count,
3267  this->my_incomplete_cut_count[kk]);
3268 
3269  cut_shift += num_cuts;
3270  tlr_shift += (num_total_part + 2 * num_cuts);
3271  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3272 #ifdef HAVE_ZOLTAN2_OMP
3273 #pragma omp single
3274 #endif
3275  {
3276  total_incomplete_cut_count -= iteration_complete_cut_count;
3277  }
3278 
3279  }
3280  { //This unnecessary bracket works around a compiler bug in NVCC when compiling with OpenMP enabled
3281 #ifdef HAVE_ZOLTAN2_OMP
3282 #pragma omp barrier
3283 #pragma omp single
3284 #endif
3285  {
3286  //swap the cut coordinates for next iteration.
3287  mj_scalar_t *t = temp_cut_coords;
3288  temp_cut_coords = this->cut_coordinates_work_array;
3289  this->cut_coordinates_work_array = t;
3290  }
3291  }
3292  }
3293 
3294  //if (myRank == 0)
3295  //std::cout << "iteration:" << iteration << " partition:" << num_partitioning_in_current_dim[current_work_part] << std::endl;
3296  // Needed only if keep_cuts; otherwise can simply swap array pointers
3297  // cutCoordinates and cutCoordinatesWork.
3298  // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3299  // computed cuts must be in cutCoordinates.
3300  if (current_cut_coordinates != temp_cut_coords){
3301 #ifdef HAVE_ZOLTAN2_OMP
3302 #pragma omp single
3303 #endif
3304  {
3305  mj_part_t next = 0;
3306  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3307  mj_part_t num_parts = -1;
3308  num_parts = num_partitioning_in_current_dim[current_work_part + i];
3309  mj_part_t num_cuts = num_parts - 1;
3310 
3311  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3312  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3313  }
3314  next += num_cuts;
3315  }
3316  }
3317 
3318 #ifdef HAVE_ZOLTAN2_OMP
3319 #pragma omp single
3320 #endif
3321  {
3322  this->cut_coordinates_work_array = temp_cut_coords;
3323  }
3324  }
3325  }
3326  delete reductionOp;
3327 }
3328 
3329 
3349 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3350  typename mj_part_t>
3351 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3352  size_t total_part_count,
3353  mj_part_t num_cuts,
3354  mj_scalar_t max_coord,
3355  mj_scalar_t min_coord,
3356  mj_lno_t coordinate_begin_index,
3357  mj_lno_t coordinate_end_index,
3358  mj_scalar_t *mj_current_dim_coords,
3359  mj_scalar_t *temp_current_cut_coords,
3360  bool * /* current_cut_status */,
3361  double *my_current_part_weights,
3362  mj_scalar_t *my_current_left_closest,
3363  mj_scalar_t *my_current_right_closest){
3364 
3365  // initializations for part weights, left/right closest
3366  for (size_t i = 0; i < total_part_count; ++i){
3367  my_current_part_weights[i] = 0;
3368  }
3369 
3370  //initialize the left and right closest coordinates
3371  //to their max value.
3372  for(mj_part_t i = 0; i < num_cuts; ++i){
3373  my_current_left_closest[i] = min_coord - 1;
3374  my_current_right_closest[i] = max_coord + 1;
3375  }
3376  //mj_lno_t comparison_count = 0;
3377  mj_scalar_t minus_EPSILON = -this->sEpsilon;
3378 #ifdef HAVE_ZOLTAN2_OMP
3379  //no need for the barrier as all threads uses their local memories.
3380  //dont change the static scheduling here, as it is assumed when the new
3381  //partitions are created later.
3382 #pragma omp for
3383 #endif
3384  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3385  int i = this->coordinate_permutations[ii];
3386 
3387  //the accesses to assigned_part_ids are thread safe
3388  //since each coordinate is assigned to only a single thread.
3389  mj_part_t j = this->assigned_part_ids[i] / 2;
3390 
3391  if(j >= num_cuts){
3392  j = num_cuts - 1;
3393  }
3394 
3395  mj_part_t lower_cut_index = 0;
3396  mj_part_t upper_cut_index = num_cuts - 1;
3397 
3398  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3399  bool is_inserted = false;
3400  bool is_on_left_of_cut = false;
3401  bool is_on_right_of_cut = false;
3402  mj_part_t last_compared_part = -1;
3403 
3404  mj_scalar_t coord = mj_current_dim_coords[i];
3405 
3406  while(upper_cut_index >= lower_cut_index)
3407  {
3408  //comparison_count++;
3409  last_compared_part = -1;
3410  is_on_left_of_cut = false;
3411  is_on_right_of_cut = false;
3412  mj_scalar_t cut = temp_current_cut_coords[j];
3413  mj_scalar_t distance_to_cut = coord - cut;
3414  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
3415 
3416  //if it is on the line.
3417  if(abs_distance_to_cut < this->sEpsilon){
3418 
3419  my_current_part_weights[j * 2 + 1] += w;
3420  this->assigned_part_ids[i] = j * 2 + 1;
3421 
3422  //assign left and right closest point to cut as the point is on the cut.
3423  my_current_left_closest[j] = coord;
3424  my_current_right_closest[j] = coord;
3425  //now we need to check if there are other cuts on the same cut coordinate.
3426  //if there are, then we add the weight of the cut to all cuts in the same coordinate.
3427  mj_part_t kk = j + 1;
3428  while(kk < num_cuts){
3429  // Needed when cuts shared the same position
3430  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3431  if(distance_to_cut < this->sEpsilon){
3432  my_current_part_weights[2 * kk + 1] += w;
3433  my_current_left_closest[kk] = coord;
3434  my_current_right_closest[kk] = coord;
3435  kk++;
3436  }
3437  else{
3438  //cut is far away.
3439  //just check the left closest point for the next cut.
3440  if(coord - my_current_left_closest[kk] > this->sEpsilon){
3441  my_current_left_closest[kk] = coord;
3442  }
3443  break;
3444  }
3445  }
3446 
3447 
3448  kk = j - 1;
3449  //continue checking for the cuts on the left if they share the same coordinate.
3450  while(kk >= 0){
3451  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3452  if(distance_to_cut < this->sEpsilon){
3453  my_current_part_weights[2 * kk + 1] += w;
3454  //try to write the partId as the leftmost cut.
3455  this->assigned_part_ids[i] = kk * 2 + 1;
3456  my_current_left_closest[kk] = coord;
3457  my_current_right_closest[kk] = coord;
3458  kk--;
3459  }
3460  else{
3461  //if cut is far away on the left of the point.
3462  //then just compare for right closest point.
3463  if(my_current_right_closest[kk] - coord > this->sEpsilon){
3464  my_current_right_closest[kk] = coord;
3465  }
3466  break;
3467  }
3468  }
3469 
3470  is_inserted = true;
3471  break;
3472  }
3473  else {
3474  //if point is on the left of the cut.
3475  if (distance_to_cut < 0) {
3476  bool _break = false;
3477  if(j > 0){
3478  //check distance to the cut on the left the current cut compared.
3479  //if point is on the right, then we find the part of the point.
3480  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3481  if(distance_to_next_cut > this->sEpsilon){
3482  _break = true;
3483  }
3484  }
3485  //if point is not on the right of the next cut, then
3486  //set the upper bound to this cut.
3487  upper_cut_index = j - 1;
3488  //set the last part, and mark it as on the left of the last part.
3489  is_on_left_of_cut = true;
3490  last_compared_part = j;
3491  if(_break) break;
3492  }
3493  else {
3494  //if point is on the right of the cut.
3495  bool _break = false;
3496  if(j < num_cuts - 1){
3497  //check distance to the cut on the left the current cut compared.
3498  //if point is on the right, then we find the part of the point.
3499  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3500  if(distance_to_next_cut < minus_EPSILON){
3501  _break = true;
3502  }
3503  }
3504 
3505  //if point is not on the left of the next cut, then
3506  //set the upper bound to this cut.
3507  lower_cut_index = j + 1;
3508  //set the last part, and mark it as on the right of the last part.
3509  is_on_right_of_cut = true;
3510  last_compared_part = j;
3511  if(_break) break;
3512  }
3513  }
3514 
3515  j = (upper_cut_index + lower_cut_index) / 2;
3516  }
3517  if(!is_inserted){
3518  if(is_on_right_of_cut){
3519 
3520  //add it to the right of the last compared part.
3521  my_current_part_weights[2 * last_compared_part + 2] += w;
3522  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3523 
3524  //update the right closest point of last compared cut.
3525  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3526  my_current_right_closest[last_compared_part] = coord;
3527  }
3528  //update the left closest point of the cut on the right of the last compared cut.
3529  if(last_compared_part+1 < num_cuts){
3530 
3531  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3532  my_current_left_closest[last_compared_part + 1] = coord;
3533  }
3534  }
3535 
3536  }
3537  else if(is_on_left_of_cut){
3538 
3539  //add it to the left of the last compared part.
3540  my_current_part_weights[2 * last_compared_part] += w;
3541  this->assigned_part_ids[i] = 2 * last_compared_part;
3542 
3543 
3544  //update the left closest point of last compared cut.
3545  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3546  my_current_left_closest[last_compared_part] = coord;
3547  }
3548 
3549  //update the right closest point of the cut on the left of the last compared cut.
3550  if(last_compared_part-1 >= 0){
3551  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3552  my_current_right_closest[last_compared_part -1] = coord;
3553  }
3554  }
3555  }
3556  }
3557  }
3558 
3559  // prefix sum computation.
3560  //we need prefix sum for each part to determine cut positions.
3561  for (size_t i = 1; i < total_part_count; ++i){
3562  // check for cuts sharing the same position; all cuts sharing a position
3563  // have the same weight == total weight for all cuts sharing the position.
3564  // don't want to accumulate that total weight more than once.
3565  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3566  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3567  < this->sEpsilon){
3568  //i % 2 = 0 when part i represents the cut coordinate.
3569  //if it is a cut, and if the next cut also have the same coordinate, then
3570  //dont addup.
3571  my_current_part_weights[i] = my_current_part_weights[i-2];
3572  continue;
3573  }
3574  //otherwise do the prefix sum.
3575  my_current_part_weights[i] += my_current_part_weights[i-1];
3576  }
3577 }
3578 
3579 
3587 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3588  typename mj_part_t>
3589 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3590  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3591  mj_part_t current_work_part,
3592  mj_part_t current_concurrent_num_parts){
3593 
3594 #ifdef HAVE_ZOLTAN2_OMP
3595  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
3596  //using parallel region here reduces the performance because of the cache invalidates.
3597 #pragma omp barrier
3598 #pragma omp single
3599 #endif
3600  {
3601  size_t tlr_array_shift = 0;
3602  mj_part_t cut_shift = 0;
3603 
3604  //iterate for all concurrent parts to find the left and right closest points in the process.
3605  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3606 
3607  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3608  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3609  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3610 
3611  //iterate for cuts in a single part.
3612  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3613  mj_part_t next = tlr_array_shift + ii;
3614  mj_part_t cut_index = cut_shift + ii;
3615  if(this->is_cut_line_determined[cut_index]) continue;
3616  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3617  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3618 
3619  //find the closest points from left and right for the cut in the process.
3620  for (int j = 1; j < this->num_threads; ++j){
3621  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3622  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3623  }
3624  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3625  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3626  }
3627  }
3628  //store the left and right closes points.
3629  this->total_part_weight_left_right_closests[num_total_part_in_part +
3630  next] = left_closest_in_process;
3631  this->total_part_weight_left_right_closests[num_total_part_in_part +
3632  num_cuts_in_part + next] = right_closest_in_process;
3633  }
3634  //set the shift position in the arrays
3635  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3636  cut_shift += num_cuts_in_part;
3637  }
3638 
3639  tlr_array_shift = 0;
3640  cut_shift = 0;
3641  size_t total_part_array_shift = 0;
3642 
3643  //iterate for all concurrent parts to find the total weight in the process.
3644  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3645 
3646  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3647  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3648  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3649 
3650  for(size_t j = 0; j < num_total_part_in_part; ++j){
3651 
3652  mj_part_t cut_ind = j / 2 + cut_shift;
3653 
3654  //need to check j != num_total_part_in_part - 1
3655  // which is same as j/2 != num_cuts_in_part.
3656  //we cannot check it using cut_ind, because of the concurrent part concantanetion.
3657  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
3658  double pwj = 0;
3659  for (int k = 0; k < this->num_threads; ++k){
3660  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3661  }
3662  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3663  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3664  }
3665  cut_shift += num_cuts_in_part;
3666  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3667  total_part_array_shift += num_total_part_in_part;
3668  }
3669  }
3670  //the other threads needs to wait here.
3671  //but we don't need a pragma omp barrier.
3672  //as omp single has already have implicit barrier.
3673 }
3674 
3675 
3685 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3686  typename mj_part_t>
3687 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3688  mj_scalar_t cut_upper_bound,
3689  mj_scalar_t cut_lower_bound,
3690  mj_scalar_t cut_upper_weight,
3691  mj_scalar_t cut_lower_weight,
3692  mj_scalar_t expected_weight,
3693  mj_scalar_t &new_cut_position){
3694 
3695  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3696  new_cut_position = cut_upper_bound; //or lower bound does not matter.
3697  }
3698 
3699 
3700  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3701  new_cut_position = cut_lower_bound;
3702  }
3703 
3704  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3705  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3706  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3707 
3708  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3709  int scale_constant = 20;
3710  int shiftint= int (required_shift * scale_constant);
3711  if (shiftint == 0) shiftint = 1;
3712  required_shift = mj_scalar_t (shiftint) / scale_constant;
3713  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3714 }
3715 
3716 
3727 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3728  typename mj_part_t>
3729 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3730  mj_part_t num_parts,
3731  mj_scalar_t * /* mj_current_dim_coords */,
3732  mj_scalar_t *current_concurrent_cut_coordinate,
3733  mj_lno_t coordinate_begin,
3734  mj_lno_t coordinate_end,
3735  mj_scalar_t *used_local_cut_line_weight_to_left,
3736  double **used_thread_part_weight_work,
3737  mj_lno_t *out_part_xadj){
3738 
3739  mj_part_t num_cuts = num_parts - 1;
3740 
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp parallel
3743 #endif
3744  {
3745  int me = 0;
3746 #ifdef HAVE_ZOLTAN2_OMP
3747  me = omp_get_thread_num();
3748 #endif
3749 
3750  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3751  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3752 
3753  //now if the rectilinear partitioning is allowed we decide how
3754  //much weight each thread should put to left and right.
3755  if (this->distribute_points_on_cut_lines){
3756  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3757  // this for assumes the static scheduling in mj_1D_part calculation.
3758 #ifdef HAVE_ZOLTAN2_OMP
3759 #pragma omp for
3760 #endif
3761  for (mj_part_t i = 0; i < num_cuts; ++i){
3762  //the left to be put on the left of the cut.
3763  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3764  for(int ii = 0; ii < this->num_threads; ++ii){
3765  if(left_weight > this->sEpsilon){
3766  //the weight of thread ii on cut.
3767  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3768  if(thread_ii_weight_on_cut < left_weight){
3769  //if left weight is bigger than threads weight on cut.
3770  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3771  }
3772  else {
3773  //if thread's weight is bigger than space, then put only a portion.
3774  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3775  }
3776  left_weight -= thread_ii_weight_on_cut;
3777  }
3778  else {
3779  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3780  }
3781  }
3782  }
3783 
3784  if(num_cuts > 0){
3785  //this is a special case. If cutlines share the same coordinate, their weights are equal.
3786  //we need to adjust the ratio for that.
3787  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3788  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3789  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3790  }
3791  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3792  / mj_scalar_t(SIGNIFICANCE_MUL);
3793  }
3794  }
3795  }
3796 
3797  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3798  thread_num_points_in_parts[ii] = 0;
3799  }
3800 
3801 
3802 #ifdef HAVE_ZOLTAN2_OMP
3803  //dont change static scheduler. the static partitioner used later as well.
3804 #pragma omp for
3805 #endif
3806  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3807 
3808  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3809  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3810  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3811  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3812  if(coordinate_assigned_place % 2 == 1){
3813  //if it is on the cut.
3814  if(this->distribute_points_on_cut_lines
3815  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3816  //if the rectilinear partitioning is allowed,
3817  //and the thread has still space to put on the left of the cut
3818  //then thread puts the vertex to left.
3819  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3820  //if putting the vertex to left increased the weight more than expected.
3821  //and if the next cut is on the same coordinate,
3822  //then we need to adjust how much weight next cut puts to its left as well,
3823  //in order to take care of the imbalance.
3824  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3825  && coordinate_assigned_part < num_cuts - 1
3826  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3827  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3828  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3829  }
3830  ++thread_num_points_in_parts[coordinate_assigned_part];
3831  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3832  }
3833  else{
3834  //if there is no more space on the left, put the coordinate to the right of the cut.
3835  ++coordinate_assigned_part;
3836  //this while loop is necessary when a line is partitioned into more than 2 parts.
3837  while(this->distribute_points_on_cut_lines &&
3838  coordinate_assigned_part < num_cuts){
3839  //traverse all the cut lines having the same partitiong
3840  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3841  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3842  < this->sEpsilon){
3843  //if line has enough space on left, put it there.
3844  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3845  this->sEpsilon &&
3846  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3847  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3848  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3849  //Again if it put too much on left of the cut,
3850  //update how much the next cut sharing the same coordinate will put to its left.
3851  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3852  coordinate_assigned_part < num_cuts - 1 &&
3853  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3854  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3855  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3856  }
3857  break;
3858  }
3859  }
3860  else {
3861  break;
3862  }
3863  ++coordinate_assigned_part;
3864  }
3865  ++thread_num_points_in_parts[coordinate_assigned_part];
3866  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3867  }
3868  }
3869  else {
3870  //if it is already assigned to a part, then just put it to the corresponding part.
3871  ++thread_num_points_in_parts[coordinate_assigned_part];
3872  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3873  }
3874  }
3875 
3876 
3877 
3878  //now we calculate where each thread will write in new_coordinate_permutations array.
3879  //first we find the out_part_xadj, by marking the begin and end points of each part found.
3880  //the below loop find the number of points in each part, and writes it to out_part_xadj
3881 #ifdef HAVE_ZOLTAN2_OMP
3882 #pragma omp for
3883 #endif
3884  for(mj_part_t j = 0; j < num_parts; ++j){
3885  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3886  for (int i = 0; i < this->num_threads; ++i){
3887  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3888  //prefix sum to thread point counts, so that each will have private space to write.
3889  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3890  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3891 
3892  }
3893  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
3894  }
3895 
3896  //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
3897 #ifdef HAVE_ZOLTAN2_OMP
3898 #pragma omp single
3899 #endif
3900  {
3901  //perform prefix sum for num_points in parts.
3902  for(mj_part_t j = 1; j < num_parts; ++j){
3903  out_part_xadj[j] += out_part_xadj[j - 1];
3904  }
3905  }
3906 
3907  //shift the num points in threads thread to obtain the
3908  //beginning index of each thread's private space.
3909  for(mj_part_t j = 1; j < num_parts; ++j){
3910  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3911  }
3912 
3913 
3914  //now thread gets the coordinate and writes the index of coordinate to the permutation array
3915  //using the part index we calculated.
3916 #ifdef HAVE_ZOLTAN2_OMP
3917 #pragma omp for
3918 #endif
3919  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3920  mj_lno_t i = this->coordinate_permutations[ii];
3921  mj_part_t p = this->assigned_part_ids[i];
3922  this->new_coordinate_permutations[coordinate_begin +
3923  thread_num_points_in_parts[p]++] = i;
3924  }
3925  }
3926 }
3927 
3928 
3929 
/*! \brief Decides, for every cut line that is not determined yet, whether it
 * can be finalized at its current coordinate or where it has to be moved next.
 *
 * For each undetermined cut i the weight on the left of the cut
 * (current_global_part_weights[2 * i]) is compared against its target weight
 * (current_part_target_weights[i]):
 *  - if the imbalances on both sides are within used_imbalance_tolerance
 *    (up to sEpsilon), the cut is marked as determined and keeps its coordinate;
 *  - if distribute_points_on_cut_lines is set and the target can be reached by
 *    moving points lying on the cut to the left, the cut is marked as determined
 *    and the weight to be put on the left is recorded;
 *  - otherwise the lower/upper bounds of the cut are tightened using the
 *    weights seen at the other cuts, and mj_calculate_new_cut_position() is
 *    asked for a new coordinate. If that coordinate is within sEpsilon of the
 *    current one, the cut is marked as determined.
 * Finally, inside a single region, if any cut needs its on-cut points to be
 * split among processes, a Teuchos::scan (REDUCE_SUM) over
 * process_rectilinear_cut_weight is performed to compute how much of the local
 * on-cut weight this process puts on the left.
 *
 * The loops use OpenMP worksharing constructs when HAVE_ZOLTAN2_OMP is defined.
 * NOTE(review): this presumably has to be called by all threads of an enclosing
 * parallel region -- confirm against the caller.
 *
 * \param num_total_part unused (the parameter name is commented out).
 * \param num_cuts number of cut lines handled; the per-cut arrays are indexed
 *        in [0, num_cuts).
 * \param max_coordinate used to detect right closest points that were not set.
 * \param min_coordinate used to detect left closest points that were not set.
 * \param global_total_weight total weight; used for the imbalance on the right
 *        side of a cut.
 * \param used_imbalance_tolerance tolerance the absolute imbalances are
 *        compared against.
 * \param current_global_part_weights global weights; entry [2 * i] is read as
 *        the weight on the left of cut i, entry [2 * i + 1] as the weight
 *        when the points on cut i are included.
 * \param current_local_part_weights local weights with the same layout; used
 *        to obtain the local weight lying on a cut.
 * \param current_part_target_weights expected weight on the left of each cut.
 * \param current_cut_line_determined in/out flags; set to true for the cuts
 *        finalized here.
 * \param current_cut_coordinates current coordinates of the cuts (input).
 * \param current_cut_upper_bounds in/out upper bound coordinates of the cuts.
 * \param current_cut_lower_bounds in/out lower bound coordinates of the cuts.
 * \param current_global_left_closest_points in/out closest point on the left
 *        of each cut; reset to the cut coordinate when found unset.
 * \param current_global_right_closest_points in/out closest point on the right
 *        of each cut; reset to the cut coordinate when found unset.
 * \param current_cut_lower_bound_weights in/out weights at the lower bounds.
 * \param current_cut_upper_weights in/out weights at the upper bounds.
 * \param new_current_cut_coordinates output; coordinate of each cut for the
 *        next iteration.
 * \param current_part_cut_line_weight_to_put_left output; local weight lying
 *        on the cut that is to be assigned to the left of the cut.
 * \param rectilinear_cut_count in/out counter of cuts whose on-cut points are
 *        to be distributed; reset to 0 after the scan.
 * \param my_num_incomplete_cut in/out counter, decremented once for every cut
 *        that is finalized here.
 */
3958 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3959  typename mj_part_t>
3960 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3961  const size_t &/* num_total_part */,
3962  const mj_part_t &num_cuts,
3963  const mj_scalar_t &max_coordinate,
3964  const mj_scalar_t &min_coordinate,
3965  const mj_scalar_t &global_total_weight,
3966  const double &used_imbalance_tolerance,
3967  mj_scalar_t * current_global_part_weights,
3968  const mj_scalar_t * current_local_part_weights,
3969  const mj_scalar_t *current_part_target_weights,
3970  bool *current_cut_line_determined,
3971  mj_scalar_t *current_cut_coordinates,
3972  mj_scalar_t *current_cut_upper_bounds,
3973  mj_scalar_t *current_cut_lower_bounds,
3974  mj_scalar_t *current_global_left_closest_points,
3975  mj_scalar_t *current_global_right_closest_points,
3976  mj_scalar_t * current_cut_lower_bound_weights,
3977  mj_scalar_t * current_cut_upper_weights,
3978  mj_scalar_t *new_current_cut_coordinates,
3979  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3980  mj_part_t *rectilinear_cut_count,
3981  mj_part_t &my_num_incomplete_cut){
3982 
3983  //weight seen on the left of the cut that is being processed.
3984  mj_scalar_t seen_weight_in_part = 0;
3985  //target weight on the left of the cut that is being processed.
3986  mj_scalar_t expected_weight_in_part = 0;
3987  //imbalance for the left and right side of the cut.
3988  double imbalance_on_left = 0, imbalance_on_right = 0;
3989 
3990 
3991 #ifdef HAVE_ZOLTAN2_OMP
3992 #pragma omp for
3993 #endif
3994  for (mj_part_t i = 0; i < num_cuts; i++){
3995  //if left and right closest points are not set yet,
3996  //set it to the cut itself.
3997  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3998  current_global_left_closest_points[i] = current_cut_coordinates[i];
3999  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
4000  current_global_right_closest_points[i] = current_cut_coordinates[i];
4001 
4002  }
 //main pass: every cut is either finalized or given a new coordinate.
4003 #ifdef HAVE_ZOLTAN2_OMP
4004 #pragma omp for
4005 #endif
4006  for (mj_part_t i = 0; i < num_cuts; i++){
4007 
4008  if(this->distribute_points_on_cut_lines){
4009  //init the weight on the cut.
4010  this->global_rectilinear_cut_weight[i] = 0;
4011  this->process_rectilinear_cut_weight[i] = 0;
4012  }
4013  //if already determined at previous iterations,
4014  //then just write the coordinate to new array, and proceed.
4015  if(current_cut_line_determined[i]) {
4016  new_current_cut_coordinates[i] = current_cut_coordinates[i];
4017  continue;
4018  }
4019 
4020  //current weight of the part at the left of the cut line.
4021  seen_weight_in_part = current_global_part_weights[i * 2];
4022 
4023  /*
4024  std::cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part <<std::endl;
4025  std::cout << "\tcut:" << current_cut_coordinates[i]
4026  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
4027  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << std::endl;
4028  */
4029  //expected weight on the left of the cut.
4030  expected_weight_in_part = current_part_target_weights[i];
4031  //imbalance of the seen weight on the left w.r.t. the expected weight on the left.
4032  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
4033  //imbalance of the remaining weight on the right w.r.t. the expected weight on the right.
4034  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
4035 
4036  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
4037  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
4038 
4039  //if the cut line reaches to desired imbalance.
4040  if(is_left_imbalance_valid && is_right_imbalance_valid){
4041  current_cut_line_determined[i] = true;
4042 #ifdef HAVE_ZOLTAN2_OMP
4043 #pragma omp atomic
4044 #endif
4045  my_num_incomplete_cut -= 1;
4046  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4047  continue;
4048  }
4049  else if(imbalance_on_left < 0){
4050  //if left imbalance < 0 then we need to move the cut to right.
4051 
4052  if(this->distribute_points_on_cut_lines){
4053  //if it is okay to distribute the coordinate on
4054  //the same coordinate to left and right.
4055  //then check if we can reach to the target weight by including the
4056  //coordinates in the part.
4057  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
4058  //if it is we are done.
4059  current_cut_line_determined[i] = true;
4060 #ifdef HAVE_ZOLTAN2_OMP
4061 #pragma omp atomic
4062 #endif
4063  my_num_incomplete_cut -= 1;
4064 
4065  //then assign everything on the cut to the left of the cut.
4066  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4067 
4068  //for this cut all the weight on cut will be put to left.
4069 
4070  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
4071  continue;
4072  }
4073  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
4074 
4075  //if the weight is larger than the expected weight,
4076  //then we need to distribute some points to left, some to right.
4077  current_cut_line_determined[i] = true;
4078 #ifdef HAVE_ZOLTAN2_OMP
4079 #pragma omp atomic
4080 #endif
4081  *rectilinear_cut_count += 1;
4082  //increase the num cuts to be determined with rectilinear partitioning.
4083 
4084 #ifdef HAVE_ZOLTAN2_OMP
4085 #pragma omp atomic
4086 #endif
4087  my_num_incomplete_cut -= 1;
4088  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4089  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
4090  current_local_part_weights[i * 2];
4091  continue;
4092  }
4093  }
4094  //we need to move further right, so set lower bound to current line, and shift it to the closest point from right.
4095  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
4096  //set the lower bound weight to the weight we have seen.
4097  current_cut_lower_bound_weights[i] = seen_weight_in_part;
4098 
4099  //compare the upper bound with what has been found in the last iteration.
4100  //we try to make more strict bounds for the cut here.
4101  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
4102  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4103  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4104 
4105  if(p_weight >= expected_weight_in_part){
4106  //if a cut on the right has the expected weight, then we found
4107  //our cut position. Set upper and lower coordinates to this new cut coordinate.
4108  //but we need one more iteration to finalize the cut position,
4109  //as we need to update the part ids.
4110  if(p_weight == expected_weight_in_part){
4111  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4112  current_cut_upper_weights[i] = p_weight;
4113  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4114  current_cut_lower_bound_weights[i] = p_weight;
4115  } else if (p_weight < current_cut_upper_weights[i]){
4116  //if a part weight is larger than my expected weight,
4117  //but lower than my upper bound weight, update upper bound.
4118  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
4119  current_cut_upper_weights[i] = p_weight;
4120  }
4121  break;
4122  }
4123  //if comes here then pw < ew
4124  //then compare the weight against line weight.
4125  if(line_weight >= expected_weight_in_part){
4126  //if the line is larger than the expected weight,
4127  //then we need to reach to the balance by distributing coordinates on this line.
4128  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4129  current_cut_upper_weights[i] = line_weight;
4130  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4131  current_cut_lower_bound_weights[i] = p_weight;
4132  break;
4133  }
4134  //if a stricter lower bound is found,
4135  //update the lower bound.
4136  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
4137  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
4138  current_cut_lower_bound_weights[i] = p_weight;
4139  }
4140  }
4141 
4142 
4143  mj_scalar_t new_cut_position = 0;
4144  this->mj_calculate_new_cut_position(
4145  current_cut_upper_bounds[i],
4146  current_cut_lower_bounds[i],
4147  current_cut_upper_weights[i],
4148  current_cut_lower_bound_weights[i],
4149  expected_weight_in_part, new_cut_position);
4150 
4151  //if cut line does not move significantly.
4152  //then finalize the search.
4153  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4154  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
4155  ){
4156  current_cut_line_determined[i] = true;
4157 #ifdef HAVE_ZOLTAN2_OMP
4158 #pragma omp atomic
4159 #endif
4160  my_num_incomplete_cut -= 1;
4161 
4162  //set the cut coordinate and proceed.
4163  new_current_cut_coordinates [i] = current_cut_coordinates[i];
4164  } else {
4165  new_current_cut_coordinates [i] = new_cut_position;
4166  }
4167  } else {
4168 
4169  //need to move the cut line to left.
4170  //set upper bound to current line.
4171  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
4172  current_cut_upper_weights[i] = seen_weight_in_part;
4173 
4174  // compare the current cut line weights with previous upper and lower bounds.
4175  for (int ii = i - 1; ii >= 0; --ii){
4176  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4177  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4178  if(p_weight <= expected_weight_in_part){
4179  if(p_weight == expected_weight_in_part){
4180  //if the weight of the part is my expected weight
4181  //then we find the solution.
4182  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4183  current_cut_upper_weights[i] = p_weight;
4184  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4185  current_cut_lower_bound_weights[i] = p_weight;
4186  }
4187  else if (p_weight > current_cut_lower_bound_weights[i]){
4188  //if found weight is bigger than the lower bound
4189  //then update the lower bound.
4190  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
4191  current_cut_lower_bound_weights[i] = p_weight;
4192 
4193  //at the same time, if weight of line is bigger than the
4194  //expected weight, then update the upper bound as well.
4195  //in this case the balance will be obtained by distributing weights
4196  //on this cut position.
4197  if(line_weight > expected_weight_in_part){
4198  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
4199  current_cut_upper_weights[i] = line_weight;
4200  }
4201  }
4202  break;
4203  }
4204  //if the weight of the cut on the left is still bigger than my weight,
4205  //and also if the weight is smaller than the current upper weight,
4206  //or if the weight is equal to current upper weight, but on the left of
4207  // the upper weight, then update upper bound.
4208  if (p_weight >= expected_weight_in_part &&
4209  (p_weight < current_cut_upper_weights[i] ||
4210  (p_weight == current_cut_upper_weights[i] &&
4211  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
4212  )
4213  )
4214  ){
4215  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
4216  current_cut_upper_weights[i] = p_weight;
4217  }
4218  }
4219  mj_scalar_t new_cut_position = 0;
4220  this->mj_calculate_new_cut_position(
4221  current_cut_upper_bounds[i],
4222  current_cut_lower_bounds[i],
4223  current_cut_upper_weights[i],
4224  current_cut_lower_bound_weights[i],
4225  expected_weight_in_part,
4226  new_cut_position);
4227 
4228  //if cut line does not move significantly.
4229  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4230  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
4231  current_cut_line_determined[i] = true;
4232 #ifdef HAVE_ZOLTAN2_OMP
4233 #pragma omp atomic
4234 #endif
4235  my_num_incomplete_cut -= 1;
4236  //set the cut coordinate and proceed.
4237  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
4238  } else {
4239  new_current_cut_coordinates [ i] = new_cut_position;
4240  }
4241  }
4242  }
4243 
4244  { // This unnecessary bracket works around a compiler bug in NVCC when enabling OpenMP as well
4245 
4246  //communication to determine the ratios of processors for the distribution
4247  //of coordinates on the cut lines.
4248 #ifdef HAVE_ZOLTAN2_OMP
4249  //no need barrier here as it is implicit.
4250 #pragma omp single
4251 #endif
4252  {
4253  if(*rectilinear_cut_count > 0){
4254 
 //prefix sum over the processes of the local weights lying on each cut.
4255  try{
4256  Teuchos::scan<int,mj_scalar_t>(
4257  *comm, Teuchos::REDUCE_SUM,
4258  num_cuts,
4259  this->process_rectilinear_cut_weight,
4260  this->global_rectilinear_cut_weight
4261  );
4262  }
4263  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4264 
4265  for (mj_part_t i = 0; i < num_cuts; ++i){
4266  //if cut line weight to be distributed.
4267  if(this->global_rectilinear_cut_weight[i] > 0) {
4268  //expected weight to go to left of the cut.
4269  mj_scalar_t expected_part_weight = current_part_target_weights[i];
4270  //the weight that should be put to left of the cut.
4271  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4272  //the weight of the cut in the process
4273  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4274  //the sum of the cut weights upto this process, including the weight of this process.
4275  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4276  //the space on the left side of the cut after all processes before this process (including this process)
4277  //puts their weights on cut to left.
4278  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4279  //add my weight to this space to find out how much space is left to me.
4280  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4281 
4282  /*
4283  std::cout << "expected_part_weight:" << expected_part_weight
4284  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
4285  << " my_weight_on_line" << my_weight_on_line
4286  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
4287  << " space_to_put_left:" << space_to_put_left
4288  << " space_left_to_me" << space_left_to_me << std::endl;
4289  */
4290  if(space_left_to_me < 0){
4291  //space_left_to_me is negative and i dont need to put anything to left.
4292  current_part_cut_line_weight_to_put_left[i] = 0;
4293  }
4294  else if(space_left_to_me >= my_weight_on_line){
4295  //space left to me is bigger than the weight of the processor on cut.
4296  //so put everything to left.
4297  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4298  //std::cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << std::endl;
4299  }
4300  else {
4301  //put only the weight as much as the space.
4302  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4303 
4304  //std::cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << std::endl;
4305  }
4306 
4307  }
4308  }
 //all rectilinear cuts of this round are handled; reset the counter.
4309  *rectilinear_cut_count = 0;
4310  }
4311  }
4312  }
4313 }
4314 
4324 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4325  typename mj_part_t>
4326 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4327  mj_part_t num_procs,
4328  mj_part_t num_parts,
4329  mj_gno_t *&num_points_in_all_processor_parts){
4330 
4331  //initially allocation_size is num_parts
4332  size_t allocation_size = num_parts * (num_procs + 1);
4333 
4334  //this will be output
4335  //holds how many each processor has in each part.
4336  //last portion is the sum of all processor points in each part.
4337 
4338  //allocate memory for the local num coordinates in each part.
4339  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4340 
4341 
4342  //this is the portion of the memory which will be used
4343  //at the summation to obtain total number of processors' points in each part.
4344  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4345  //this is the portion of the memory where each stores its local number.
4346  //this information is needed by other processors.
4347  mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4348 
4349  //initialize the array with 0's.
4350  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
4351 
4352  //write the number of coordinates in each part.
4353  for (mj_part_t i = 0; i < num_parts; ++i){
4354  mj_lno_t part_begin_index = 0;
4355  if (i > 0){
4356  part_begin_index = this->new_part_xadj[i - 1];
4357  }
4358  mj_lno_t part_end_index = this->new_part_xadj[i];
4359  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4360  }
4361 
4362  //copy the local num parts to the last portion of array,
4363  //so that this portion will represent the global num points in each part after the reduction.
4364  memcpy (my_local_point_counts_in_each_art,
4365  my_local_points_to_reduce_sum,
4366  sizeof(mj_gno_t) * (num_parts) );
4367 
4368 
4369  //reduceAll operation.
4370  //the portion that belongs to a processor with index p
4371  //will start from myRank * num_parts.
4372  //the global number of points will be held at the index
4373  try{
4374  reduceAll<int, mj_gno_t>(
4375  *(this->comm),
4376  Teuchos::REDUCE_SUM,
4377  allocation_size,
4378  num_local_points_in_each_part_to_reduce_sum,
4379  num_points_in_all_processor_parts);
4380  }
4381  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
4382  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4383 }
4384 
4385 
4386 
4399 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4400  typename mj_part_t>
4401 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4402  size_t migration_reduce_all_population,
4403  mj_lno_t num_coords_for_last_dim_part,
4404  mj_part_t num_procs,
4405  mj_part_t num_parts,
4406  mj_gno_t *num_points_in_all_processor_parts){
4407 
4408  //if reduce all count and population in the last dim is too high
4409  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
4410  //if the work in a part per processor in the last dim is too low.
4411  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
4412 
4413  //if migration is to be checked and the imbalance is too high
4414  if (this->check_migrate_avoid_migration_option == 0){
4415  double global_imbalance = 0;
4416  //global shift to reach the sum of coordiante count in each part.
4417  size_t global_shift = num_procs * num_parts;
4418 
4419  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4420  for (mj_part_t i = 0; i < num_parts; ++i){
4421  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4422  / double(num_procs);
4423 
4424  global_imbalance += ZOLTAN2_ABS(ideal_num -
4425  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4426  }
4427  }
4428  global_imbalance /= num_parts;
4429  global_imbalance /= num_procs;
4430 
4431  /*
4432  if (this->myRank == 0) {
4433  std::cout << "imbalance for next iteration:" << global_imbalance << std::endl;
4434  }
4435  */
4436 
4437  if(global_imbalance <= this->minimum_migration_imbalance){
4438  return false;
4439  }
4440  else {
4441  return true;
4442  }
4443  }
4444  else {
4445  //if migration is forced
4446  return true;
4447  }
4448 }
4449 
4450 
4460 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4461  typename mj_part_t>
4462 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4463  mj_part_t num_parts,
4464  mj_part_t *part_assignment_proc_begin_indices,
4465  mj_part_t *processor_chains_in_parts,
4466  mj_lno_t *send_count_to_each_proc,
4467  int *coordinate_destinations){
4468 
4469  for (mj_part_t p = 0; p < num_parts; ++p){
4470  mj_lno_t part_begin = 0;
4471  if (p > 0) part_begin = this->new_part_xadj[p - 1];
4472  mj_lno_t part_end = this->new_part_xadj[p];
4473 
4474  //get the first part that current processor will send its part-p.
4475  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4476  //initialize how many point I sent to this processor.
4477  mj_lno_t num_total_send = 0;
4478  for (mj_lno_t j=part_begin; j < part_end; j++){
4479  mj_lno_t local_ind = this->new_coordinate_permutations[j];
4480  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4481  //then get the next processor to send the points in part p.
4482  num_total_send = 0;
4483  //assign new processor to part_assign_begin[p]
4484  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4485  //remove the previous processor
4486  processor_chains_in_parts[proc_to_sent] = -1;
4487  //choose the next processor as the next one to send.
4488  proc_to_sent = part_assignment_proc_begin_indices[p];
4489  }
4490  //write the gno index to corresponding position in sendBuf.
4491  coordinate_destinations[local_ind] = proc_to_sent;
4492  ++num_total_send;
4493  }
4494  }
4495 }
4496 
4511 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4512  typename mj_part_t>
4513 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4514  mj_gno_t * num_points_in_all_processor_parts,
4515  mj_part_t num_parts,
4516  mj_part_t num_procs,
4517  mj_lno_t *send_count_to_each_proc,
4518  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4519  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4520  mj_part_t &out_part_index,
4521  mj_part_t &output_part_numbering_begin_index,
4522  int *coordinate_destinations){
4523 
4524 
4525  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4526  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4527 
4528  //boolean variable if the process finds its part to be assigned.
4529  bool did_i_find_my_group = false;
4530 
4531  mj_part_t num_free_procs = num_procs;
4532  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4533 
4534  double max_imbalance_difference = 0;
4535  mj_part_t max_differing_part = 0;
4536 
4537  //find how many processor each part requires.
4538  for (mj_part_t i=0; i < num_parts; i++){
4539 
4540  //scalar portion of the required processors
4541  double scalar_required_proc = num_procs *
4542  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4543 
4544  //round it to closest integer; make sure have at least one proc.
4545  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
4546  if (required_proc == 0) required_proc = 1;
4547 
4548  //if assigning the required num procs, creates problems for the rest of the parts.
4549  //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
4550  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4551  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4552  }
4553 
4554  //reduce the free processor count
4555  num_free_procs -= required_proc;
4556  //reduce the free minimum processor count required for the rest of the part by 1.
4557  --minimum_num_procs_required_for_rest_of_parts;
4558 
4559  //part (i) is assigned to (required_proc) processors.
4560  num_procs_assigned_to_each_part[i] = required_proc;
4561 
4562  //because of the roundings some processors might be left as unassigned.
4563  //we want to assign those processors to the part with most imbalance.
4564  //find the part with the maximum imbalance here.
4565  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4566  if (imbalance_wrt_ideal > max_imbalance_difference){
4567  max_imbalance_difference = imbalance_wrt_ideal;
4568  max_differing_part = i;
4569  }
4570  }
4571 
4572  //assign extra processors to the part with maximum imbalance than the ideal.
4573  if (num_free_procs > 0){
4574  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4575  }
4576 
4577  //now find what are the best processors with least migration for each part.
4578 
4579  //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning
4580  //index of a processor that processor sends its data for part - i
4581  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4582  //the next processor send is found in processor_chains_in_parts, in linked list manner.
4583  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4584  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4585 
4586  //initialize the assignment of each processor.
4587  //this has a linked list implementation.
4588  //the beginning of processors assigned
4589  //to each part is hold at part_assignment_proc_begin_indices[part].
4590  //then the next processor assigned to that part is located at
4591  //proc_part_assignments[part_assign_begins[part]], this is a chain
4592  //until the value of -1 is reached.
4593  for (int i = 0; i < num_procs; ++i ){
4594  processor_part_assignments[i] = -1;
4595  processor_chains_in_parts[i] = -1;
4596  }
4597  for (int i = 0; i < num_parts; ++i ){
4598  part_assignment_proc_begin_indices[i] = -1;
4599  }
4600 
4601 
4602  //std::cout << "Before migration: mig type:" << this->migration_type << std::endl;
4603  //Allocate memory for sorting data structure.
4604  uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4605  for(mj_part_t i = 0; i < num_parts; ++i){
4606  //the algorithm tries to minimize the cost of migration,
4607  //by assigning the processors with highest number of coordinates on that part.
4608  //here we might want to implement a maximum weighted bipartite matching algorithm.
4609  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4610  sort_item_num_part_points_in_procs[ii].id = ii;
4611  //if processor is not assigned yet.
4612  //add its num points to the sort data structure.
4613  if (processor_part_assignments[ii] == -1){
4614  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4615  sort_item_num_part_points_in_procs[ii].signbit = 1; //indicate that the processor has positive weight.
4616  }
4617  else {
4618  //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4619  //would be same if we simply set it to -1,
4620  //but more information with no extra cost (which is used later) is provided.
4621  //sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4622 
4623  //UPDATE: Since above gets warning when unsigned is used to represent, we added extra bit to as sign bit to the sort item.
4624  //It is 1 for positives, 0 for negatives.
4625  sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4626  sort_item_num_part_points_in_procs[ii].signbit = 0;
4627  }
4628  }
4629  //sort the processors in the part.
4630  uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4631 
4632  /*
4633  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4634  std::cout << "ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4635  " " << sort_item_num_part_points_in_procs[ii].val <<
4636  " " << int(sort_item_num_part_points_in_procs[ii].signbit) << std::endl;
4637  }
4638  */
4639 
4640  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4641  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4642  mj_gno_t ideal_num_points_in_a_proc =
4643  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4644 
4645  //starts sending to least heaviest part.
4646  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4647  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4648  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4649 
4650  //find the processors that will be assigned to this part, which are the heaviest
4651  //non assigned processors.
4652  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4653  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4654  //assign processor to part - i.
4655  processor_part_assignments[proc_id] = i;
4656  }
4657 
4658  bool did_change_sign = false;
4659  //if processor has a minus count, reverse it.
4660  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4661  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4662  // TODO: SEE BUG 6194
4663  if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4664  did_change_sign = true;
4665  sort_item_num_part_points_in_procs[ii].signbit = 1;
4666  }
4667  else {
4668  break;
4669  }
4670  }
4671  if(did_change_sign){
4672  //resort the processors in the part for the rest of the processors that is not assigned.
4673  uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4674  }
4675  /*
4676  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4677  std::cout << "after resort ii:" << ii << " " << sort_item_num_part_points_in_procs[ii].id <<
4678  " " << sort_item_num_part_points_in_procs[ii].val <<
4679  " " << int(sort_item_num_part_points_in_procs[ii].signbit ) << std::endl;
4680  }
4681  */
4682 
4683  //check if this processors is one of the procs assigned to this part.
4684  //if it is, then get the group.
4685  if (!did_i_find_my_group){
4686  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4687 
4688  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4689  //add the proc to the group.
4690  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4691 
4692  if(proc_id_to_assign == this->myRank){
4693  //if the assigned process is me, then I find my group.
4694  did_i_find_my_group = true;
4695  //set the beginning of part i to my rank.
4696  part_assignment_proc_begin_indices[i] = this->myRank;
4697  processor_chains_in_parts[this->myRank] = -1;
4698 
4699  //set send count to myself to the number of points that I have in part i.
4700  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4701 
4702  //calculate the shift required for the output_part_numbering_begin_index
4703  for (mj_part_t in = 0; in < i; ++in){
4704  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4705  }
4706  out_part_index = i;
4707  }
4708  }
4709  //if these was not my group,
4710  //clear the subcomminicator processor array.
4711  if (!did_i_find_my_group){
4712  processor_ranks_for_subcomm.clear();
4713  }
4714  }
4715 
4716  //send points of the nonassigned coordinates to the assigned coordinates.
4717  //starts from the heaviest nonassigned processor.
4718  //TODO we might want to play with this part, that allows more computational imbalance
4719  //but having better communication balance.
4720  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4721  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4722  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4723 
4724  //we set number of points to -to_sent - 1 for the assigned processors.
4725  //we reverse it here. This should not happen, as we have already reversed them above.
4726 #ifdef MJ_DEBUG
4727  if (num_points_to_sent < 0) {
4728  std::cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4729  exit(1);
4730  }
4731 #endif
4732 
4733  switch (migration_type){
4734  case 0:
4735  {
4736  //now sends the points to the assigned processors.
4737  while (num_points_to_sent > 0){
4738  //if the processor has enough space.
4739  if (num_points_to_sent <= space_left_in_sent_proc){
4740  //reduce the space left in the processor.
4741  space_left_in_sent_proc -= num_points_to_sent;
4742  //if my rank is the one that is sending the coordinates.
4743  if (this->myRank == nonassigned_proc_id){
4744  //set my sent count to the sent processor.
4745  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4746  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4747  //that the processor will send its point in part-i.
4748  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4749  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4750  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4751  }
4752  num_points_to_sent = 0;
4753  }
4754  else {
4755  //there might be no space left in the processor.
4756  if(space_left_in_sent_proc > 0){
4757  num_points_to_sent -= space_left_in_sent_proc;
4758 
4759  //send as the space left in the processor.
4760  if (this->myRank == nonassigned_proc_id){
4761  //send as much as the space in this case.
4762  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4763  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4764  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4765  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4766 
4767  }
4768  }
4769  //change the sent part
4770  ++next_proc_to_send_index;
4771 
4772 #ifdef MJ_DEBUG
4773  if(next_part_to_send_index < nprocs - required_proc_count ){
4774  std::cout << "Migration - processor assignments - for part:"
4775  << i
4776  << " next_part_to_send :" << next_part_to_send_index
4777  << " nprocs:" << nprocs
4778  << " required_proc_count:" << required_proc_count
4779  << " Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4780  exit(1)l
4781 
4782  }
4783 #endif
4784  //send the new id.
4785  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4786  //set the new space in the processor.
4787  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4788  }
4789  }
4790  }
4791  break;
4792  default:
4793  {
4794  //to minimize messages, we want each processor to send its coordinates to only a single point.
4795  //we do not respect imbalances here, we send all points to the next processor.
4796  if (this->myRank == nonassigned_proc_id){
4797  //set my sent count to the sent processor.
4798  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4799  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4800  //that the processor will send its point in part-i.
4801  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4802  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4803  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4804  }
4805  num_points_to_sent = 0;
4806  ++next_proc_to_send_index;
4807 
4808  //if we made it to the heaviest processor we round robin and go to beginning
4809  if (next_proc_to_send_index == num_procs){
4810  next_proc_to_send_index = num_procs - required_proc_count;
4811  }
4812  //send the new id.
4813  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4814  //set the new space in the processor.
4815  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4816  }
4817  }
4818  }
4819  }
4820 
4821  /*
4822  for (int i = 0; i < num_procs;++i){
4823  std::cout << "me:" << this->myRank << " to part:" << i << " sends:" << send_count_to_each_proc[i] << std::endl;
4824  }
4825  */
4826 
4827 
4828  this->assign_send_destinations(
4829  num_parts,
4830  part_assignment_proc_begin_indices,
4831  processor_chains_in_parts,
4832  send_count_to_each_proc,
4833  coordinate_destinations);
4834 
4835  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4836  freeArray<mj_part_t>(processor_chains_in_parts);
4837  freeArray<mj_part_t>(processor_part_assignments);
4838  freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4839  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4840 
4841 }
4842 
4843 
4856 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4857  typename mj_part_t>
4858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4859  mj_part_t num_parts,
4860  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4861  int *coordinate_destinations,
4862  mj_part_t &output_part_numbering_begin_index,
4863  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4864 
4865  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4866  mj_part_t previous_processor = -1;
4867  for(mj_part_t i = 0; i < num_parts; ++i){
4868  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4869  //assigned processors are sorted.
4870  mj_lno_t part_begin_index = 0;
4871  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4872  mj_lno_t part_end_index = this->new_part_xadj[p];
4873 
4874  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4875  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4876  output_part_numbering_begin_index = part_shift_amount;
4877  }
4878  previous_processor = assigned_proc;
4879  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4880 
4881  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4882  mj_lno_t localInd = this->new_coordinate_permutations[j];
4883  coordinate_destinations[localInd] = assigned_proc;
4884  }
4885  }
4886 }
4887 
4888 
4905 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4906  typename mj_part_t>
4907 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4908  mj_gno_t * num_points_in_all_processor_parts,
4909  mj_part_t num_parts,
4910  mj_part_t num_procs,
4911  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
4912  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
4913  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
4914  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4915  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
4916  int *coordinate_destinations){
4917  out_num_part = 0;
4918 
4919  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4920  out_part_indices.clear();
4921 
4922  //to sort the parts that is assigned to the processors.
4923  //id is the part number, sort value is the assigned processor id.
4924  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4925  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4926 
4927 
4928  //calculate the optimal number of coordinates that should be assigned to each processor.
4929  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4930  //to hold the left space as the number of coordinates to the optimal number in each proc.
4931  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4932  //initialize left space in each.
4933  for (mj_part_t i = 0; i < num_procs; ++i){
4934  space_in_each_processor[i] = work_each;
4935  }
4936 
4937  //we keep track of how many parts each processor is assigned to.
4938  //because in some weird inputs, it might be possible that some
4939  //processors is not assigned to any part. Using these variables,
4940  //we force each processor to have at least one part.
4941  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4942  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4943  int empty_proc_count = num_procs;
4944 
4945  //to sort the parts with decreasing order of their coordiantes.
4946  //id are the part numbers, sort value is the number of points in each.
4947  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4948 
4949  //initially we will sort the parts according to the number of coordinates they have.
4950  //so that we will start assigning with the part that has the most number of coordinates.
4951  for (mj_part_t i = 0; i < num_parts; ++i){
4952  sort_item_point_counts_in_parts[i].id = i;
4953  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4954  }
4955  //sort parts with increasing order of loads.
4956  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4957 
4958 
4959  //assigning parts to the processors
4960  //traverse the part win decreasing order of load.
4961  //first assign the heaviest part.
4962  for (mj_part_t j = 0; j < num_parts; ++j){
4963  //sorted with increasing order, traverse inverse.
4964  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4965  //load of the part
4966  mj_gno_t load = global_num_points_in_parts[i];
4967 
4968  //assigned processors
4969  mj_part_t assigned_proc = -1;
4970  //if not fit best processor.
4971  mj_part_t best_proc_to_assign = 0;
4972 
4973 
4974  //sort processors with increasing number of points in this part.
4975  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4976  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4977 
4978  //if there are still enough parts to fill empty processors, than proceed normally.
4979  //but if empty processor count is equal to the number of part, then
4980  //we force to part assignments only to empty processors.
4981  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4982  //how many points processor ii has in part i?
4983  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4984  }
4985  else {
4986  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4987  }
4988  }
4989  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4990 
4991  //traverse all processors with decreasing load.
4992  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4993  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4994  mj_lno_t left_space = space_in_each_processor[ii] - load;
4995  //if enought space, assign to this part.
4996  if(left_space >= 0 ){
4997  assigned_proc = ii;
4998  break;
4999  }
5000  //if space is not enough, store the best candidate part.
5001  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
5002  best_proc_to_assign = ii;
5003  }
5004  }
5005 
5006  //if none had enough space, then assign it to best part.
5007  if (assigned_proc == -1){
5008  assigned_proc = best_proc_to_assign;
5009  }
5010 
5011  if (num_parts_proc_assigned[assigned_proc]++ == 0){
5012  --empty_proc_count;
5013  }
5014  space_in_each_processor[assigned_proc] -= load;
5015  //to sort later, part-i is assigned to the proccessor - assignment.
5016  sort_item_part_to_proc_assignment[j].id = i; //part i
5017  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.
5018 
5019 
5020  //if assigned processor is me, increase the number.
5021  if (assigned_proc == this->myRank){
5022  out_num_part++;//assigned_part_count;
5023  out_part_indices.push_back(i);
5024  }
5025  //increase the send to that processor by the number of points in that part.
5026  //as everyone send their coordiantes in this part to the processor assigned to this part.
5027  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
5028  }
5029  freeArray<mj_part_t>(num_parts_proc_assigned);
5030  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
5031  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
5032  freeArray<mj_lno_t >(space_in_each_processor);
5033 
5034 
5035  //sort assignments with respect to the assigned processors.
5036  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
5037  //fill sendBuf.
5038 
5039 
5040  this->assign_send_destinations2(
5041  num_parts,
5042  sort_item_part_to_proc_assignment,
5043  coordinate_destinations,
5044  output_part_numbering_begin_index,
5045  next_future_num_parts_in_parts);
5046 
5047  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
5048 }
5049 
5050 
5068 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5069  typename mj_part_t>
5070 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
5071  mj_gno_t * num_points_in_all_processor_parts,
5072  mj_part_t num_parts,
5073  mj_part_t num_procs,
5074  mj_lno_t *send_count_to_each_proc,
5075  std::vector<mj_part_t> &processor_ranks_for_subcomm,
5076  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5077  mj_part_t &out_num_part,
5078  std::vector<mj_part_t> &out_part_indices,
5079  mj_part_t &output_part_numbering_begin_index,
5080  int *coordinate_destinations){
5081 
5082 
5083 
5084  processor_ranks_for_subcomm.clear();
5085  // if (this->num_local_coords > 0)
5086  if (num_procs > num_parts){
5087  //if there are more processors than the number of current part
5088  //then processors share the existing parts.
5089  //at the end each processor will have a single part,
5090  //but a part will be shared by a group of processors.
5091  mj_part_t out_part_index = 0;
5092  this->mj_assign_proc_to_parts(
5093  num_points_in_all_processor_parts,
5094  num_parts,
5095  num_procs,
5096  send_count_to_each_proc,
5097  processor_ranks_for_subcomm,
5098  next_future_num_parts_in_parts,
5099  out_part_index,
5100  output_part_numbering_begin_index,
5101  coordinate_destinations
5102  );
5103 
5104  out_num_part = 1;
5105  out_part_indices.clear();
5106  out_part_indices.push_back(out_part_index);
5107  }
5108  else {
5109 
5110  //there are more parts than the processors.
5111  //therefore a processor will be assigned multiple parts,
5112  //the subcommunicators will only have a single processor.
5113  processor_ranks_for_subcomm.push_back(this->myRank);
5114 
5115  //since there are more parts then procs,
5116  //assign multiple parts to processors.
5117  this->mj_assign_parts_to_procs(
5118  num_points_in_all_processor_parts,
5119  num_parts,
5120  num_procs,
5121  send_count_to_each_proc,
5122  next_future_num_parts_in_parts,
5123  out_num_part,
5124  out_part_indices,
5125  output_part_numbering_begin_index,
5126  coordinate_destinations);
5127  }
5128 }
5129 
5142 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5143  typename mj_part_t>
5144 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
5145  mj_part_t num_procs,
5146  mj_lno_t &num_new_local_points,
5147  std::string iteration,
5148  int *coordinate_destinations,
5149  mj_part_t num_parts)
5150 {
5151 #ifdef ENABLE_ZOLTAN_MIGRATION
5152  if (sizeof(mj_lno_t) <= sizeof(int)) {
5153 
5154  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5155  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5156  // may overflow.
5157 
5158  ZOLTAN_COMM_OBJ *plan = NULL;
5159  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
5160  int num_incoming_gnos = 0;
5161  int message_tag = 7859;
5162 
5163  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
5164  int ierr = Zoltan_Comm_Create(
5165  &plan,
5166  int(this->num_local_coords),
5167  coordinate_destinations,
5168  mpi_comm,
5169  message_tag,
5170  &num_incoming_gnos);
5171  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5172  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
5173 
5174  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5175  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
5176 
5177  //migrate gnos.
5178  message_tag++;
5179  ierr = Zoltan_Comm_Do(
5180  plan,
5181  message_tag,
5182  (char *) this->current_mj_gnos,
5183  sizeof(mj_gno_t),
5184  (char *) incoming_gnos);
5185  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5186 
5187  freeArray<mj_gno_t>(this->current_mj_gnos);
5188  this->current_mj_gnos = incoming_gnos;
5189 
5190 
5191  //migrate coordinates
5192  for (int i = 0; i < this->coord_dim; ++i){
5193  message_tag++;
5194  mj_scalar_t *coord = this->mj_coordinates[i];
5195 
5196  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5197  ierr = Zoltan_Comm_Do(
5198  plan,
5199  message_tag,
5200  (char *) coord,
5201  sizeof(mj_scalar_t),
5202  (char *) this->mj_coordinates[i]);
5203  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5204  freeArray<mj_scalar_t>(coord);
5205  }
5206 
5207  //migrate weights.
5208  for (int i = 0; i < this->num_weights_per_coord; ++i){
5209  message_tag++;
5210  mj_scalar_t *weight = this->mj_weights[i];
5211 
5212  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5213  ierr = Zoltan_Comm_Do(
5214  plan,
5215  message_tag,
5216  (char *) weight,
5217  sizeof(mj_scalar_t),
5218  (char *) this->mj_weights[i]);
5219  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5220  freeArray<mj_scalar_t>(weight);
5221  }
5222 
5223 
5224  //migrate owners.
5225  int *coord_own = allocMemory<int>(num_incoming_gnos);
5226  message_tag++;
5227  ierr = Zoltan_Comm_Do(
5228  plan,
5229  message_tag,
5230  (char *) this->owner_of_coordinate,
5231  sizeof(int), (char *) coord_own);
5232  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5233  freeArray<int>(this->owner_of_coordinate);
5234  this->owner_of_coordinate = coord_own;
5235 
5236 
5237  //if num procs is less than num parts,
5238  //we need the part assigment arrays as well, since
5239  //there will be multiple parts in processor.
5240  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5241  if(num_procs < num_parts){
5242  message_tag++;
5243  ierr = Zoltan_Comm_Do(
5244  plan,
5245  message_tag,
5246  (char *) this->assigned_part_ids,
5247  sizeof(mj_part_t),
5248  (char *) new_parts);
5249  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5250  }
5251  freeArray<mj_part_t>(this->assigned_part_ids);
5252  this->assigned_part_ids = new_parts;
5253 
5254  ierr = Zoltan_Comm_Destroy(&plan);
5255  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5256  num_new_local_points = num_incoming_gnos;
5257  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
5258  }
5259 
5260  else
5261 
5262 #endif // ENABLE_ZOLTAN_MIGRATION
5263  {
5264  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5265  Tpetra::Distributor distributor(this->comm);
5266  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5267  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5268  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
5269 
5270  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5271  {
5272  //migrate gnos.
5273  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5274  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5275  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5276  freeArray<mj_gno_t>(this->current_mj_gnos);
5277  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5278  memcpy(
5279  this->current_mj_gnos,
5280  received_gnos.getRawPtr(),
5281  num_incoming_gnos * sizeof(mj_gno_t));
5282  }
5283  //migrate coordinates
5284  for (int i = 0; i < this->coord_dim; ++i){
5285 
5286  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5287  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5288  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5289  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5290  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5291  memcpy(
5292  this->mj_coordinates[i],
5293  received_coord.getRawPtr(),
5294  num_incoming_gnos * sizeof(mj_scalar_t));
5295  }
5296 
5297  //migrate weights.
5298  for (int i = 0; i < this->num_weights_per_coord; ++i){
5299 
5300  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5301  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5302  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5303  freeArray<mj_scalar_t>(this->mj_weights[i]);
5304  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5305  memcpy(
5306  this->mj_weights[i],
5307  received_weight.getRawPtr(),
5308  num_incoming_gnos * sizeof(mj_scalar_t));
5309  }
5310 
5311  {
5312  //migrate the owners of the coordinates
5313  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5314  ArrayRCP<int> received_owners(num_incoming_gnos);
5315  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
5316  freeArray<int>(this->owner_of_coordinate);
5317  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5318  memcpy(
5319  this->owner_of_coordinate,
5320  received_owners.getRawPtr(),
5321  num_incoming_gnos * sizeof(int));
5322  }
5323 
5324  //if num procs is less than num parts,
5325  //we need the part assigment arrays as well, since
5326  //there will be multiple parts in processor.
5327  if(num_procs < num_parts){
5328  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5329  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5330  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5331  freeArray<mj_part_t>(this->assigned_part_ids);
5332  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5333  memcpy(
5334  this->assigned_part_ids,
5335  received_partids.getRawPtr(),
5336  num_incoming_gnos * sizeof(mj_part_t));
5337  }
5338  else {
5339  mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5340  freeArray<mj_part_t>(this->assigned_part_ids);
5341  this->assigned_part_ids = new_parts;
5342  }
5343  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
5344  num_new_local_points = num_incoming_gnos;
5345 
5346  }
5347 }
5348 
5355 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5356  typename mj_part_t>
5357 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5358  mj_part_t group_size = processor_ranks_for_subcomm.size();
5359  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5360  for(mj_part_t i = 0; i < group_size; ++i) {
5361  ids[i] = processor_ranks_for_subcomm[i];
5362  }
5363  ArrayView<const mj_part_t> idView(ids, group_size);
5364  this->comm = this->comm->createSubcommunicator(idView);
5365  freeArray<mj_part_t>(ids);
5366 }
5367 
5368 
5374 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5375  typename mj_part_t>
5376 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5377  mj_part_t output_num_parts,
5378  mj_part_t num_parts){
5379  //if there is single output part, then simply fill the permutation array.
5380  if (output_num_parts == 1){
5381  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5382  this->new_coordinate_permutations[i] = i;
5383  }
5384  this->new_part_xadj[0] = this->num_local_coords;
5385  }
5386  else {
5387 
5388  //otherwise we need to count how many points are there in each part.
5389  //we allocate here as num_parts, because the sent partids are up to num_parts,
5390  //although there are outout_num_parts different part.
5391  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5392  //part shift holds the which part number an old part number corresponds to.
5393  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5394 
5395  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
5396 
5397  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5398  mj_part_t ii = this->assigned_part_ids[i];
5399  ++num_points_in_parts[ii];
5400  }
5401 
5402  //write the end points of the parts.
5403  mj_part_t p = 0;
5404  mj_lno_t prev_index = 0;
5405  for(mj_part_t i = 0; i < num_parts; ++i){
5406  if(num_points_in_parts[i] > 0) {
5407  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5408  prev_index += num_points_in_parts[i];
5409  part_shifts[i] = p++;
5410  }
5411  }
5412 
5413  //for the rest of the parts write the end index as end point.
5414  mj_part_t assigned_num_parts = p - 1;
5415  for (;p < num_parts; ++p){
5416  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5417  }
5418  for(mj_part_t i = 0; i < output_num_parts; ++i){
5419  num_points_in_parts[i] = this->new_part_xadj[i];
5420  }
5421 
5422  //write the permutation array here.
5423  //get the part of the coordinate i, shift it to obtain the new part number.
5424  //assign it to the end of the new part numbers pointer.
5425  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5426  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5427  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5428  }
5429 
5430  freeArray<mj_lno_t>(num_points_in_parts);
5431  freeArray<mj_part_t>(part_shifts);
5432  }
5433 }
5434 
5435 
5458 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5459  typename mj_part_t>
5460 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5461  mj_part_t input_num_parts, //current number of parts
5462  mj_part_t &output_num_parts, //output number of parts.
5463  std::vector<mj_part_t> *next_future_num_parts_in_parts,
5464  mj_part_t &output_part_begin_index,
5465  size_t migration_reduce_all_population,
5466  mj_lno_t num_coords_for_last_dim_part,
5467  std::string iteration,
5468  RCP<mj_partBoxVector_t> &input_part_boxes,
5469  RCP<mj_partBoxVector_t> &output_part_boxes
5470 )
5471 {
5472  mj_part_t num_procs = this->comm->getSize();
5473  this->myRank = this->comm->getRank();
5474 
5475 
5476  //this array holds how many points each processor has in each part.
5477  //to access how many points processor i has on part j,
5478  //num_points_in_all_processor_parts[i * num_parts + j]
5479  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5480 
5481  //get the number of coordinates in each part in each processor.
5482  this->get_processor_num_points_in_parts(
5483  num_procs,
5484  input_num_parts,
5485  num_points_in_all_processor_parts);
5486 
5487 
5488  //check if migration will be performed or not.
5489  if (!this->mj_check_to_migrate(
5490  migration_reduce_all_population,
5491  num_coords_for_last_dim_part,
5492  num_procs,
5493  input_num_parts,
5494  num_points_in_all_processor_parts)){
5495  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5496  return false;
5497  }
5498 
5499 
5500  mj_lno_t *send_count_to_each_proc = NULL;
5501  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5502  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5503  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5504 
5505  std::vector<mj_part_t> processor_ranks_for_subcomm;
5506  std::vector<mj_part_t> out_part_indices;
5507 
5508  //determine which processors are assigned to which parts
5509  this->mj_migration_part_proc_assignment(
5510  num_points_in_all_processor_parts,
5511  input_num_parts,
5512  num_procs,
5513  send_count_to_each_proc,
5514  processor_ranks_for_subcomm,
5515  next_future_num_parts_in_parts,
5516  output_num_parts,
5517  out_part_indices,
5518  output_part_begin_index,
5519  coordinate_destinations);
5520 
5521 
5522 
5523 
5524  freeArray<mj_lno_t>(send_count_to_each_proc);
5525  std::vector <mj_part_t> tmpv;
5526 
5527  std::sort (out_part_indices.begin(), out_part_indices.end());
5528  mj_part_t outP = out_part_indices.size();
5529 
5530  mj_gno_t new_global_num_points = 0;
5531  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5532 
5533  if (this->mj_keep_part_boxes){
5534  input_part_boxes->clear();
5535  }
5536 
5537  //now we calculate the new values for next_future_num_parts_in_parts.
5538  //same for the part boxes.
5539  for (mj_part_t i = 0; i < outP; ++i){
5540  mj_part_t ind = out_part_indices[i];
5541  new_global_num_points += global_num_points_in_parts[ind];
5542  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5543  if (this->mj_keep_part_boxes){
5544  input_part_boxes->push_back((*output_part_boxes)[ind]);
5545  }
5546  }
5547  //swap the input and output part boxes.
5548  if (this->mj_keep_part_boxes){
5549  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5550  input_part_boxes = output_part_boxes;
5551  output_part_boxes = tmpPartBoxes;
5552  }
5553  next_future_num_parts_in_parts->clear();
5554  for (mj_part_t i = 0; i < outP; ++i){
5555  mj_part_t p = tmpv[i];
5556  next_future_num_parts_in_parts->push_back(p);
5557  }
5558 
5559  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5560 
5561  mj_lno_t num_new_local_points = 0;
5562 
5563 
5564  //perform the actual migration operation here.
5565  this->mj_migrate_coords(
5566  num_procs,
5567  num_new_local_points,
5568  iteration,
5569  coordinate_destinations,
5570  input_num_parts);
5571 
5572 
5573  freeArray<int>(coordinate_destinations);
5574 
5575  if(this->num_local_coords != num_new_local_points){
5576  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5577  freeArray<mj_lno_t>(this->coordinate_permutations);
5578 
5579  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5580  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5581  }
5582  this->num_local_coords = num_new_local_points;
5583  this->num_global_coords = new_global_num_points;
5584 
5585 
5586 
5587  //create subcommunicator.
5588  this->create_sub_communicator(processor_ranks_for_subcomm);
5589  processor_ranks_for_subcomm.clear();
5590 
5591  //fill the new permutation arrays.
5592  this->fill_permutation_array(
5593  output_num_parts,
5594  input_num_parts);
5595  return true;
5596 }
5597 
5598 
// create_consistent_chunks
//
// Gives every coordinate referenced by coordinate_permutations in
// [coordinate_begin, coordinate_end) its final part number among num_parts
// parts and writes the reordered indices to new_coordinate_permutations.
// Points that sit exactly on a cut are not split arbitrarily: they are put
// into sort items keyed on their coordinates in the other dimensions, sorted,
// and then handed to the part on the left of the cut until that cut's
// "weight to put left" budget is used up; the remainder goes to the right.
//
// Input encoding: on entry assigned_part_ids[i] holds a value pp where
// pp / 2 is the part (or cut) index and an odd pp marks a point on a cut.
// On exit assigned_part_ids[i] holds the plain part index.
//
// Parameters:
//   num_parts                           number of parts to produce; no_cuts is num_parts - 1.
//   mj_current_dim_coords               not read anywhere in this body.
//   current_concurrent_cut_coordinate   cut coordinates; neighbours closer than sEpsilon
//                                       are treated as the same cut position.
//   coordinate_begin, coordinate_end    half-open range into coordinate_permutations.
//   used_local_cut_line_weight_to_left  per-cut weight to place on the left; read only
//                                       when distribute_points_on_cut_lines is set.
//   out_part_xadj                       output; inclusive prefix sum of the per-part point
//                                       counts (relative to coordinate_begin).
//   coordInd                            current dimension; when longest_dim_part is false the
//                                       sort keys are dimensions coordInd+1.. then 0..coordInd-1.
//   longest_dim_part                    when true the sort keys are taken in the order given by
//                                       p_coord_dimension_range_sorted[coord_dim-2 .. 0].id.
//   p_coord_dimension_range_sorted      read only when longest_dim_part is true.
//
// Only thread slot 0 (me = 0) of the per-thread work arrays is used for the
// point counts and for the cut weights consumed during assignment.
//
// NOTE(review): cut_map is allocated with no_cuts entries but cut_map[0] is
// written and read unconditionally -- confirm callers never pass num_parts < 2.
// NOTE(review): my_local_thread_cut_weights_to_put_left stays NULL when
// distribute_points_on_cut_lines is false, yet it is dereferenced for every
// point flagged as on a cut -- confirm no odd part ids exist in that mode.
5612 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5613  typename mj_part_t>
5614 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5615  mj_part_t num_parts,
5616  mj_scalar_t *mj_current_dim_coords,
5617  mj_scalar_t *current_concurrent_cut_coordinate,
5618  mj_lno_t coordinate_begin,
5619  mj_lno_t coordinate_end,
5620  mj_scalar_t *used_local_cut_line_weight_to_left,
5621  mj_lno_t *out_part_xadj,
5622  int coordInd, bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5623 
5624  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
5625  mj_part_t no_cuts = num_parts - 1;
5626 
5627 
5628 
// this routine always works on thread slot 0.
5629  int me = 0;
5630  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5631  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5632 
5633 
5634  //now if the rectilinear partitioning is allowed we decide how
5635  //much weight each thread should put to left and right.
5636  if (this->distribute_points_on_cut_lines){
5637 
5638  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5639  for (mj_part_t i = 0; i < no_cuts; ++i){
5640  //the left to be put on the left of the cut.
5641  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5642  //std::cout << "i:" << i << " left_weight:" << left_weight << std::endl;
// hand the budget out thread by thread: each thread takes at most the
// weight it has on the cut, and whatever is left passes to the next thread.
5643  for(int ii = 0; ii < this->num_threads; ++ii){
5644  if(left_weight > this->sEpsilon){
5645  //the weight of thread ii on cut.
5646  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5647  if(thread_ii_weight_on_cut < left_weight){
5648  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5649  }
5650  else {
5651  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5652  }
5653  left_weight -= thread_ii_weight_on_cut;
5654  }
5655  else {
5656  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5657  }
5658  }
5659  }
5660 
5661  if(no_cuts > 0){
5662  //this is a special case. If cutlines share the same coordinate, their weights are equal.
5663  //we need to adjust the ratio for that.
// the loop stops at i == 1, so the entry for cut 0 is neither adjusted nor
// quantized here. NOTE(review): confirm that skipping cut 0 is intended.
5664  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5665  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5666  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5667  }
// quantize the weight: add LEAST_SIGNIFICANCE, scale by SIGNIFICANCE_MUL,
// truncate to an integer and scale back.
5668  my_local_thread_cut_weights_to_put_left[i] = static_cast<long long>((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5669  / mj_scalar_t(SIGNIFICANCE_MUL);
5670  }
5671  }
5672  }
5673 
// reset the per-part point counters of thread slot 0.
5674  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5675  thread_num_points_in_parts[ii] = 0;
5676  }
5677 
5678  //for this specific case we dont want to distribute the points along the cut position
5679  //randomly, as we need a specific ordering of them. Instead,
5680  //we put the coordinates into a sort item, where we sort those
5681  //using the coordinates of points on other dimensions and the index.
5682 
5683 
5684  //some of the cuts might share the same position.
5685  //in this case, if cut i and cut j share the same position
5686  //cut_map[i] = cut_map[j] = sort item index.
5687  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5688 
5689 
5690  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5691  typedef std::vector< multiSItem > multiSVector;
5692  typedef std::vector<multiSVector> multiS2Vector;
5693 
5694  //to keep track of the memory allocated.
5695  std::vector<mj_scalar_t *>allocated_memory;
5696 
5697  //vector for which the coordinates will be sorted.
5698  multiS2Vector sort_vector_points_on_cut;
5699 
5700  //the number of cuts that have different coordinates.
5701  mj_part_t different_cut_count = 1;
5702  cut_map[0] = 0;
5703 
5704  //now we insert 1 sort vector for all cuts on the different
5705  //positions. if multiple cuts are on the same position, they share sort vectors.
5706  multiSVector tmpMultiSVector;
5707  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5708 
5709  for (mj_part_t i = 1; i < no_cuts ; ++i){
5710  //if cuts share the same cut coordinates
5711  //set the cutmap accordingly.
5712  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5713  cut_map[i] = cut_map[i-1];
5714  }
5715  else {
5716  cut_map[i] = different_cut_count++;
5717  multiSVector tmp2MultiSVector;
5718  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5719  }
5720  }
5721 
5722 
5723  //now the actual part assignment.
5724  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5725 
5726  mj_lno_t i = this->coordinate_permutations[ii];
5727 
// pp encodes both the index (pp / 2) and whether the point is on a cut (odd pp).
5728  mj_part_t pp = this->assigned_part_ids[i];
5729  mj_part_t p = pp / 2;
5730  //if the coordinate is on a cut.
5731  if(pp % 2 == 1 ){
// key buffer for the sort item; released at the end through allocated_memory.
5732  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5733  allocated_memory.push_back(vals);
5734 
5735  //we insert the coordinates to the sort item here.
5736  int val_ind = 0;
5737 
5738  if (longest_dim_part){
5739  //std::cout << std::endl << std::endl;
5740  for(int dim = this->coord_dim - 2; dim >= 0; --dim){
5741  //uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted
5742  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5743  //std::cout << "next_largest_coord_dim: " << next_largest_coord_dim << " ";
5744  vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5745  }
5746  }
5747  else {
5748  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5749  vals[val_ind++] = this->mj_coordinates[dim][i];
5750  }
5751  for(int dim = 0; dim < coordInd; ++dim){
5752  vals[val_ind++] = this->mj_coordinates[dim][i];
5753  }
5754  }
5755  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5756  //insert the point to the sort vector pointed by the cut_map[p].
5757  mj_part_t cmap = cut_map[p];
5758  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5759  }
5760  else {
5761  //if it is not on the cut, simple sorting.
5762  ++thread_num_points_in_parts[p];
5763  this->assigned_part_ids[i] = p;
5764  }
5765  }
5766 
5767  //sort all the sort vectors.
5768  for (mj_part_t i = 0; i < different_cut_count; ++i){
5769  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5770  }
5771 
5772  //we do the part assignment for the points on cuts here.
5773  mj_part_t previous_cut_map = cut_map[0];
5774 
5775  //this is how much previous part owns the weight of the current part.
5776  //when target part weight is 1.6, and the part on the left is given 2,
5777  //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
5778  //this parameter is used to balance this issues.
5779  //in the above example weight_stolen_from_previous_part will be 0.4.
5780  //if the left part target is 2.2 but it is given 2,
5781  //then weight_stolen_from_previous_part will be -0.2.
5782  mj_scalar_t weight_stolen_from_previous_part = 0;
5783  for (mj_part_t p = 0; p < no_cuts; ++p){
5784 
5785  mj_part_t mapped_cut = cut_map[p];
5786 
5787  //if previous cut map is done, and it does not have the same index,
5788  //then assign all points left on that cut to its right.
5789  if (previous_cut_map != mapped_cut){
5790  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5791  for (; sort_vector_end >= 0; --sort_vector_end){
5792  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5793  mj_lno_t i = t.index;
5794  ++thread_num_points_in_parts[p];
5795  this->assigned_part_ids[i] = p;
5796  }
5797  sort_vector_points_on_cut[previous_cut_map].clear();
5798  }
5799 
5800  //TODO: MD: I dont remember why I have it reverse order here.
5801  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5802  //mj_lno_t sort_vector_begin= 0;
5803  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
5804 
5805  //TODO commented for reverse order
// walk the sorted points from the back; every point accepted for part p is
// popped, so the vector always ends at the next candidate.
5806  for (; sort_vector_end >= 0; --sort_vector_end){
5807  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5808  //TODO COMMENTED FOR REVERSE ORDER
5809  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5810  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
5811  mj_lno_t i = t.index;
// a uniform first weight counts every point as 1.
5812  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5813 
5814 
5815  //part p has enough space for point i, then put it to point i.
5816  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5817  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5818  > this->sEpsilon){
5819 
5820  my_local_thread_cut_weights_to_put_left[p] -= w;
5821  sort_vector_points_on_cut[mapped_cut].pop_back();
5822  ++thread_num_points_in_parts[p];
5823  this->assigned_part_ids[i] = p;
5824  //if putting this weight to left overweights the left cut, then
5825  //increase the space for the next cut using weight_stolen_from_previous_part.
5826  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5827  if(mapped_cut == cut_map[p + 1] ){
5828  //if the cut before the cut indexed at p was also at the same position
5829  //special case, as we handle the weight differently here.
5830  if (previous_cut_map != mapped_cut){
5831  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5832  }
5833  else {
5834  //if the cut before the cut indexed at p was also at the same position
5835  //we assign extra weights cumulatively in this case.
5836  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5837  }
5838  }
5839  else{
5840  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5841  }
5842  //end assignment for part p
5843  break;
5844  }
5845  } else {
5846  //if part p does not have enough space for this point
5847  //and if there is another cut sharing the same position,
5848  //again increase the space for the next
5849  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5850  if (previous_cut_map != mapped_cut){
5851  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5852  }
5853  else {
5854  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5855  }
5856  }
5857  else{
5858  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5859  }
5860  //end assignment for part p
5861  break;
5862  }
5863  }
5864  previous_cut_map = mapped_cut;
5865  }
5866 
5867  //TODO commented for reverse order
5868  //put everything left on the last cut to the last part.
5869  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5870 
5871  //mj_lno_t sort_vector_begin= 0;
5872  //mj_lno_t sort_vector_size = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size();
5873  //TODO commented for reverse order
5874  for (; sort_vector_end >= 0; --sort_vector_end){
5875  //for (; sort_vector_begin < sort_vector_size; ++sort_vector_begin){
5876  //TODO commented for reverse order
5877  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5878  //multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
5879  mj_lno_t i = t.index;
5880  ++thread_num_points_in_parts[no_cuts];
5881  this->assigned_part_ids[i] = no_cuts;
5882  }
5883  sort_vector_points_on_cut[previous_cut_map].clear();
5884  freeArray<mj_part_t> (cut_map);
5885 
5886  //free the memory allocated for vertex sort items .
5887  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5888  for(mj_lno_t i = 0; i < vSize; ++i){
5889  freeArray<mj_scalar_t> (allocated_memory[i]);
5890  }
5891 
5892  //creation of part_xadj as in usual case.
// each thread's count for part j is replaced by the number of points the
// lower-numbered threads hold in that part; the total goes to out_part_xadj[j].
5893  for(mj_part_t j = 0; j < num_parts; ++j){
5894  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5895  for (int i = 0; i < this->num_threads; ++i){
5896  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5897  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5898  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5899 
5900  }
5901  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5902  }
5903 
5904  //perform prefix sum for num_points in parts.
5905  for(mj_part_t j = 1; j < num_parts; ++j){
5906  out_part_xadj[j] += out_part_xadj[j - 1];
5907  }
5908 
5909 
5910  //shift the num points in threads thread to obtain the
5911  //beginning index of each thread's private space.
5912  for(mj_part_t j = 1; j < num_parts; ++j){
5913  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5914  }
5915 
5916  //now thread gets the coordinate and writes the index of coordinate to the permutation array
5917  //using the part index we calculated.
5918  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5919  mj_lno_t i = this->coordinate_permutations[ii];
5920  mj_part_t p = this->assigned_part_ids[i];
5921  this->new_coordinate_permutations[coordinate_begin +
5922  thread_num_points_in_parts[p]++] = i;
5923  }
5924 }
5925 
5926 
5927 
5937 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5938  typename mj_part_t>
5939 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5940  mj_part_t current_num_parts,
5941  mj_part_t output_part_begin_index,
5942  RCP<mj_partBoxVector_t> &output_part_boxes,
5943  bool is_data_ever_migrated)
5944 {
5945  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5946 
5947 #ifdef HAVE_ZOLTAN2_OMP
5948 #pragma omp parallel for
5949 #endif
5950  for(mj_part_t i = 0; i < current_num_parts;++i){
5951 
5952  mj_lno_t begin = 0;
5953  mj_lno_t end = this->part_xadj[i];
5954 
5955  if(i > 0) begin = this->part_xadj[i -1];
5956  mj_part_t part_to_set_index = i + output_part_begin_index;
5957  if (this->mj_keep_part_boxes){
5958  (*output_part_boxes)[i].setpId(part_to_set_index);
5959  }
5960  for (mj_lno_t ii = begin; ii < end; ++ii){
5961  mj_lno_t k = this->coordinate_permutations[ii];
5962  this->assigned_part_ids[k] = part_to_set_index;
5963  }
5964  }
5965 
5966  //ArrayRCP<const mj_gno_t> gnoList;
5967  if(!is_data_ever_migrated){
5968  //freeArray<mj_gno_t>(this->current_mj_gnos);
5969  //if(this->num_local_coords > 0){
5970  // gnoList = arcpFromArrayView(this->mj_gnos);
5971  //}
5972  }
5973  else {
5974 #ifdef ENABLE_ZOLTAN_MIGRATION
5975  if (sizeof(mj_lno_t) <= sizeof(int)) {
5976 
5977  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5978  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5979  // may overflow.
5980 
5981  //if data is migrated, then send part numbers to the original owners.
5982  ZOLTAN_COMM_OBJ *plan = NULL;
5983  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5984 
5985  int incoming = 0;
5986  int message_tag = 7856;
5987 
5988  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5989  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5990  this->owner_of_coordinate, mpi_comm, message_tag,
5991  &incoming);
5992  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5993  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5994 
5995  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5996 
5997  message_tag++;
5998  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5999  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
6000  sizeof(mj_gno_t), (char *) incoming_gnos);
6001  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6002 
6003  freeArray<mj_gno_t>(this->current_mj_gnos);
6004  this->current_mj_gnos = incoming_gnos;
6005 
6006  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
6007 
6008  message_tag++;
6009  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
6010  sizeof(mj_part_t), (char *) incoming_partIds);
6011  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6012  freeArray<mj_part_t>(this->assigned_part_ids);
6013  this->assigned_part_ids = incoming_partIds;
6014 
6015  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
6016  ierr = Zoltan_Comm_Destroy(&plan);
6017  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6018 
6019  this->num_local_coords = incoming;
6020  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
6021  }
6022  else
6023 
6024 #endif // !ENABLE_ZOLTAN_MIGRATION
6025  {
6026  //if data is migrated, then send part numbers to the original owners.
6027  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
6028  Tpetra::Distributor distributor(this->mj_problemComm);
6029  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
6030  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
6031  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
6032 
6033  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
6034  //migrate gnos to actual owners.
6035  ArrayRCP<mj_gno_t> received_gnos(incoming);
6036  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
6037  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6038  freeArray<mj_gno_t>(this->current_mj_gnos);
6039  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
6040  memcpy( this->current_mj_gnos,
6041  received_gnos.getRawPtr(),
6042  incoming * sizeof(mj_gno_t));
6043 
6044  //migrate part ids to actual owners.
6045  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
6046  ArrayRCP<mj_part_t> received_partids(incoming);
6047  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
6048  freeArray<mj_part_t>(this->assigned_part_ids);
6049  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
6050  memcpy( this->assigned_part_ids,
6051  received_partids.getRawPtr(),
6052  incoming * sizeof(mj_part_t));
6053 
6054  this->num_local_coords = incoming;
6055  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
6056 
6057  }
6058  }
6059 
6060  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
6061 
6062  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
6063 
6064  //ArrayRCP<mj_part_t> partId;
6065  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
6066 
6067  if (this->mj_keep_part_boxes){
6068  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
6069 
6070  }
6071 
6072  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
6073 }
6074 
6077 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6078  typename mj_part_t>
6079 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
6080  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
6081 
6082  for (int i=0; i < this->coord_dim; i++){
6083  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
6084  }
6085  freeArray<mj_scalar_t *>(this->mj_coordinates);
6086 
6087  for (int i=0; i < this->num_weights_per_coord; i++){
6088  freeArray<mj_scalar_t>(this->mj_weights[i]);
6089  }
6090  freeArray<mj_scalar_t *>(this->mj_weights);
6091 
6092  freeArray<int>(this->owner_of_coordinate);
6093 
6094  for(int i = 0; i < this->num_threads; ++i){
6095  freeArray<mj_lno_t>(this->thread_point_counts[i]);
6096  }
6097 
6098  freeArray<mj_lno_t *>(this->thread_point_counts);
6099  freeArray<double *> (this->thread_part_weight_work);
6100 
6101  if(this->distribute_points_on_cut_lines){
6102  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
6103  for(int i = 0; i < this->num_threads; ++i){
6104  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
6105  }
6106  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
6107  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
6108  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
6109  }
6110 
6111  freeArray<mj_part_t>(this->my_incomplete_cut_count);
6112 
6113  freeArray<mj_scalar_t>(this->max_min_coords);
6114 
6115  freeArray<mj_lno_t>(this->part_xadj);
6116 
6117  freeArray<mj_lno_t>(this->coordinate_permutations);
6118 
6119  freeArray<mj_lno_t>(this->new_coordinate_permutations);
6120 
6121  freeArray<mj_scalar_t>(this->all_cut_coordinates);
6122 
6123  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
6124 
6125  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
6126 
6127  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
6128 
6129  freeArray<mj_scalar_t>(this->target_part_weights);
6130 
6131  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
6132 
6133  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
6134 
6135  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
6136  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
6137  freeArray<bool>(this->is_cut_line_determined);
6138  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
6139  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
6140 
6141  for(int i = 0; i < this->num_threads; ++i){
6142  freeArray<double>(this->thread_part_weights[i]);
6143  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
6144  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
6145  }
6146 
6147  freeArray<double *>(this->thread_part_weights);
6148  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
6149  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
6150 
6151  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
6152 }
6153 
// Stores the caller-supplied partitioning options on this object: each
// argument is copied, without any validation, to the member that has the same
// name minus the trailing underscore.
//
//   distribute_points_on_cut_lines_        -> distribute_points_on_cut_lines
//   max_concurrent_part_calculation_       -> max_concurrent_part_calculation
//   check_migrate_avoid_migration_option_  -> check_migrate_avoid_migration_option
//   minimum_migration_imbalance_           -> minimum_migration_imbalance
//   migration_type_                        -> migration_type
//
// NOTE(review): the declarator line (return type and qualified function name)
// that belongs between the template header and the parameter list is absent
// from this listing -- presumably lost during extraction. Restore it from the
// upstream file before compiling.
6162 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6163  typename mj_part_t>
6165  bool distribute_points_on_cut_lines_,
6166  int max_concurrent_part_calculation_,
6167  int check_migrate_avoid_migration_option_,
6168  double minimum_migration_imbalance_,
6169  int migration_type_ ){
6170  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
6171  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
6172  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
6173  this->minimum_migration_imbalance = minimum_migration_imbalance_;
6174  this->migration_type = migration_type_;
6175 
6176 }
6177 
6178 
6179 
6180 
6208 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6209  typename mj_part_t>
6211 
6212  const RCP<const Environment> &env,
6213  RCP<const Comm<int> > &problemComm,
6214 
6215  double imbalance_tolerance_,
6216  size_t num_global_parts_,
6217  mj_part_t *part_no_array_,
6218  int recursion_depth_,
6219 
6220  int coord_dim_,
6221  mj_lno_t num_local_coords_,
6222  mj_gno_t num_global_coords_,
6223  const mj_gno_t *initial_mj_gnos_,
6224  mj_scalar_t **mj_coordinates_,
6225 
6226  int num_weights_per_coord_,
6227  bool *mj_uniform_weights_,
6228  mj_scalar_t **mj_weights_,
6229  bool *mj_uniform_parts_,
6230  mj_scalar_t **mj_part_sizes_,
6231 
6232  mj_part_t *&result_assigned_part_ids_,
6233  mj_gno_t *&result_mj_gnos_
6234 )
6235 {
6236 
6237 
6238 
6239 #ifdef print_debug
6240  if(comm->getRank() == 0){
6241  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
6242  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
6243  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
6244  }
6245 #endif
6246  this->mj_env = env;
6247  this->mj_problemComm = problemComm;
6248  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6249 
6250  /*
6251  if (0)
6252  {
6253  int a = rand();
6254  this->mj_problemComm->broadcast(0, sizeof(int), (char *) (&a));
6255  std::string istring = "output_" + Teuchos::toString<int>(a) + "_" + Teuchos::toString<int>(myRank) + ".mtx";
6256 
6257  std::ofstream output(istring.c_str());
6258  output << num_local_coords_ << " " << coord_dim_ << std::endl;
6259  for (int j = 0; j < coord_dim_ ; ++j){
6260  for (size_t i = 0; i < num_local_coords_; ++i){
6261  output << mj_coordinates_[j][i] << std::endl;
6262  }
6263 
6264  }
6265  output.close();
6266  }
6267  */
6268 
6269 
6270  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
6271  this->mj_env->debug(3, "In MultiJagged Jagged");
6272 
6273  {
6274  this->imbalance_tolerance = imbalance_tolerance_;
6275  this->num_global_parts = num_global_parts_;
6276  this->part_no_array = part_no_array_;
6277  this->recursion_depth = recursion_depth_;
6278 
6279  this->coord_dim = coord_dim_;
6280  this->num_local_coords = num_local_coords_;
6281  this->num_global_coords = num_global_coords_;
6282  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
6283  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
6284 
6285  this->num_weights_per_coord = num_weights_per_coord_;
6286  this->mj_uniform_weights = mj_uniform_weights_;
6287  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
6288  this->mj_uniform_parts = mj_uniform_parts_;
6289  this->mj_part_sizes = mj_part_sizes_;
6290 
6291  this->num_threads = 1;
6292 #ifdef HAVE_ZOLTAN2_OMP
6293 #pragma omp parallel
6294 
6295  {
6296  this->num_threads = omp_get_num_threads();
6297  }
6298 #endif
6299  }
6300  //this->set_input_data();
6301  this->set_part_specifications();
6302 
6303  this->allocate_set_work_memory();
6304 
6305  //We duplicate the comm as we create subcommunicators during migration.
6306  //We keep the problemComm as it is, while comm changes after each migration.
6307  this->comm = this->mj_problemComm->duplicate();
6308 
6309  //initially there is a single partition
6310  mj_part_t current_num_parts = 1;
6311  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6312 
6313  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6314 
6315  mj_part_t output_part_begin_index = 0;
6316  mj_part_t future_num_parts = this->total_num_part;
6317  bool is_data_ever_migrated = false;
6318 
6319  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
6320  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
6321  next_future_num_parts_in_parts->push_back(this->num_global_parts);
6322 
6323  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
6324  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
6325 
6326  compute_global_box();
6327  if(this->mj_keep_part_boxes){
6328  this->init_part_boxes(output_part_boxes);
6329  }
6330 
6331  for (int i = 0; i < this->recursion_depth; ++i){
6332  //partitioning array. size will be as the number of current partitions and this
6333  //holds how many parts that each part will be in the current dimension partitioning.
6334  std::vector <mj_part_t> num_partitioning_in_current_dim;
6335 
6336  //number of parts that will be obtained at the end of this partitioning.
6337  //future_num_part_in_parts is as the size of current number of parts.
6338  //holds how many more parts each should be divided in the further
6339  //iterations. this will be used to calculate num_partitioning_in_current_dim,
6340  //as the number of parts that the part will be partitioned
6341  //in the current dimension partitioning.
6342 
6343  //next_future_num_parts_in_parts will be as the size of outnumParts,
6344  //and this will hold how many more parts that each output part
6345  //should be divided. this array will also be used to determine the weight ratios
6346  //of the parts.
6347  //swap the arrays to use iteratively..
6348  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6349  future_num_part_in_parts = next_future_num_parts_in_parts;
6350  next_future_num_parts_in_parts = tmpPartVect;
6351 
6352  //clear next_future_num_parts_in_parts array as
6353  //getPartitionArrays expects it to be empty.
6354  //it also expects num_partitioning_in_current_dim to be empty as well.
6355  next_future_num_parts_in_parts->clear();
6356 
6357  if(this->mj_keep_part_boxes){
6358  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6359  input_part_boxes = output_part_boxes;
6360  output_part_boxes = tmpPartBoxes;
6361  output_part_boxes->clear();
6362  }
6363 
6364  //returns the total no. of output parts for this dimension partitioning.
6365  mj_part_t output_part_count_in_dimension =
6366  this->update_part_num_arrays(
6367  num_partitioning_in_current_dim,
6368  future_num_part_in_parts,
6369  next_future_num_parts_in_parts,
6370  future_num_parts,
6371  current_num_parts,
6372  i,
6373  input_part_boxes,
6374  output_part_boxes, 1);
6375 
6376  //if the number of obtained parts equal to current number of parts,
6377  //skip this dimension. For example, this happens when 1 is given in the input
6378  //part array is given. P=4,5,1,2
6379  if(output_part_count_in_dimension == current_num_parts) {
6380  //still need to swap the input output arrays.
6381  tmpPartVect= future_num_part_in_parts;
6382  future_num_part_in_parts = next_future_num_parts_in_parts;
6383  next_future_num_parts_in_parts = tmpPartVect;
6384 
6385  if(this->mj_keep_part_boxes){
6386  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6387  input_part_boxes = output_part_boxes;
6388  output_part_boxes = tmpPartBoxes;
6389  }
6390  continue;
6391  }
6392 
6393 
6394  //get the coordinate axis along which the partitioning will be done.
6395  int coordInd = i % this->coord_dim;
6396  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6397 
6398  //convert i to string to be used for debugging purposes.
6399  std::string istring = Teuchos::toString<int>(i);
6400  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6401 
6402  //alloc Memory to point the indices
6403  //of the parts in the permutation array.
6404  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6405 
6406  //the index where in the new_part_xadj will be written.
6407  mj_part_t output_part_index = 0;
6408  //whatever is written to output_part_index will be added with putput_coordinate_end_index
6409  //so that the points will be shifted.
6410  mj_part_t output_coordinate_end_index = 0;
6411 
6412  mj_part_t current_work_part = 0;
6413  mj_part_t current_concurrent_num_parts =
6414  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6415 
6416  mj_part_t obtained_part_index = 0;
6417 
6418  //run for all available parts.
6419  for (; current_work_part < current_num_parts;
6420  current_work_part += current_concurrent_num_parts){
6421 
6422  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6423  this->max_concurrent_part_calculation);
6424 
6425  mj_part_t actual_work_part_count = 0;
6426  //initialization for 1D partitioning.
6427  //get the min and max coordinates of each part
6428  //together with the part weights of each part.
6429  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6430  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6431 
6432  //if this part wont be partitioned any further
6433  //dont do any work for this part.
6434  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6435  continue;
6436  }
6437  ++actual_work_part_count;
6438  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6439  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
6440  0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
6441 
6442 /*
6443  std::cout << "i:" << i << " j:" << current_work_part + kk
6444  << " coordinate_begin_index:" << coordinate_begin_index
6445  << " coordinate_end_index:" << coordinate_end_index
6446  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
6447  */
6448  this->mj_get_local_min_max_coord_totW(
6449  coordinate_begin_index,
6450  coordinate_end_index,
6451  this->coordinate_permutations,
6452  mj_current_dim_coords,
6453  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
6454  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
6455  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
6456 
6457  }
6458 
6459  //1D partitioning
6460  if (actual_work_part_count > 0){
6461  //obtain global Min max of the part.
6462  this->mj_get_global_min_max_coord_totW(
6463  current_concurrent_num_parts,
6464  this->process_local_min_max_coord_total_weight,
6465  this->global_min_max_coord_total_weight);
6466 
6467  //represents the total number of cutlines
6468  //whose coordinate should be determined.
6469  mj_part_t total_incomplete_cut_count = 0;
6470 
6471  //Compute weight ratios for parts & cuts:
6472  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
6473  //part0 cut0 part1 cut1 part2 cut2 part3
6474  mj_part_t concurrent_part_cut_shift = 0;
6475  mj_part_t concurrent_part_part_shift = 0;
6476  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6477  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6478  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6479  current_concurrent_num_parts];
6480 
6481  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6482  2 * current_concurrent_num_parts];
6483 
6484  mj_part_t concurrent_current_part_index = current_work_part + kk;
6485 
6486  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6487 
6488  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6489  mj_scalar_t *current_target_part_weights = this->target_part_weights +
6490  concurrent_part_part_shift;
6491  //shift the usedCutCoordinate array as noCuts.
6492  concurrent_part_cut_shift += partition_count - 1;
6493  //shift the partRatio array as noParts.
6494  concurrent_part_part_shift += partition_count;
6495 
6496 
6497  //calculate only if part is not empty,
6498  //and part will be further partitioned.
6499  if(partition_count > 1 && min_coordinate <= max_coordinate){
6500 
6501  //increase num_cuts_do_be_determined by the number of cuts of the current
6502  //part's cut line number.
6503  total_incomplete_cut_count += partition_count - 1;
6504  //set the number of cut lines that should be determined
6505  //for this part.
6506  this->my_incomplete_cut_count[kk] = partition_count - 1;
6507 
6508  //get the target weights of the parts.
6509  this->mj_get_initial_cut_coords_target_weights(
6510  min_coordinate,
6511  max_coordinate,
6512  partition_count - 1,
6513  global_total_weight,
6514  usedCutCoordinate,
6515  current_target_part_weights,
6516  future_num_part_in_parts,
6517  next_future_num_parts_in_parts,
6518  concurrent_current_part_index,
6519  obtained_part_index);
6520 
6521  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6522  mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
6523  0 : this->part_xadj[concurrent_current_part_index - 1];
6524 
6525  //get the initial estimated part assignments of the
6526  //coordinates.
6527  this->set_initial_coordinate_parts(
6528  max_coordinate,
6529  min_coordinate,
6530  concurrent_current_part_index,
6531  coordinate_begin_index, coordinate_end_index,
6532  this->coordinate_permutations,
6533  mj_current_dim_coords,
6534  this->assigned_part_ids,
6535  partition_count);
6536  }
6537  else {
6538  // e.g., if have fewer coordinates than parts, don't need to do next dim.
6539  this->my_incomplete_cut_count[kk] = 0;
6540  }
6541  obtained_part_index += partition_count;
6542  }
6543 
6544 
6545 
6546  //used imbalance, it is always 0, as it is difficult to
6547  //estimate a range.
6548  double used_imbalance = 0;
6549 
6550 
6551  // Determine cut lines for all concurrent parts parts here.
6552  this->mj_1D_part(
6553  mj_current_dim_coords,
6554  used_imbalance,
6555  current_work_part,
6556  current_concurrent_num_parts,
6557  current_cut_coordinates,
6558  total_incomplete_cut_count,
6559  num_partitioning_in_current_dim);
6560  }
6561 
6562  //create new part chunks
6563  {
6564  mj_part_t output_array_shift = 0;
6565  mj_part_t cut_shift = 0;
6566  size_t tlr_shift = 0;
6567  size_t partweight_array_shift = 0;
6568 
6569  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
6570  mj_part_t current_concurrent_work_part = current_work_part + kk;
6571  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6572 
6573  //if the part is empty, skip the part.
6574  if((num_parts != 1 )
6575  &&
6576  this->global_min_max_coord_total_weight[kk] >
6577  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6578 
6579  //we still need to write the begin and end point of the
6580  //empty part. simply set it zero, the array indices will be shifted later.
6581  for(mj_part_t jj = 0; jj < num_parts; ++jj){
6582  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6583  }
6584  cut_shift += num_parts - 1;
6585  tlr_shift += (4 *(num_parts - 1) + 1);
6586  output_array_shift += num_parts;
6587  partweight_array_shift += (2 * (num_parts - 1) + 1);
6588  continue;
6589  }
6590 
6591  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6592  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6593  current_concurrent_work_part -1];
6594  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6595  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6596  cut_shift;
6597 
6598  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
6599 
6600  for(int ii = 0; ii < this->num_threads; ++ii){
6601  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6602  }
6603 
6604  if(num_parts > 1){
6605  if(this->mj_keep_part_boxes){
6606  //if part boxes are to be stored update the boundaries.
6607  for (mj_part_t j = 0; j < num_parts - 1; ++j){
6608  (*output_part_boxes)[output_array_shift + output_part_index +
6609  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6610  /*update max*/, coordInd);
6611 
6612  (*output_part_boxes)[output_array_shift + output_part_index + j +
6613  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6614  /*update min*/, coordInd);
6615  }
6616  }
6617 
6618  // Rewrite the indices based on the computed cuts.
6619  this->mj_create_new_partitions(
6620  num_parts,
6621  mj_current_dim_coords,
6622  current_concurrent_cut_coordinate,
6623  coordinate_begin,
6624  coordinate_end,
6625  used_local_cut_line_weight_to_left,
6626  this->thread_part_weight_work,
6627  this->new_part_xadj + output_part_index + output_array_shift
6628  );
6629 
6630  }
6631  else {
6632  //if this part is partitioned into 1 then just copy
6633  //the old values.
6634  mj_lno_t part_size = coordinate_end - coordinate_begin;
6635  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6636  memcpy(
6637  this->new_coordinate_permutations + coordinate_begin,
6638  this->coordinate_permutations + coordinate_begin,
6639  part_size * sizeof(mj_lno_t));
6640  }
6641  cut_shift += num_parts - 1;
6642  tlr_shift += (4 *(num_parts - 1) + 1);
6643  output_array_shift += num_parts;
6644  partweight_array_shift += (2 * (num_parts - 1) + 1);
6645  }
6646 
6647  //shift cut coordinates so that all cut coordinates are stored.
6648  //no shift now because we dont keep the cuts.
6649  //current_cut_coordinates += cut_shift;
6650 
6651  //mj_create_new_partitions from coordinates partitioned the parts and
6652  //write the indices as if there were a single part.
6653  //now we need to shift the beginning indices.
6654  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6655  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6656  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6657  //shift it by previousCount
6658  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6659  }
6660  //increase the previous count by current end.
6661  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6662  //increase the current out.
6663  output_part_index += num_parts ;
6664  }
6665  }
6666  }
6667  // end of this partitioning dimension
6668 
6669 
6670  int current_world_size = this->comm->getSize();
6671  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6672 
6673 
6674  bool is_migrated_in_current_dimension = false;
6675 
6676  //we migrate if there are more partitionings to be done after this step
6677  //and if the migration is not forced to be avoided.
6678  //and the operation is not sequential.
6679  if (future_num_parts > 1 &&
6680  this->check_migrate_avoid_migration_option >= 0 &&
6681  current_world_size > 1){
6682 
6683  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6684  mj_part_t num_parts = output_part_count_in_dimension;
6685  if ( this->mj_perform_migration(
6686  num_parts,
6687  current_num_parts, //output
6688  next_future_num_parts_in_parts, //output
6689  output_part_begin_index,
6690  migration_reduce_all_population,
6691  this->num_global_coords / (future_num_parts * current_num_parts),
6692  istring,
6693  input_part_boxes, output_part_boxes) ) {
6694  is_migrated_in_current_dimension = true;
6695  is_data_ever_migrated = true;
6696  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
6697  istring);
6698  //since data is migrated, we reduce the number of reduceAll operations for the last part.
6699  this->total_dim_num_reduce_all /= num_parts;
6700  }
6701  else {
6702  is_migrated_in_current_dimension = false;
6703  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
6704  }
6705  }
6706 
6707  //swap the coordinate permutations for the next dimension.
6708  mj_lno_t * tmp = this->coordinate_permutations;
6709  this->coordinate_permutations = this->new_coordinate_permutations;
6710  this->new_coordinate_permutations = tmp;
6711 
6712  if(!is_migrated_in_current_dimension){
6713  this->total_dim_num_reduce_all -= current_num_parts;
6714  current_num_parts = output_part_count_in_dimension;
6715  }
6716  freeArray<mj_lno_t>(this->part_xadj);
6717  this->part_xadj = this->new_part_xadj;
6718  this->new_part_xadj = NULL;
6719  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
6720  }
6721 
6722  // Partitioning is done
6723  delete future_num_part_in_parts;
6724  delete next_future_num_parts_in_parts;
6725 
6726  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6728 
6729 
6730  //get the final parts of each initial coordinate
6731  //the results will be written to
6732  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6733  this->set_final_parts(
6734  current_num_parts,
6735  output_part_begin_index,
6736  output_part_boxes,
6737  is_data_ever_migrated);
6738 
6739  result_assigned_part_ids_ = this->assigned_part_ids;
6740  result_mj_gnos_ = this->current_mj_gnos;
6741 
6742  this->free_work_memory();
6743  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6744  this->mj_env->debug(3, "Out of MultiJagged");
6745 
6746 }
6747 
6748 
6752 template <typename Adapter>
6753 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6754 {
6755 private:
6756 
6757 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6758 
6759  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6760 
6761  // For coordinates and weights, MJ needs floats or doubles
6762  // But Adapter can provide other scalars, e.g., ints.
6763  // So have separate scalar_t for MJ and adapter.
6764  typedef typename Adapter::scalar_t adapter_scalar_t;
6765 
6766  // Provide a default type for mj_scalar_t;
6767  typedef float default_mj_scalar_t;
6768 
6769  // If Adapter provided float or double scalar_t, use it (prevents copies).
6770  // Otherwise, use the default type of mj_scalar_t;
6771  typedef typename
6772  std::conditional<
6773  (std::is_same<adapter_scalar_t, float>::value ||
6774  std::is_same<adapter_scalar_t, double>::value),
6775  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
6776 
6777  typedef typename Adapter::gno_t mj_gno_t;
6778  typedef typename Adapter::lno_t mj_lno_t;
6779  typedef typename Adapter::node_t mj_node_t;
6780  typedef typename Adapter::part_t mj_part_t;
6781  typedef coordinateModelPartBox mj_partBox_t;
6782  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6783 #endif
6785 
6786  RCP<const Environment> mj_env; //the environment object
6787  RCP<const Comm<int> > mj_problemComm; //initial comm object
6788  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6789 
6790  //PARAMETERS
6791  double imbalance_tolerance; //input imbalance tolerance.
6792  size_t num_global_parts; //the targeted number of parts
6793  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
6794  int recursion_depth; //the number of steps that partitioning will be solved in.
6795 
6796  int coord_dim; // coordinate dimension.
6797  mj_lno_t num_local_coords; //number of local coords.
6798  mj_gno_t num_global_coords; //number of global coords.
6799  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6800  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6801 
6802  int num_weights_per_coord; // number of weights per coordinate
6803  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6804  mj_scalar_t **mj_weights; //two dimensional weight array
6805  bool *mj_uniform_parts; //if the target parts are uniform
6806  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6807 
6808  // Nonuniform first level partitioning
6809  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
6810  // machine coordinates and application coordinates.
6811  // An optimization that completely partitions the most important machine dimension
6812  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
6813  // MJ alg follows after the nonuniform first level partitioning.
6814  mj_part_t num_first_level_parts; // If used, number of parts for the first level partitioing
6815  const mj_part_t *first_level_distribution; // If used, the distribution of parts for the nonuniform first level partitioning
6816 
6817  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
6818  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6819  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6820  int migration_type; // when doing the migration, 0 will aim for perfect load-imbalance,
6821  //1 for minimized messages
6822  double minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6823  bool mj_keep_part_boxes; //if the boxes need to be kept.
6824 
6825  int num_threads;
6826 
6827  bool mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6828  int mj_premigration_option;
6829  int min_coord_per_rank_for_premigration;
6830 
6831  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6832  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6833 
6834 
6835  //when we have strided data, it returns a unstrided data in RCP form.
6836  //we need to hold on to that data, during the execution of mj, so that the data is not released.
6837  //coordinate_rcp_holder will hold that data, and release it when MJ is deleted.
6838  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6839 
6840  void set_up_partitioning_data(
6841  const RCP<PartitioningSolution<Adapter> >&solution);
6842 
6843  void set_input_parameters(const Teuchos::ParameterList &p);
6844 
6845  void free_work_memory();
6846 
6847  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6848 
6849  bool mj_premigrate_to_subset(int used_num_ranks, int migration_selection_option,
6850  RCP<const Environment> mj_env_,
6851  RCP<const Comm<int> > mj_problemComm_,
6852  int coord_dim_,
6853  mj_lno_t num_local_coords_,
6854  mj_gno_t num_global_coords_, size_t num_global_parts_,
6855  const mj_gno_t *initial_mj_gnos_,
6856  mj_scalar_t **mj_coordinates_,
6857  int num_weights_per_coord_,
6858  mj_scalar_t **mj_weights_,
6859  //results
6860  RCP<const Comm<int> > &result_problemComm_,
6861  mj_lno_t & result_num_local_coords_,
6862  mj_gno_t * &result_initial_mj_gnos_,
6863  mj_scalar_t ** &result_mj_coordinates_,
6864  mj_scalar_t ** &result_mj_weights_,
6865  int * &result_actual_owner_rank_);
6866 
6867 public:
6868 
6869  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6870  RCP<const Comm<int> > &problemComm,
6871  const RCP<const coordinateModel_t> &coords) :
6872  mj_partitioner(), mj_env(env),
6873  mj_problemComm(problemComm),
6874  mj_coords(coords),
6875  imbalance_tolerance(0),
6876  num_global_parts(1),
6877  part_no_array(NULL),
6878  recursion_depth(0),
6879  coord_dim(0),
6880  num_local_coords(0),
6881  num_global_coords(0),
6882  initial_mj_gnos(NULL),
6883  mj_coordinates(NULL),
6884  num_weights_per_coord(0),
6885  mj_uniform_weights(NULL),
6886  mj_weights(NULL),
6887  mj_uniform_parts(NULL),
6888  mj_part_sizes(NULL),
6889  num_first_level_parts(1),
6890  first_level_distribution(NULL),
6891  distribute_points_on_cut_lines(true),
6892  max_concurrent_part_calculation(1),
6893  check_migrate_avoid_migration_option(0),
6894  migration_type(0),
6895  minimum_migration_imbalance(0.30),
6896  mj_keep_part_boxes(false),
6897  num_threads(1),
6898  mj_run_as_rcb(false),
6899  mj_premigration_option(0),
6900  min_coord_per_rank_for_premigration(32000),
6901  comXAdj_(), comAdj_(),
6902  coordinate_ArrayRCP_holder(NULL)
6903  {}
6904 
6906  if (coordinate_ArrayRCP_holder != NULL){
6907  delete [] this->coordinate_ArrayRCP_holder;
6908  this->coordinate_ArrayRCP_holder = NULL;
6909  }
6910  }
6911 
6914  static void getValidParameters(ParameterList & pl)
6915  {
6916  const bool bUnsorted = true; // this clarifies the flag is for unsrorted
6917  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6918  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
6919  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
6920  "algorithm. As many as the dimension count.", mj_parts_Validator);
6921 
6922  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
6923  "coordinates will be calculated concurently.", Environment::getAnyIntValidator());
6924 
6925  pl.set("mj_minimum_migration_imbalance", 1.1,
6926  "mj_minimum_migration_imbalance, the minimum imbalance of the "
6927  "processors to avoid migration",
6929 
6930  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6931  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6932  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
6933  "depending on the imbalance, 1 for forcing migration, 2 for "
6934  "avoiding migration", mj_migration_option_validator);
6935 
6936 
6937 
6938 
6939  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6940  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6941  pl.set("mj_migration_type", 0, "Migration type, 0 for migration to minimize the imbalance "
6942  "1 for migration to minimize messages exchanged the migration." ,
6943  mj_migration_option_validator);
6944 
6945  // bool parameter
6946  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
6947  "geometric partitioning.", Environment::getBoolValidator());
6948 
6949  // bool parameter
6950  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
6952 
6953  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
6954  "greater than 0.", Environment::getAnyIntValidator());
6955 
6956  RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6957  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6958 
6959  pl.set("mj_premigration_option", 0, "Whether to do premigration or not. 0 for no migration "
6960  "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6961  , mj_premigration_option_validator);
6962 
6963  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to assign each rank in multijagged after premigration"
6965 
6966  }
6967 
6974  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6975 
6976  mj_partBoxVector_t &getPartBoxesView() const
6977  {
6978  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6979  return *pBoxes;
6980  }
6981 
6982  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
6983 
6984  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6985  size_t &nPartsFound, mj_part_t **partsFound) const;
6986 
6987 
6990  void getCommunicationGraph(
6991  const PartitioningSolution<Adapter> *solution,
6992  ArrayRCP<mj_part_t> &comXAdj,
6993  ArrayRCP<mj_part_t> &comAdj);
6994 };
6995 
6996 
6997 
6998 
6999 template <typename Adapter>
7000 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset( int used_num_ranks,
7001  int /* migration_selection_option */,
7002  RCP<const Environment> mj_env_,
7003  RCP<const Comm<int> > mj_problemComm_,
7004  int coord_dim_,
7005  mj_lno_t num_local_coords_,
7006  mj_gno_t /* num_global_coords_ */, size_t /* num_global_parts_ */,
7007  const mj_gno_t *initial_mj_gnos_,
7008  mj_scalar_t **mj_coordinates_,
7009  int num_weights_per_coord_,
7010  mj_scalar_t **mj_weights_,
7011  //results
7012  RCP<const Comm<int> > &result_problemComm_,
7013  mj_lno_t &result_num_local_coords_,
7014  mj_gno_t * &result_initial_mj_gnos_,
7015  mj_scalar_t ** &result_mj_coordinates_,
7016  mj_scalar_t ** &result_mj_weights_,
7017  int * &result_actual_owner_rank_){
7018  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
7019 
7020 
7021  int myRank = mj_problemComm_->getRank();
7022  int worldSize = mj_problemComm_->getSize();
7023 
7024  mj_part_t groupsize = worldSize / used_num_ranks;
7025 
7026  //std::cout << "used_num_ranks:" << used_num_ranks << " groupsize:" << groupsize << std::endl;
7027 
7028  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
7029 
7030  mj_part_t i_am_sending_to = 0;
7031  bool am_i_a_receiver = false;
7032 
7033  for(int i = 0; i < used_num_ranks; ++i){
7034  group_begins[i+ 1] = group_begins[i] + groupsize;
7035  if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
7036  if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
7037  if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
7038  if (myRank == group_begins[i]) am_i_a_receiver= true;
7039  }
7040 
7041  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
7042  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
7043 
7044 
7045  Tpetra::Distributor distributor(mj_problemComm_);
7046 
7047  std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
7048  ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
7049  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
7050  result_num_local_coords_ = num_incoming_gnos;
7051  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorPlanCreating");
7052 
7053  mj_env_->timerStart(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
7054 
7055  //migrate gnos.
7056  {
7057  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7058 
7059  ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
7060  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7061 
7062  result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
7063  memcpy(
7064  result_initial_mj_gnos_,
7065  received_gnos.getRawPtr(),
7066  num_incoming_gnos * sizeof(mj_gno_t));
7067  }
7068 
7069  //migrate coordinates
7070  result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
7071  for (int i = 0; i < coord_dim_; ++i){
7072  ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
7073  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
7074  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
7075  result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7076  memcpy(
7077  result_mj_coordinates_[i],
7078  received_coord.getRawPtr(),
7079  num_incoming_gnos * sizeof(mj_scalar_t));
7080  }
7081 
7082  result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
7083  //migrate weights.
7084  for (int i = 0; i < num_weights_per_coord_; ++i){
7085  ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
7086  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
7087  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
7088  result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7089  memcpy(
7090  result_mj_weights_[i],
7091  received_weight.getRawPtr(),
7092  num_incoming_gnos * sizeof(mj_scalar_t));
7093  }
7094 
7095  //migrate the owners of the coordinates
7096  {
7097  std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
7098  ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
7099  ArrayRCP<int> received_owners(num_incoming_gnos);
7100  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
7101  result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
7102  memcpy(
7103  result_actual_owner_rank_,
7104  received_owners.getRawPtr(),
7105  num_incoming_gnos * sizeof(int));
7106  }
7107  mj_env_->timerStop(MACRO_TIMERS, "MultiJagged - PreMigration DistributorMigration");
7108  return am_i_a_receiver;
7109 }
7110 
7111 
7112 
7113 
7114 
7115 
7116 
7126 template <typename Adapter>
7128  const RCP<PartitioningSolution<Adapter> > &solution
7129 )
7130 {
7131  this->set_up_partitioning_data(solution);
7132  this->set_input_parameters(this->mj_env->getParameters());
7133  if (this->mj_keep_part_boxes){
7134  this->mj_partitioner.set_to_keep_part_boxes();
7135  }
7136  this->mj_partitioner.set_partitioning_parameters(
7137  this->distribute_points_on_cut_lines,
7138  this->max_concurrent_part_calculation,
7139  this->check_migrate_avoid_migration_option,
7140  this->minimum_migration_imbalance, this->migration_type);
7141 
7142 
7143  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
7144  mj_lno_t result_num_local_coords = this->num_local_coords;
7145  mj_gno_t * result_initial_mj_gnos = NULL;
7146  mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
7147  mj_scalar_t **result_mj_weights = this->mj_weights;
7148  int *result_actual_owner_rank = NULL;
7149  const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
7150 
7151  //TODO: MD 08/2017: Further discussion is required.
7152  //MueLu calls MJ when it has very few coordinates per processors, such as 10.
7153  //For example, it begins with 1K processor with 1K coordinate in each.
7154  //Then with coarsening this reduces to 10 coordinate per procesor.
7155  //It calls MJ to repartition these to 10 coordinates.
7156  //MJ runs with 1K processor, 10 coordinate in each, and partitions to 10 parts.
7157  //As expected strong scaling is problem here, because computation is almost 0, and
7158  //communication cost of MJ linearly increases.
7159  //Premigration option gathers the coordinates to 10 parts before MJ starts
7160  //therefore MJ will run with a smalller subset of the problem.
7161  //Below, I am migrating the coordinates if mj_premigration_option is set,
7162  //and the result parts are less than the current part count, and the average number of
7163  //local coordinates is less than some threshold.
7164  //For example, premigration may not help if 1000 processors are partitioning data to 10,
7165  //but each of them already have 1M coordinate. In that case, we premigration would not help.
7166  int current_world_size = this->mj_problemComm->getSize();
7167  mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
7168  bool is_pre_migrated = false;
7169  bool am_i_in_subset = true;
7170  if ( mj_premigration_option > 0 &&
7171  size_t (current_world_size) > this->num_global_parts &&
7172  this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
7173  if (this->mj_keep_part_boxes){
7174  throw std::logic_error("Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
7175  }
7176  is_pre_migrated =true;
7177  int migration_selection_option = mj_premigration_option;
7178  if(migration_selection_option * this->num_global_parts > (size_t) (current_world_size)){
7179  migration_selection_option = current_world_size / this->num_global_parts;
7180  }
7181  int used_num_ranks = int (this->num_global_coords / float (threshold_num_local_coords) + 0.5);
7182  if (used_num_ranks == 0) used_num_ranks = 1;
7183 
7184  am_i_in_subset = this->mj_premigrate_to_subset(
7185  used_num_ranks,
7186  migration_selection_option,
7187  this->mj_env,
7188  this->mj_problemComm,
7189  this->coord_dim,
7190  this->num_local_coords,
7191  this->num_global_coords,
7192  this->num_global_parts,
7193  this->initial_mj_gnos,
7194  this->mj_coordinates,
7195  this->num_weights_per_coord,
7196  this->mj_weights,
7197  //results
7198  result_problemComm,
7199  result_num_local_coords,
7200  result_initial_mj_gnos,
7201  result_mj_coordinates,
7202  result_mj_weights,
7203  result_actual_owner_rank);
7204  result_initial_mj_gnos_ = result_initial_mj_gnos;
7205  }
7206 
7207 
7208 
7209  mj_part_t *result_assigned_part_ids = NULL;
7210  mj_gno_t *result_mj_gnos = NULL;
7211 
7212  if (am_i_in_subset){
7213  this->mj_partitioner.multi_jagged_part(
7214  this->mj_env,
7215  result_problemComm, //this->mj_problemComm,
7216 
7217  this->imbalance_tolerance,
7218  this->num_global_parts,
7219  this->part_no_array,
7220  this->recursion_depth,
7221 
7222  this->coord_dim,
7223  result_num_local_coords, //this->num_local_coords,
7224  this->num_global_coords,
7225  result_initial_mj_gnos_, //this->initial_mj_gnos,
7226  result_mj_coordinates, //this->mj_coordinates,
7227 
7228  this->num_weights_per_coord,
7229  this->mj_uniform_weights,
7230  result_mj_weights, //this->mj_weights,
7231  this->mj_uniform_parts,
7232  this->mj_part_sizes,
7233 
7234  result_assigned_part_ids,
7235  result_mj_gnos
7236  );
7237 
7238  }
7239 
7240  // Reorder results so that they match the order of the input
7241 
7242 #if defined(__cplusplus) && __cplusplus >= 201103L
7243  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
7244  localGidToLid.reserve(result_num_local_coords);
7245  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7246  localGidToLid[result_initial_mj_gnos_[i]] = i;
7247  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
7248  0, result_num_local_coords, true);
7249 
7250  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7251  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
7252  partId[origLID] = result_assigned_part_ids[i];
7253  }
7254 
7255 #else
7256  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7257  localGidToLid(result_num_local_coords);
7258  for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7259  localGidToLid.put(result_initial_mj_gnos_[i], i);
7260 
7261  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
7262  0, result_num_local_coords, true);
7263 
7264  for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7265  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
7266  partId[origLID] = result_assigned_part_ids[i];
7267  }
7268 
7269 #endif // C++11 is enabled
7270 
7271  delete [] result_mj_gnos;
7272  delete [] result_assigned_part_ids;
7273 
7274 
7275  //now the results are reordered. but if premigration occured,
7276  //then we need to send these ids to actual owners again.
7277  if (is_pre_migrated){
7278  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7279  Tpetra::Distributor distributor(this->mj_problemComm);
7280 
7281  ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7282  mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7283  if (num_incoming_gnos != this->num_local_coords){
7284  throw std::logic_error("Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7285  }
7286  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorPlanCreating");
7287  mj_env->timerStart(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7288  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7289  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7290  {
7291  ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7292  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7293  }
7294  {
7295  ArrayView<mj_part_t> sent_partnos(partId());
7296  distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7297  }
7298  partId = arcp(new mj_part_t[this->num_local_coords],
7299  0, this->num_local_coords, true);
7300 
7301  {
7302 #if defined(__cplusplus) && __cplusplus >= 201103L
7303  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7304  localGidToLid2.reserve(this->num_local_coords);
7305  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7306  localGidToLid2[this->initial_mj_gnos[i]] = i;
7307 
7308 
7309  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7310  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7311  partId[origLID] = received_partids[i];
7312  }
7313 
7314 #else
7315  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7316  localGidToLid2(this->num_local_coords);
7317  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7318  localGidToLid2.put(this->initial_mj_gnos[i], i);
7319 
7320 
7321  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7322  mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7323  partId[origLID] = received_partids[i];
7324  }
7325 
7326 #endif // C++11 is enabled
7327 
7328  }
7329 
7330  {
7331  freeArray<mj_gno_t> (result_initial_mj_gnos);
7332  for (int i = 0; i < this->coord_dim; ++i){
7333  freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7334  }
7335  freeArray<mj_scalar_t *> (result_mj_coordinates);
7336 
7337  for (int i = 0; i < this->num_weights_per_coord; ++i){
7338  freeArray<mj_scalar_t> (result_mj_weights[i]);
7339  }
7340  freeArray<mj_scalar_t *> (result_mj_weights);
7341  freeArray<int> (result_actual_owner_rank);
7342  }
7343  mj_env->timerStop(MACRO_TIMERS, "MultiJagged - PostMigration DistributorMigration");
7344 
7345  }
7346 
7347  solution->setParts(partId);
7348  this->free_work_memory();
7349 }
7350 
// Releases the five work arrays that set_up_partitioning_data() obtains from
// allocMemory(): the pointer tables mj_coordinates, mj_weights and
// mj_part_sizes, and the flag arrays mj_uniform_parts and mj_uniform_weights.
// Only the tables themselves are passed to freeArray(); the coordinate and
// weight values they point at are held by coordinate_ArrayRCP_holder and are
// not touched here.
// partition() calls this as its last step.
// NOTE(review): the listing's own numbering jumps from 7353 to 7355, so the
// line carrying the function name and opening brace is not visible here.
7351 /* \brief Freeing the memory allocated.
7352  * */
7353 template <typename Adapter>
7355  freeArray<mj_scalar_t *>(this->mj_coordinates);
7356  freeArray<mj_scalar_t *>(this->mj_weights);
7357  freeArray<bool>(this->mj_uniform_parts);
7358  freeArray<mj_scalar_t *>(this->mj_part_sizes);
7359  freeArray<bool>(this->mj_uniform_weights);
7360 
7361 }
7362 
7363 /* \brief Sets the partitioning data for multijagged algorithm.
7364  * */
7365 template <typename Adapter>
7366 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7367  const RCP<PartitioningSolution<Adapter> > &solution
7368 )
7369 {
7370  this->coord_dim = this->mj_coords->getCoordinateDim();
7371  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7372  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7373  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7374  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7375 
7376  // From the Solution we get part information.
7377  // If the part sizes for a given criteria are not uniform,
7378  // then they are values that sum to 1.0.
7379  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7380  //allocate only two dimensional pointer.
7381  //raw pointer addresess will be obtained from multivector.
7382  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7383  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7384 
7385  //if the partitioning results are to be uniform.
7386  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7387  //if in a criteria dimension, uniform part is false this shows ratios of
7388  //the target part weights.
7389  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7390  //if the weights of coordinates are uniform in a criteria dimension.
7391  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7392 
7393  typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7394  ArrayView<const mj_gno_t> gnos;
7395  ArrayView<input_t> xyz;
7396  ArrayView<input_t> wgts;
7397 
7398 
7399  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7400 
7401  this->mj_coords->getCoordinates(gnos, xyz, wgts);
7402  //obtain global ids.
7403  ArrayView<const mj_gno_t> mj_gnos = gnos;
7404  this->initial_mj_gnos = mj_gnos.getRawPtr();
7405 
7406  //extract coordinates from multivector.
7407  for (int dim=0; dim < this->coord_dim; dim++){
7408  ArrayRCP<const mj_scalar_t> ar;
7409  xyz[dim].getInputArray(ar); // will copy if stride != 1 or
7410  // adapter_scalar_t != mj_scalar_t
7411  this->coordinate_ArrayRCP_holder[dim] = ar;
7412 
7413  //multiJagged coordinate values assignment
7414  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7415  }
7416 
7417  //if no weights are provided set uniform weight.
7418  if (this->num_weights_per_coord == 0){
7419  this->mj_uniform_weights[0] = true;
7420  this->mj_weights[0] = NULL;
7421  }
7422  else{
7423  //if weights are provided get weights for all weight indices
7424  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7425  ArrayRCP<const mj_scalar_t> ar;
7426  wgts[wdim].getInputArray(ar); // will copy if stride!=1
7427  // or adapter_scalar_t !=
7428  // mj_scalar_t
7429  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7430  this->mj_uniform_weights[wdim] = false;
7431  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7432  }
7433  }
7434 
7435  for (int wdim = 0; wdim < criteria_dim; wdim++){
7436  if (solution->criteriaHasUniformPartSizes(wdim)){
7437  this->mj_uniform_parts[wdim] = true;
7438  this->mj_part_sizes[wdim] = NULL;
7439  }
7440  else{
7441  std::cerr << "MJ does not support non uniform target part weights" << std::endl;
7442  exit(1);
7443  }
7444  }
7445 }
7446 
7447 /* \brief Sets the partitioning parameters for multijagged algorithm.
7448  * \param pl: is the parameter list provided to zoltan2 call
7449  * */
7450 template <typename Adapter>
7451 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
7452 
7453  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
7454  if (pe){
7455  double tol;
7456  tol = pe->getValue(&tol);
7457  this->imbalance_tolerance = tol - 1.0;
7458  }
7459 
7460  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
7461  if (this->imbalance_tolerance <= 0)
7462  this->imbalance_tolerance= 10e-4;
7463 
7464  //if an input partitioning array is provided.
7465  this->part_no_array = NULL;
7466  //the length of the input partitioning array.
7467  this->recursion_depth = 0;
7468 
7469  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
7470  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
7471  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
7472  this->mj_env->debug(2, "mj_parts provided by user");
7473  }
7474 
7475  //get mj specific parameters.
7476  this->distribute_points_on_cut_lines = true;
7477  this->max_concurrent_part_calculation = 1;
7478 
7479  this->mj_run_as_rcb = false;
7480  this->mj_premigration_option = 0;
7481  this->min_coord_per_rank_for_premigration = 32000;
7482 
7483  int mj_user_recursion_depth = -1;
7484  this->mj_keep_part_boxes = false;
7485  this->check_migrate_avoid_migration_option = 0;
7486  this->migration_type = 0;
7487  this->minimum_migration_imbalance = 0.35;
7488 
7489  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
7490  if (pe){
7491  double imb;
7492  imb = pe->getValue(&imb);
7493  this->minimum_migration_imbalance = imb - 1.0;
7494  }
7495 
7496  pe = pl.getEntryPtr("mj_migration_option");
7497  if (pe){
7498  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7499  }else {
7500  this->check_migrate_avoid_migration_option = 0;
7501  }
7502  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7503 
7505  pe = pl.getEntryPtr("mj_migration_type");
7506  if (pe){
7507  this->migration_type = pe->getValue(&this->migration_type);
7508  }else {
7509  this->migration_type = 0;
7510  }
7511  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
7513 
7514  pe = pl.getEntryPtr("mj_concurrent_part_count");
7515  if (pe){
7516  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7517  }else {
7518  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
7519  }
7520 
7521  pe = pl.getEntryPtr("mj_keep_part_boxes");
7522  if (pe){
7523  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7524  }else {
7525  this->mj_keep_part_boxes = false; // Set to invalid value
7526  }
7527 
7528 
7529  // For now, need keep_part_boxes to do pointAssign and boxAssign.
7530  // pe = pl.getEntryPtr("keep_cuts");
7531  // if (pe){
7532  // int tmp = pe->getValue(&tmp);
7533  // if (tmp) this->mj_keep_part_boxes = true;
7534  // }
7535 
7536  //need to keep part boxes if mapping type is geometric.
7537  if (this->mj_keep_part_boxes == false){
7538  pe = pl.getEntryPtr("mapping_type");
7539  if (pe){
7540  int mapping_type = -1;
7541  mapping_type = pe->getValue(&mapping_type);
7542  if (mapping_type == 0){
7543  mj_keep_part_boxes = true;
7544  }
7545  }
7546  }
7547 
7548  //need to keep part boxes if mapping type is geometric.
7549  pe = pl.getEntryPtr("mj_enable_rcb");
7550  if (pe){
7551  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7552  }else {
7553  this->mj_run_as_rcb = false; // Set to invalid value
7554  }
7555 
7556  pe = pl.getEntryPtr("mj_premigration_option");
7557  if (pe){
7558  mj_premigration_option = pe->getValue(&mj_premigration_option);
7559  }else {
7560  mj_premigration_option = 0;
7561  }
7562 
7563  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
7564  if (pe){
7565  min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
7566  }else {
7567  min_coord_per_rank_for_premigration = 32000;
7568  }
7569 
7570  pe = pl.getEntryPtr("mj_recursion_depth");
7571  if (pe){
7572  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7573  }else {
7574  mj_user_recursion_depth = -1; // Set to invalid value
7575  }
7576 
7577  bool val = false;
7578  pe = pl.getEntryPtr("rectilinear");
7579  if (pe) val = pe->getValue(&val);
7580  if (val){
7581  this->distribute_points_on_cut_lines = false;
7582  } else {
7583  this->distribute_points_on_cut_lines = true;
7584  }
7585 
7586  if (this->mj_run_as_rcb){
7587  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7588  }
7589  if (this->recursion_depth < 1){
7590  if (mj_user_recursion_depth > 0){
7591  this->recursion_depth = mj_user_recursion_depth;
7592  }
7593  else {
7594  this->recursion_depth = this->coord_dim;
7595  }
7596  }
7597 
7598  this->num_threads = 1;
7599 #ifdef HAVE_ZOLTAN2_OMP
7600 #pragma omp parallel
7601  {
7602  this->num_threads = omp_get_num_threads();
7603  }
7604 #endif
7605 
7606 }
7607 
// Finds the parts whose kept part boxes overlap the query box [lower, upper].
//
// dim          passed to boxesOverlap() together with lower/upper
// lower, upper corners of the query box
// nPartsFound  out: number of overlapping part boxes; stays 0 when the query
//              box does not overlap the global box
// partsFound   out: NULL when nothing is found, otherwise a new[]-allocated
//              array holding nPartsFound part ids. It is not freed in this
//              function, so presumably the caller must delete[] it -- TODO
//              confirm against callers.
//
// Throws std::logic_error when part boxes are not kept (mj_keep_part_boxes is
// false) or when the vector of part boxes is empty.
//
// NOTE(review): the listing's own numbering skips 7610 and 7664, so the line
// carrying the function name and whatever follows the `try` block are not
// visible here.
7609 template <typename Adapter>
7611  int dim,
7612  adapter_scalar_t *lower,
7613  adapter_scalar_t *upper,
7614  size_t &nPartsFound,
7615  typename Adapter::part_t **partsFound) const
7616 {
7617  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7618  // TODO: complexity. Or at least do a search through the boxes, using
7619  // TODO: p x q x r x ... if possible.
7620 
// Start from the "nothing found" result.
7621  nPartsFound = 0;
7622  *partsFound = NULL;
7623 
7624  if (this->mj_keep_part_boxes) {
7625 
7626  // Get vector of part boxes
7627  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7628 
7629  size_t nBoxes = (*partBoxes).size();
7630  if (nBoxes == 0) {
7631  throw std::logic_error("no part boxes exist");
7632  }
7633 
7634  // Determine whether the box overlaps the globalBox at all
7635  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7636 
7637  if (globalBox->boxesOverlap(dim, lower, upper)) {
7638 
// Part ids are collected here first because the final count is not known
// until every box has been tested.
7639  std::vector<typename Adapter::part_t> partlist;
7640 
7641  // box overlaps the global box; find specific overlapping boxes
7642  for (size_t i = 0; i < nBoxes; i++) {
7643  try {
7644  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7645  nPartsFound++;
7646  partlist.push_back((*partBoxes)[i].getpId());
7647 
7648 // std::cout << "Given box (";
7649 // for (int j = 0; j < dim; j++)
7650 // std::cout << lower[j] << " ";
7651 // std::cout << ") x (";
7652 // for (int j = 0; j < dim; j++)
7653 // std::cout << upper[j] << " ";
7654 // std::cout << ") overlaps PartBox "
7655 // << (*partBoxes)[i].getpId() << " (";
7656 // for (int j = 0; j < dim; j++)
7657 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
7658 // std::cout << ") x (";
7659 // for (int j = 0; j < dim; j++)
7660 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
7661 // std::cout << ")" << std::endl;
7662  }
7663  }
7665  }
// Copy the collected ids into a plain array for the caller.
7666  if (nPartsFound) {
7667  *partsFound = new mj_part_t[nPartsFound];
7668  for (size_t i = 0; i < nPartsFound; i++)
7669  (*partsFound)[i] = partlist[i];
7670  }
7671  }
7672  else {
7673  // Box does not overlap the domain at all. Find the closest part
7674  // Not sure how to perform this operation for MJ without having the
7675  // cuts. With the RCB cuts, the concept of a part extending to
7676  // infinity was natural. With the boxes, it is much more difficult.
7677  // TODO: For now, return information indicating NO OVERLAP.
7678 
7679  }
7680  }
7681  else {
7682  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
7683  }
7684 }
7685 
// Returns the id of the part a point belongs to, using the kept part boxes.
//
// dim    number of coordinates read from `point`
// point  coordinates of the query point
//
// If the point lies inside the global box, the id of the first part box that
// contains it is returned. If it lies outside the global box, the id of the
// part box whose centroid has the smallest squared Euclidean distance to the
// point is returned.
//
// Throws std::logic_error when part boxes are not kept (mj_keep_part_boxes is
// false), when the vector of part boxes is empty, or when the point is inside
// the global box but no part box contains it.
//
// NOTE(review): the listing's own numbering skips 7688 and 7727, so the line
// carrying the return type/function name and whatever follows the `try`
// block are not visible here.
7687 template <typename Adapter>
7689  int dim,
7690  adapter_scalar_t *point) const
7691 {
7692 
7693  // TODO: Implement with cuts rather than boxes to reduce algorithmic
7694  // TODO: complexity. Or at least do a search through the boxes, using
7695  // TODO: p x q x r x ... if possible.
7696 
7697  if (this->mj_keep_part_boxes) {
7698  typename Adapter::part_t foundPart = -1;
7699 
7700  // Get vector of part boxes
7701  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7702 
7703  size_t nBoxes = (*partBoxes).size();
7704  if (nBoxes == 0) {
7705  throw std::logic_error("no part boxes exist");
7706  }
7707 
7708  // Determine whether the point is within the global domain
7709  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7710 
7711  if (globalBox->pointInBox(dim, point)) {
7712 
7713  // point is in the global domain; determine in which part it is.
// i is declared outside the loop so that it can be compared with nBoxes
// afterwards to detect that no box matched.
7714  size_t i;
7715  for (i = 0; i < nBoxes; i++) {
7716  try {
7717  if ((*partBoxes)[i].pointInBox(dim, point)) {
7718  foundPart = (*partBoxes)[i].getpId();
7719 // std::cout << "Point (";
7720 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
7721 // std::cout << ") found in box " << i << " part " << foundPart
7722 // << std::endl;
7723 // (*partBoxes)[i].print();
7724  break;
7725  }
7726  }
7728  }
7729 
7730  if (i == nBoxes) {
7731  // This error should never occur
7732  std::ostringstream oss;
7733  oss << "Point (";
7734  for (int j = 0; j < dim; j++) oss << point[j] << " ";
7735  oss << ") not found in domain";
7736  throw std::logic_error(oss.str());
7737  }
7738  }
7739 
7740  else {
7741  // Point is outside the global domain.
7742  // Determine to which part it is closest.
7743  // TODO: with cuts, would not need this special case
7744 
7745  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7746  size_t closestBox = 0;
7747  coord_t minDistance = std::numeric_limits<coord_t>::max();
// Scratch buffer that receives each box's centroid in turn.
7748  coord_t *centroid = new coord_t[dim];
7749  for (size_t i = 0; i < nBoxes; i++) {
7750  (*partBoxes)[i].computeCentroid(centroid);
// Squared distance is enough for finding the minimum; no square root is taken.
7751  coord_t sum = 0.;
7752  coord_t diff;
7753  for (int j = 0; j < dim; j++) {
7754  diff = centroid[j] - point[j];
7755  sum += diff * diff;
7756  }
// Strict comparison: on a tie the box with the lower index is kept.
7757  if (sum < minDistance) {
7758  minDistance = sum;
7759  closestBox = i;
7760  }
7761  }
7762  foundPart = (*partBoxes)[closestBox].getpId();
7763  delete [] centroid;
7764  }
7765 
7766  return foundPart;
7767  }
7768  else {
7769  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
7770  }
7771 }
7772 
7773 template <typename Adapter>
7775  const PartitioningSolution<Adapter> * /* solution */,
7776  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
7777  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
7778 {
7779  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7780  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7781  mj_part_t ntasks = (*pBoxes).size();
7782  int dim = (*pBoxes)[0].getDim();
7783  GridHash grid(pBoxes, ntasks, dim);
7784  grid.getAdjArrays(comXAdj_, comAdj_);
7785  }
7786  comAdj = comAdj_;
7787  comXAdj = comXAdj_;
7788 }
7789 
7790 
// Returns the part boxes kept by the underlying partitioner; the call is
// forwarded unchanged to mj_partitioner.get_kept_boxes().
// boxAssign(), pointAssign() and getCommunicationGraph() obtain their part
// boxes through this accessor.
// NOTE(review): the listing's own numbering skips 7793, so the line carrying
// the function name is not visible here.
7791 template <typename Adapter>
7792 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7794 {
7795  return this->mj_partitioner.get_kept_boxes();
7796 }
7797 
7798 
7799 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7800  typename mj_part_t>
7801 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7803 {
7804  if (this->mj_keep_part_boxes)
7805  return this->kept_boxes;
7806  else
7807  throw std::logic_error("Error: part boxes are not stored.");
7808 }
7809 
7810 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7811  typename mj_part_t>
7812 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7814  RCP<mj_partBoxVector_t> &localPartBoxes
7815 ) const
7816 {
7817  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
7818  mj_part_t ntasks = this->num_global_parts;
7819  int dim = (*localPartBoxes)[0].getDim();
7820  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
7821 
7822  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7823 
7824  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
7825  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
7826 
7827  coord_t *localPartMins = localPartBoundaries;
7828  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7829 
7830  coord_t *globalPartMins = globalPartBoundaries;
7831  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7832 
7833  mj_part_t boxCount = localPartBoxes->size();
7834  for (mj_part_t i = 0; i < boxCount; ++i){
7835  mj_part_t pId = (*localPartBoxes)[i].getpId();
7836  //std::cout << "me:" << comm->getRank() << " has:" << pId << std::endl;
7837 
7838  coord_t *lmins = (*localPartBoxes)[i].getlmins();
7839  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7840 
7841  for (int j = 0; j < dim; ++j){
7842  localPartMins[dim * pId + j] = lmins[j];
7843  localPartMaxs[dim * pId + j] = lmaxs[j];
7844  /*
7845  std::cout << "me:" << comm->getRank() <<
7846  " dim * pId + j:"<< dim * pId + j <<
7847  " localMin:" << localPartMins[dim * pId + j] <<
7848  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
7849  */
7850  }
7851  }
7852 
7853  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
7854 
7855  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7856  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7857  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
7858  for (mj_part_t i = 0; i < ntasks; ++i){
7859  Zoltan2::coordinateModelPartBox tpb(i, dim, globalPartMins + dim * i,
7860  globalPartMaxs + dim * i);
7861 
7862  /*
7863  for (int j = 0; j < dim; ++j){
7864  std::cout << "me:" << comm->getRank() <<
7865  " dim * pId + j:"<< dim * i + j <<
7866  " globalMin:" << globalPartMins[dim * i + j] <<
7867  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
7868  }
7869  */
7870  pB->push_back(tpb);
7871  }
7872  delete []localPartBoundaries;
7873  delete []globalPartBoundaries;
7874  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
7875  return pB;
7876 }
7877 } // namespace Zoltan2
7878 
7879 #endif
#define ZOLTAN2_ABS(x)
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
#define MIN_WORK_LAST_DIM
#define LEAST_SIGNIFICANCE
#define SIGNIFICANCE_MUL
#define FUTURE_REDUCEALL_CUTOFF
#define imbalanceOf2(Wachieved, wExpected)
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation that all-reduces all of the box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
RCP< mj_partBoxVector_t > get_kept_boxes() const
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const mj_part_t *first_level_distribution_=NULL)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Tpetra::global_size_t global_size_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void freeArray(T *&array)
Frees the given array.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
T * allocMemory(size_t size)
Allocates memory for the given size.
dictionary vals
Definition: xml2dox.py:186
#define epsilon
SparseMatrixAdapter_t::part_t part_t
static ArrayRCP< ArrayRCP< zscalar_t > > weights
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.