Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
56 
57 #include <Tpetra_Distributor.hpp>
58 #include <Teuchos_ParameterList.hpp>
60 #include <new> // ::operator new[]
61 #include <algorithm> // std::sort
62 #include <Zoltan2_Util.hpp>
63 #include <vector>
64 
65 #if defined(__cplusplus) && __cplusplus >= 201103L
66 #include <unordered_map>
67 #else
68 #include <Teuchos_Hashtable.hpp>
69 #endif // C++11 is enabled
70 
71 #ifdef ZOLTAN2_USEZOLTANCOMM
72 #ifdef HAVE_ZOLTAN2_MPI
73 #define ENABLE_ZOLTAN_MIGRATION
74 #include "zoltan_comm_cpp.h"
75 #include "zoltan_types.h" // for error codes
76 #endif
77 #endif
78 
79 #ifdef HAVE_ZOLTAN2_OMP
80 #include <omp.h>
81 #endif
82 
83 #define LEAST_SIGNIFICANCE 0.0001
84 #define SIGNIFICANCE_MUL 1000
85 
86 //If (the last-dimension reduce-all count) x (the MPI world size) is
87 //estimated to be bigger than this number, then migration will be forced
88 //in earlier iterations.
89 #define FUTURE_REDUCEALL_CUTOFF 1500000
90 //if parts right before last dimension are estimated to have less than
91 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
92 #define MIN_WORK_LAST_DIM 1000
93 
94 
95 
96 
// Absolute value for any signed arithmetic type.
#define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
// Imbalance calculation: Wreal / Wexpected - 1.
// Fully parenthesized so each macro behaves as a single expression
// when embedded in a larger one (e.g. multiplied or compared).
#define imbalanceOf(Wachieved, totalW, expectedRatio) \
    (((Wachieved) / ((totalW) * (expectedRatio))) - 1)
#define imbalanceOf2(Wachieved, wExpected) \
    (((Wachieved) / (wExpected)) - 1)


// Swaps a and b using temp as scratch.
// Deliberately NOT wrapped in do{...}while(0): existing call sites
// invoke this macro without a trailing semicolon.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
106 
107 
108 namespace Teuchos{
109 
114 template <typename Ordinal, typename T>
115 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
116 {
117 private:
118  Ordinal size;
119  T _EPSILON;
120 
121 public:
124  Zoltan2_BoxBoundaries ():size(0), _EPSILON (std::numeric_limits<T>::epsilon()){}
125 
132  Zoltan2_BoxBoundaries (Ordinal s_):
133  size(s_), _EPSILON (std::numeric_limits<T>::epsilon()){}
134 
137  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
138  {
139  for (Ordinal i=0; i < count; i++){
140  if (Z2_ABS(inBuffer[i]) > _EPSILON){
141  inoutBuffer[i] = inBuffer[i];
142  }
143  }
144  }
145 };
146 } // namespace Teuchos
147 
148 namespace Zoltan2{
149 
/*! \brief Allocates a new array with the given number of elements.
 *  \param size the number of elements to allocate; may be zero.
 *  \return pointer to the new array, or NULL when size == 0.
 *
 *  Note: plain operator new[] throws std::bad_alloc on failure and
 *  never returns NULL, so the former unreachable NULL check (which
 *  threw a string literal) has been removed; behavior for all
 *  reachable paths is unchanged.
 */
template <typename T>
T *allocMemory(size_t size){
    if (size > 0){
        return new T[size];
    }
    return NULL;
}
166 
/*! \brief Frees an array obtained from allocMemory and resets the
 *  caller's pointer to NULL. Calling with a NULL pointer is a no-op.
 */
template <typename T>
void freeArray(T *&array){
    if (array == NULL){
        return;
    }
    delete [] array;
    array = NULL;
}
177 
/*! \brief Converts the given object to a string via stream insertion
 *  and extraction. As in the original, extraction reads back a single
 *  whitespace-delimited token.
 */
template <typename tt>
std::string toString(tt obj){
    std::stringstream buffer (std::stringstream::in |std::stringstream::out);
    buffer << obj;
    std::string token;
    buffer >> token;
    return token;
}
189 
/*! \brief Class for sorting items with multiple comparison values.
 *  Comparison is lexicographic over val[0..count-1]; two values closer
 *  than _EPSILON are treated as equal, and fully-equal items are
 *  ordered by their index.
 *
 *  NOTE(review): the documentation-extraction of this header had
 *  stripped the class header, the _EPSILON member declaration, and the
 *  constructor/operator= signatures; they are reconstructed here
 *  exactly as implied by the surviving member bodies. The former use
 *  of the file-local ZOLTAN2_ABS macro is replaced by an equivalent
 *  inline computation so the class has no macro dependency.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
    IT index;       // original index of the item (tie-breaker)
    CT count;       // number of comparison values in val
    WT *val;        // comparison values; NOT owned by this class
    WT _EPSILON;    // tolerance below which two values compare equal

    uMultiSortItem(){
        this->index = 0;
        this->count = 0;
        this->val = NULL;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;      // shallow copy; val is shared, not owned
        this->_EPSILON = other._EPSILON;
    }

    ~uMultiSortItem(){
        // val is managed by the caller; never freed here.
        //freeArray<WT>(this->val);
    }

    /*! \brief Re-initializes the item without touching _EPSILON. */
    void set(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
    }

    uMultiSortItem<IT,CT,WT>& operator=(const uMultiSortItem<IT,CT,WT>& other){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;      // shallow copy, matching the copy ctor
        return *(this);
    }

    /*! \brief Lexicographic less-than over the value arrays;
     *  index breaks full ties. Both items must have equal count. */
    bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            WT diff = this->val[i] - other.val[i];
            if (diff < 0) diff = -diff;
            //if the values are equal within tolerance, go to the next one.
            if (diff < this->_EPSILON){
                continue;
            }
            //first differing value decides the order.
            return this->val[i] < other.val[i];
        }
        //if they are totally equal, order by index.
        return this->index < other.index;
    }

    /*! \brief Lexicographic greater-than; mirror of operator<. */
    bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            WT diff = this->val[i] - other.val[i];
            if (diff < 0) diff = -diff;
            //if the values are equal within tolerance, go to the next one.
            if (diff < this->_EPSILON){
                continue;
            }
            //first differing value decides the order.
            return this->val[i] > other.val[i];
        }
        //if they are totally equal, order by index.
        return this->index > other.index;
    }
};// uMultiSortItem;
287 
/*! \brief Sort item used in uqsort: a record id paired with the
 *  scalar key it is ordered by.
 */
template <class IT, class WT>
struct uSortItem
{
    IT id;      // identifier carried along with the sort key
    WT val;     // sort key
};// uSortItem;

/*! \brief Orders uSortItems by their val field (backward-compatible
 *  addition enabling standard-library sorting). */
template <class IT, class WT>
inline bool operator<(const uSortItem<IT, WT> &lhs,
                      const uSortItem<IT, WT> &rhs)
{
    return lhs.val < rhs.val;
}

/*! \brief Sorts arr (of length n) in increasing order of val.
 *  \param n number of elements in arr.
 *  \param arr the array to sort in place.
 *
 *  Replaces the previous hand-written Numerical-Recipes-style
 *  quicksort, which aborted the whole process with exit(1) whenever
 *  its fixed 50-entry partition stack overflowed. std::sort provides
 *  guaranteed O(n log n) behavior with no such failure mode. As
 *  before, the relative order of items with equal val is unspecified.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr)
{
    std::sort(arr, arr + n);
}
383 
384 
385 
389 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
390  typename mj_part_t>
391 class AlgMJ
392 {
393 private:
395  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
396 
397  RCP<const Environment> mj_env; //the environment object
398  RCP<Comm<int> > mj_problemComm; //initial comm object
399 
400  double imbalance_tolerance; //input imbalance tolerance.
401  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
402  int recursion_depth; //the number of steps that partitioning will be solved in.
403  int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord
404 
405  size_t initial_num_loc_coords; //initial num local coords.
406  global_size_t initial_num_glob_coords; //initial num global coords.
407 
408  mj_lno_t num_local_coords; //number of local coords.
409  mj_gno_t num_global_coords; //number of global coords.
410 
411  mj_scalar_t **mj_coordinates; //two dimension coordinate array
412  mj_scalar_t **mj_weights; //two dimension weight array
413  bool *mj_uniform_parts; //if the target parts are uniform
414  mj_scalar_t **mj_part_sizes; //target part weight sizes.
415  bool *mj_uniform_weights; //if the coordinates have uniform weights.
416 
417  ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
418  size_t num_global_parts; //the targeted number of parts
419 
420  mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
421  mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
422  int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.
423 
424  mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
425  mj_lno_t *new_coordinate_permutations; //permutation work array.
426  mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.
427 
428  mj_lno_t *part_xadj; //beginning and end of each part.
429  mj_lno_t *new_part_xadj; // work array for beginning and end of each part.
430 
431  //get mj specific parameters.
432  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
433  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
434 
435  int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
436  int mj_user_recursion_depth; //the recursion depth value provided by user.
437  int mj_keep_part_boxes; //if the boxes need to be kept.
438 
439  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
440  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
441  int num_threads; //num threads
442 
443  mj_part_t total_num_cut ; //how many cuts will be totally
444  mj_part_t total_num_part; //how many parts will be totally
445 
446  mj_part_t max_num_part_along_dim ; //maximum part count along a dimension.
447  mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
448  size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.
449 
450  mj_part_t total_dim_num_reduce_all; //estimate on #reduceAlls can be done.
451  mj_part_t last_dim_num_part; //max no of parts that might occur
452  //during the partition before the
453  //last partitioning dimension.
454 
455  RCP<Comm<int> > comm; //comm object than can be altered during execution
456  float fEpsilon; //epsilon for float
457  mj_scalar_t sEpsilon; //epsilon for mj_scalar_t
458 
459  mj_scalar_t maxScalar_t; //max possible scalar
460  mj_scalar_t minScalar_t; //min scalar
461 
462  mj_scalar_t *all_cut_coordinates;
463  mj_scalar_t *max_min_coords;
464  mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline
465  mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each outline
466 
467  // work array to manipulate coordinate of cutlines in different iterations.
468  //necessary because previous cut line information is used for determining
469  //the next cutline information. therefore, cannot update the cut work array
470  //until all cutlines are determined.
471  mj_scalar_t *cut_coordinates_work_array;
472 
473  //cumulative part weight array.
474  mj_scalar_t *target_part_weights;
475 
476  mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line
477  mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line
478  mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line
479  mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line
480 
481  mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
482  mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.
483 
484  //isDone is used to determine if a cutline is determined already.
485  //If a cut line is already determined, the next iterations will skip this cut line.
486  bool *is_cut_line_determined;
487  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
488  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
489  mj_part_t *my_incomplete_cut_count;
490  //local part weights of each thread.
491  double **thread_part_weights;
492  //the work manupulation array for partweights.
493  double **thread_part_weight_work;
494 
495  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
496  mj_scalar_t **thread_cut_left_closest_point;
497  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
498  mj_scalar_t **thread_cut_right_closest_point;
499 
500  //to store how many points in each part a thread has.
501  mj_lno_t **thread_point_counts;
502 
503  mj_scalar_t *process_rectilinear_cut_weight;
504  mj_scalar_t *global_rectilinear_cut_weight;
505 
506  //for faster communication, concatanation of
507  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
508  //leftClosest distances sized P-1, since P-1 cut lines
509  //rightClosest distances size P-1, since P-1 cut lines.
510  mj_scalar_t *total_part_weight_left_right_closests ;
511  mj_scalar_t *global_total_part_weight_left_right_closests;
512 
513  RCP<mj_partBoxVector_t> kept_boxes; // vector of all boxes for all parts;
514  // constructed only if
515  // mj_keep_part_boxes == true
516  RCP<mj_partBox_t> global_box;
517  int myRank, myActualRank; //processor rank, and initial rank
518 
519  /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
520  * the input. part_no_array takes
521  * precedence if both are provided.
522  * Depending on these parameters, total cut/part number,
523  * maximum part/cut number along a dimension, estimated number of reduceAlls,
524  * and the number of parts before the last dimension is calculated.
525  * */
526  void set_part_specifications();
527 
528  /* \brief Tries to determine the part number for current dimension,
529  * by trying to make the partitioning as square as possible.
530  * \param num_total_future how many more partitionings are required.
531  * \param root how many more recursion depth is left.
532  */
533  inline mj_part_t get_part_count(
534  mj_part_t num_total_future,
535  double root);
536 
537  /* \brief Allocates the all required memory for the mj partitioning algorithm.
538  *
539  */
540  void allocate_set_work_memory();
541 
542  /* \brief for part communication we keep track of the box boundaries.
543  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
544  * This function initializes a single box with all global min and max coordinates.
545  * \param initial_partitioning_boxes the input and output vector for boxes.
546  */
547  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
548 
549  /* \brief compute global bounding box: min/max coords of global domain */
550  void compute_global_box();
551 
552  /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
553  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
554  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
555  * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor.
556  *
557  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
558  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
559  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
560  * \param future_num_parts: output, max number of future parts that will be obtained from a single
561  * \param current_num_parts: input, how many parts are there currently.
562  * \param current_iteration: input, current dimension iteration number.
563  * \param input_part_boxes: input, if boxes are kept, current boxes.
564  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
565  */
566  mj_part_t update_part_num_arrays(
567  std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
568  std::vector<mj_part_t> *future_num_part_in_parts,
569  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
570  mj_part_t &future_num_parts,
571  mj_part_t current_num_parts,
572  int current_iteration,
573  RCP<mj_partBoxVector_t> input_part_boxes,
574  RCP<mj_partBoxVector_t> output_part_boxes);
575 
587  void mj_get_local_min_max_coord_totW(
588  mj_lno_t coordinate_begin_index,
589  mj_lno_t coordinate_end_index,
590  mj_lno_t *mj_current_coordinate_permutations,
591  mj_scalar_t *mj_current_dim_coords,
592  mj_scalar_t &min_coordinate,
593  mj_scalar_t &max_coordinate,
594  mj_scalar_t &total_weight);
595 
603  void mj_get_global_min_max_coord_totW(
604  mj_part_t current_concurrent_num_parts,
605  mj_scalar_t *local_min_max_total,
606  mj_scalar_t *global_min_max_total);
607 
626  void mj_get_initial_cut_coords_target_weights(
627  mj_scalar_t min_coord,
628  mj_scalar_t max_coord,
629  mj_part_t num_cuts/*p-1*/ ,
630  mj_scalar_t global_weight,
631  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
632  mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
633 
634  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
635  std::vector <mj_part_t> *next_future_num_parts_in_parts,
636  mj_part_t concurrent_current_part,
637  mj_part_t obtained_part_index);
638 
651  void set_initial_coordinate_parts(
652  mj_scalar_t &max_coordinate,
653  mj_scalar_t &min_coordinate,
654  mj_part_t &concurrent_current_part_index,
655  mj_lno_t coordinate_begin_index,
656  mj_lno_t coordinate_end_index,
657  mj_lno_t *mj_current_coordinate_permutations,
658  mj_scalar_t *mj_current_dim_coords,
659  mj_part_t *mj_part_ids,
660  mj_part_t &partition_count);
661 
672  void mj_1D_part(
673  mj_scalar_t *mj_current_dim_coords,
674  mj_scalar_t imbalanceTolerance,
675  mj_part_t current_work_part,
676  mj_part_t current_concurrent_num_parts,
677  mj_scalar_t *current_cut_coordinates,
678  mj_part_t total_incomplete_cut_count,
679  std::vector <mj_part_t> &num_partitioning_in_current_dim);
680 
700  void mj_1D_part_get_thread_part_weights(
701  size_t total_part_count,
702  mj_part_t num_cuts,
703  mj_scalar_t max_coord,
704  mj_scalar_t min_coord,
705  mj_lno_t coordinate_begin_index,
706  mj_lno_t coordinate_end_index,
707  mj_scalar_t *mj_current_dim_coords,
708  mj_scalar_t *temp_current_cut_coords,
709  bool *current_cut_status,
710  double *my_current_part_weights,
711  mj_scalar_t *my_current_left_closest,
712  mj_scalar_t *my_current_right_closest);
713 
721  void mj_accumulate_thread_results(
722  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
723  mj_part_t current_work_part,
724  mj_part_t current_concurrent_num_parts);
725 
756  void mj_get_new_cut_coordinates(
757  const size_t &num_total_part,
758  const mj_part_t &num_cuts,
759  const mj_scalar_t &max_coordinate,
760  const mj_scalar_t &min_coordinate,
761  const mj_scalar_t &global_total_weight,
762  const mj_scalar_t &used_imbalance_tolerance,
763  mj_scalar_t * current_global_part_weights,
764  const mj_scalar_t * current_local_part_weights,
765  const mj_scalar_t *current_part_target_weights,
766  bool *current_cut_line_determined,
767  mj_scalar_t *current_cut_coordinates,
768  mj_scalar_t *current_cut_upper_bounds,
769  mj_scalar_t *current_cut_lower_bounds,
770  mj_scalar_t *current_global_left_closest_points,
771  mj_scalar_t *current_global_right_closest_points,
772  mj_scalar_t * current_cut_lower_bound_weights,
773  mj_scalar_t * current_cut_upper_weights,
774  mj_scalar_t *new_current_cut_coordinates,
775  mj_scalar_t *current_part_cut_line_weight_to_put_left,
776  mj_part_t *rectilinear_cut_count,
777  mj_part_t &my_num_incomplete_cut);
778 
788  void mj_calculate_new_cut_position (
789  mj_scalar_t cut_upper_bound,
790  mj_scalar_t cut_lower_bound,
791  mj_scalar_t cut_upper_weight,
792  mj_scalar_t cut_lower_weight,
793  mj_scalar_t expected_weight,
794  mj_scalar_t &new_cut_position);
795 
806  void mj_create_new_partitions(
807  mj_part_t num_parts,
808  mj_scalar_t *mj_current_dim_coords,
809  mj_scalar_t *current_concurrent_cut_coordinate,
810  mj_lno_t coordinate_begin,
811  mj_lno_t coordinate_end,
812  mj_scalar_t *used_local_cut_line_weight_to_left,
813  double **used_thread_part_weight_work,
814  mj_lno_t *out_part_xadj);
815 
838  bool mj_perform_migration(
839  mj_part_t in_num_parts, //current umb parts
840  mj_part_t &out_num_parts, //output umb parts.
841  std::vector<mj_part_t> *next_future_num_parts_in_parts,
842  mj_part_t &output_part_begin_index,
843  size_t migration_reduce_all_population,
844  mj_lno_t num_coords_for_last_dim_part,
845  std::string iteration,
846  RCP<mj_partBoxVector_t> &input_part_boxes,
847  RCP<mj_partBoxVector_t> &output_part_boxes);
848 
858  void get_processor_num_points_in_parts(
859  mj_part_t num_procs,
860  mj_part_t num_parts,
861  mj_gno_t *&num_points_in_all_processor_parts);
862 
875  bool mj_check_to_migrate(
876  size_t migration_reduce_all_population,
877  mj_lno_t num_coords_for_last_dim_part,
878  mj_part_t num_procs,
879  mj_part_t num_parts,
880  mj_gno_t *num_points_in_all_processor_parts);
881 
882 
900  void mj_migration_part_proc_assignment(
901  mj_gno_t * num_points_in_all_processor_parts,
902  mj_part_t num_parts,
903  mj_part_t num_procs,
904  mj_lno_t *send_count_to_each_proc,
905  std::vector<mj_part_t> &processor_ranks_for_subcomm,
906  std::vector<mj_part_t> *next_future_num_parts_in_parts,
907  mj_part_t &out_num_part,
908  std::vector<mj_part_t> &out_part_indices,
909  mj_part_t &output_part_numbering_begin_index,
910  int *coordinate_destinations);
911 
928  void mj_assign_proc_to_parts(
929  mj_gno_t * num_points_in_all_processor_parts,
930  mj_part_t num_parts,
931  mj_part_t num_procs,
932  mj_lno_t *send_count_to_each_proc,
933  std::vector<mj_part_t> &processor_ranks_for_subcomm,
934  std::vector<mj_part_t> *next_future_num_parts_in_parts,
935  mj_part_t &out_part_index,
936  mj_part_t &output_part_numbering_begin_index,
937  int *coordinate_destinations);
938 
949  void assign_send_destinations(
950  mj_part_t num_parts,
951  mj_part_t *part_assignment_proc_begin_indices,
952  mj_part_t *processor_chains_in_parts,
953  mj_lno_t *send_count_to_each_proc,
954  int *coordinate_destinations);
955 
968  void assign_send_destinations2(
969  mj_part_t num_parts,
970  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
971  int *coordinate_destinations,
972  mj_part_t &output_part_numbering_begin_index,
973  std::vector<mj_part_t> *next_future_num_parts_in_parts);
974 
991  void mj_assign_parts_to_procs(
992  mj_gno_t * num_points_in_all_processor_parts,
993  mj_part_t num_parts,
994  mj_part_t num_procs,
995  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
996  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
997  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
998  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
999  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
1000  int *coordinate_destinations);
1001 
1014  void mj_migrate_coords(
1015  mj_part_t num_procs,
1016  mj_lno_t &num_new_local_points,
1017  std::string iteration,
1018  int *coordinate_destinations,
1019  mj_part_t num_parts);
1020 
1027  void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1028 
1029 
1035  void fill_permutation_array(
1036  mj_part_t output_num_parts,
1037  mj_part_t num_parts);
1038 
1047  void set_final_parts(
1048  mj_part_t current_num_parts,
1049  mj_part_t output_part_begin_index,
1050  RCP<mj_partBoxVector_t> &output_part_boxes,
1051  bool is_data_ever_migrated);
1054  void free_work_memory();
1068  void create_consistent_chunks(
1069  mj_part_t num_parts,
1070  mj_scalar_t *mj_current_dim_coords,
1071  mj_scalar_t *current_concurrent_cut_coordinate,
1072  mj_lno_t coordinate_begin,
1073  mj_lno_t coordinate_end,
1074  mj_scalar_t *used_local_cut_line_weight_to_left,
1075  mj_lno_t *out_part_xadj,
1076  int coordInd);
1077 public:
1078  AlgMJ();
1079 
1108  void multi_jagged_part(
1109  const RCP<const Environment> &env,
1110  RCP<Comm<int> > &problemComm,
1111 
1112  double imbalance_tolerance,
1113  size_t num_global_parts,
1114  mj_part_t *part_no_array,
1115  int recursion_depth,
1116 
1117  int coord_dim,
1118  mj_lno_t num_local_coords,
1119  mj_gno_t num_global_coords,
1120  const mj_gno_t *initial_mj_gnos,
1121  mj_scalar_t **mj_coordinates,
1122 
1123  int num_weights_per_coord,
1124  bool *mj_uniform_weights,
1125  mj_scalar_t **mj_weights,
1126  bool *mj_uniform_parts,
1127  mj_scalar_t **mj_part_sizes,
1128 
1129  mj_part_t *&result_assigned_part_ids,
1130  mj_gno_t *&result_mj_gnos
1131 
1132  );
1140  void set_partitioning_parameters(
1141  bool distribute_points_on_cut_lines_,
1142  int max_concurrent_part_calculation_,
1143  int check_migrate_avoid_migration_option_,
1144  mj_scalar_t minimum_migration_imbalance_);
1148  void set_to_keep_part_boxes();
1149 
1152  RCP<mj_partBox_t> get_global_box() const;
1153 
1154  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1155 
1156  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1157  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1158 
1183  void sequential_task_partitioning(
1184  const RCP<const Environment> &env,
1185  mj_lno_t num_total_coords,
1186  mj_lno_t num_selected_coords,
1187  size_t num_target_part,
1188  int coord_dim,
1189  mj_scalar_t **mj_coordinates,
1190  mj_lno_t *initial_selected_coords_output_permutation,
1191  mj_lno_t *output_xadj,
1192  int recursion_depth,
1193  const mj_part_t *part_no_array);
1194 
1195 };
1196 
1221 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1222  typename mj_part_t>
1224  const RCP<const Environment> &env,
1225  mj_lno_t num_total_coords,
1226  mj_lno_t num_selected_coords,
1227  size_t num_target_part,
1228  int coord_dim_,
1229  mj_scalar_t **mj_coordinates_,
1230  mj_lno_t *inital_adjList_output_adjlist,
1231  mj_lno_t *output_xadj,
1232  int rd,
1233  const mj_part_t *part_no_array_
1234 ){
1235 
1236  this->mj_env = env;
1237  const RCP<Comm<int> > commN;
1238  this->comm = this->mj_problemComm = Teuchos::rcp_const_cast<Comm<int> >
1239  (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
1240  this->myActualRank = this->myRank = 1;
1241 
1242 #ifdef HAVE_ZOLTAN2_OMP
1243  int actual_num_threads = omp_get_num_threads();
1244  omp_set_num_threads(1);
1245 #endif
1246 
1247  //weights are uniform for task mapping
1248 
1249  //parts are uniform for task mapping
1250  //as input indices.
1251 
1252  this->imbalance_tolerance = 0;
1253  this->num_global_parts = num_target_part;
1254  this->part_no_array = (mj_part_t *)part_no_array_;
1255  this->recursion_depth = rd;
1256 
1257  this->coord_dim = coord_dim_;
1258  this->num_local_coords = num_total_coords;
1259  this->num_global_coords = num_total_coords;
1260  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
1261 
1264  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1265 
1266  this->num_weights_per_coord = 0;
1267  bool *tmp_mj_uniform_weights = new bool[1];
1268  this->mj_uniform_weights = tmp_mj_uniform_weights ;
1269  this->mj_uniform_weights[0] = true;
1270 
1271  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
1272  this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights
1273 
1274  bool *tmp_mj_uniform_parts = new bool[1];
1275  this->mj_uniform_parts = tmp_mj_uniform_parts;
1276  this->mj_uniform_parts[0] = true;
1277 
1278  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
1279  this->mj_part_sizes = tmp_mj_part_sizes;
1280  this->mj_part_sizes[0] = NULL;
1281 
1282  this->num_threads = 1;
1283  this->set_part_specifications();
1284 
1285  this->allocate_set_work_memory();
1286  //the end of the initial partition is the end of coordinates.
1287  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
1288  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1289  this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1290  }
1291 
1292  mj_part_t current_num_parts = 1;
1293 
1294  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1295 
1296  mj_part_t future_num_parts = this->total_num_part;
1297 
1298  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
1299  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
1300  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1301  RCP<mj_partBoxVector_t> t1;
1302  RCP<mj_partBoxVector_t> t2;
1303 
1304  for (int i = 0; i < this->recursion_depth; ++i){
1305 
1306  //partitioning array. size will be as the number of current partitions and this
1307  //holds how many parts that each part will be in the current dimension partitioning.
1308  std::vector <mj_part_t> num_partitioning_in_current_dim;
1309 
1310  //number of parts that will be obtained at the end of this partitioning.
1311  //future_num_part_in_parts is as the size of current number of parts.
1312  //holds how many more parts each should be divided in the further
1313  //iterations. this will be used to calculate num_partitioning_in_current_dim,
1314  //as the number of parts that the part will be partitioned
1315  //in the current dimension partitioning.
1316 
1317  //next_future_num_parts_in_parts will be as the size of outnumParts,
1318  //and this will hold how many more parts that each output part
1319  //should be divided. this array will also be used to determine the weight ratios
1320  //of the parts.
1321  //swap the arrays to use iteratively..
1322  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1323  future_num_part_in_parts = next_future_num_parts_in_parts;
1324  next_future_num_parts_in_parts = tmpPartVect;
1325 
1326  //clear next_future_num_parts_in_parts array as
1327  //getPartitionArrays expects it to be empty.
1328  //it also expects num_partitioning_in_current_dim to be empty as well.
1329  next_future_num_parts_in_parts->clear();
1330 
1331 
1332  //returns the total number of output parts for this dimension partitioning.
1333  mj_part_t output_part_count_in_dimension =
1334  this->update_part_num_arrays(
1335  num_partitioning_in_current_dim,
1336  future_num_part_in_parts,
1337  next_future_num_parts_in_parts,
1338  future_num_parts,
1339  current_num_parts,
1340  i,
1341  t1,
1342  t2);
1343 
 //if the number of obtained parts equals the current number of parts,
 //skip this dimension. For example, this happens when 1 is given in the
 //input part array, e.g., P=4,5,1,2.
1347  if(output_part_count_in_dimension == current_num_parts) {
1348  tmpPartVect= future_num_part_in_parts;
1349  future_num_part_in_parts = next_future_num_parts_in_parts;
1350  next_future_num_parts_in_parts = tmpPartVect;
1351  continue;
1352  }
1353 
1354  //get the coordinate axis along which the partitioning will be done.
1355  int coordInd = i % this->coord_dim;
1356  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1357  //convert i to string to be used for debugging purposes.
1358  std::string istring = toString<int>(i);
1359 
1360  //alloc Memory to point the indices
1361  //of the parts in the permutation array.
1362  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1363 
1364  //the index where in the outtotalCounts will be written.
1365  mj_part_t output_part_index = 0;
1366  //whatever is written to outTotalCounts will be added with previousEnd
1367  //so that the points will be shifted.
1368  mj_part_t output_coordinate_end_index = 0;
1369 
1370  mj_part_t current_work_part = 0;
1371  mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1372  this->max_concurrent_part_calculation);
1373 
1374  mj_part_t obtained_part_index = 0;
1375 
1376  //run for all available parts.
1377  for (; current_work_part < current_num_parts;
1378  current_work_part += current_concurrent_num_parts){
1379 
1380  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1381  this->max_concurrent_part_calculation);
1382 
1383  mj_part_t actual_work_part_count = 0;
1384  //initialization for 1D partitioning.
1385  //get the min and max coordinates of each part
1386  //together with the part weights of each part.
1387  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1388  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1389 
1390  //if this part wont be partitioned any further
1391  //dont do any work for this part.
1392  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1393  continue;
1394  }
1395  ++actual_work_part_count;
1396  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1397  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1398 
1399  /*
1400  std::cout << "i:" << i << " j:" << current_work_part + kk
1401  << " coordinate_begin_index:" << coordinate_begin_index
1402  << " coordinate_end_index:" << coordinate_end_index
1403  << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
1404  */
1405  this->mj_get_local_min_max_coord_totW(
1406  coordinate_begin_index,
1407  coordinate_end_index,
1408  this->coordinate_permutations,
1409  mj_current_dim_coords,
1410  this->process_local_min_max_coord_total_weight[kk], //min coordinate
1411  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
1412  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight);
1413  );
1414  }
1415 
1416  //1D partitioning
1417  if (actual_work_part_count > 0){
1418  //obtain global Min max of the part.
1419  this->mj_get_global_min_max_coord_totW(
1420  current_concurrent_num_parts,
1421  this->process_local_min_max_coord_total_weight,
1422  this->global_min_max_coord_total_weight);
1423 
1424  //represents the total number of cutlines
1425  //whose coordinate should be determined.
1426  mj_part_t total_incomplete_cut_count = 0;
1427 
1428  //Compute weight ratios for parts & cuts:
1429  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
1430  //part0 cut0 part1 cut1 part2 cut2 part3
1431  mj_part_t concurrent_part_cut_shift = 0;
1432  mj_part_t concurrent_part_part_shift = 0;
1433  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1434  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1435  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1436  current_concurrent_num_parts];
1437  mj_scalar_t global_total_weight =
1438  this->global_min_max_coord_total_weight[kk +
1439  2 * current_concurrent_num_parts];
1440 
1441  mj_part_t concurrent_current_part_index = current_work_part + kk;
1442 
1443  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1444 
1445  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1446  mj_scalar_t *current_target_part_weights = this->target_part_weights +
1447  concurrent_part_part_shift;
1448  //shift the usedCutCoordinate array as noCuts.
1449  concurrent_part_cut_shift += partition_count - 1;
1450  //shift the partRatio array as noParts.
1451  concurrent_part_part_shift += partition_count;
1452 
 //calculate only if part is not empty,
 //and part will be further partitioned.
1455  if(partition_count > 1 && min_coordinate <= max_coordinate){
1456 
1457  //increase allDone by the number of cuts of the current
1458  //part's cut line number.
1459  total_incomplete_cut_count += partition_count - 1;
1460  //set the number of cut lines that should be determined
1461  //for this part.
1462  this->my_incomplete_cut_count[kk] = partition_count - 1;
1463 
1464  //get the target weights of the parts.
1465  this->mj_get_initial_cut_coords_target_weights(
1466  min_coordinate,
1467  max_coordinate,
1468  partition_count - 1,
1469  global_total_weight,
1470  usedCutCoordinate,
1471  current_target_part_weights,
1472  future_num_part_in_parts,
1473  next_future_num_parts_in_parts,
1474  concurrent_current_part_index,
1475  obtained_part_index);
1476 
1477  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1478  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1479 
1480  //get the initial estimated part assignments of the coordinates.
1481  this->set_initial_coordinate_parts(
1482  max_coordinate,
1483  min_coordinate,
1484  concurrent_current_part_index,
1485  coordinate_begin_index, coordinate_end_index,
1486  this->coordinate_permutations,
1487  mj_current_dim_coords,
1488  this->assigned_part_ids,
1489  partition_count);
1490  }
1491  else {
1492  // e.g., if have fewer coordinates than parts, don't need to do next dim.
1493  this->my_incomplete_cut_count[kk] = 0;
1494  }
1495  obtained_part_index += partition_count;
1496  }
1497 
1498  //used imbalance, it is always 0, as it is difficult to estimate a range.
1499  mj_scalar_t used_imbalance = 0;
1500 
1501  // Determine cut lines for k parts here.
1502  this->mj_1D_part(
1503  mj_current_dim_coords,
1504  used_imbalance,
1505  current_work_part,
1506  current_concurrent_num_parts,
1507  current_cut_coordinates,
1508  total_incomplete_cut_count,
1509  num_partitioning_in_current_dim);
1510  }
1511 
1512  //create part chunks
1513  {
1514 
1515  mj_part_t output_array_shift = 0;
1516  mj_part_t cut_shift = 0;
1517  size_t tlr_shift = 0;
1518  size_t partweight_array_shift = 0;
1519 
1520  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
1521  mj_part_t current_concurrent_work_part = current_work_part + kk;
1522  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1523 
1524  //if the part is empty, skip the part.
1525  if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1526  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1527 
1528  for(mj_part_t jj = 0; jj < num_parts; ++jj){
1529  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1530  }
1531  cut_shift += num_parts - 1;
1532  tlr_shift += (4 *(num_parts - 1) + 1);
1533  output_array_shift += num_parts;
1534  partweight_array_shift += (2 * (num_parts - 1) + 1);
1535  continue;
1536  }
1537 
1538  mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1539  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1540  -1];
1541  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1542  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1543  cut_shift;
1544 
1545  for(int ii = 0; ii < this->num_threads; ++ii){
1546  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1547  }
1548 
1549  if(num_parts > 1){
1550  // Rewrite the indices based on the computed cuts.
1551  this->create_consistent_chunks(
1552  num_parts,
1553  mj_current_dim_coords,
1554  current_concurrent_cut_coordinate,
1555  coordinate_begin,
1556  coordinate_end,
1557  used_local_cut_line_weight_to_left,
1558  this->new_part_xadj + output_part_index + output_array_shift,
1559  coordInd );
1560  }
1561  else {
1562  //if this part is partitioned into 1 then just copy
1563  //the old values.
1564  mj_lno_t part_size = coordinate_end - coordinate_begin;
1565  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1566  memcpy(this->new_coordinate_permutations + coordinate_begin,
1567  this->coordinate_permutations + coordinate_begin,
1568  part_size * sizeof(mj_lno_t));
1569  }
1570  cut_shift += num_parts - 1;
1571  tlr_shift += (4 *(num_parts - 1) + 1);
1572  output_array_shift += num_parts;
1573  partweight_array_shift += (2 * (num_parts - 1) + 1);
1574  }
1575 
1576  //shift cut coordinates so that all cut coordinates are stored.
1577  //current_cut_coordinates += cutShift;
1578 
1579  //getChunks from coordinates partitioned the parts and
1580  //wrote the indices as if there were a single part.
1581  //now we need to shift the beginning indices.
1582  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1583  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1584  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1585  //shift it by previousCount
1586  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1587  }
1588  //increase the previous count by current end.
1589  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1590  //increase the current out.
1591  output_part_index += num_parts ;
1592  }
1593  }
1594  }
1595  // end of this partitioning dimension
1596 
1597  //set the current num parts for next dim partitioning
1598  current_num_parts = output_part_count_in_dimension;
1599 
1600  //swap the coordinate permutations for the next dimension.
1601  mj_lno_t * tmp = this->coordinate_permutations;
1602  this->coordinate_permutations = this->new_coordinate_permutations;
1603  this->new_coordinate_permutations = tmp;
1604 
1605  freeArray<mj_lno_t>(this->part_xadj);
1606  this->part_xadj = this->new_part_xadj;
1607  }
1608 
1609  for(mj_lno_t i = 0; i < num_total_coords; ++i){
1610  inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1611  }
1612 
1613  // Return output_xadj in CSR format
1614  output_xadj[0] = 0;
1615  for(size_t i = 0; i < this->num_global_parts ; ++i){
1616  output_xadj[i+1] = this->part_xadj[i];
1617  }
1618 
1619  delete future_num_part_in_parts;
1620  delete next_future_num_parts_in_parts;
1621 
1622  //free the extra memory that we allocated.
1623  freeArray<mj_part_t>(this->assigned_part_ids);
1624  freeArray<mj_gno_t>(this->initial_mj_gnos);
1625  freeArray<mj_gno_t>(this->current_mj_gnos);
1626  freeArray<bool>(tmp_mj_uniform_weights);
1627  freeArray<bool>(tmp_mj_uniform_parts);
1628  freeArray<mj_scalar_t *>(tmp_mj_weights);
1629  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1630 
1631  this->free_work_memory();
1632 
1633 #ifdef HAVE_ZOLTAN2_OMP
1634  omp_set_num_threads(actual_num_threads);
1635 #endif
1636 }
1637 
1641 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1642  typename mj_part_t>
1644  mj_env(), mj_problemComm(), imbalance_tolerance(0),
1645  part_no_array(NULL), recursion_depth(0), coord_dim(0),
1646  num_weights_per_coord(0), initial_num_loc_coords(0),
1647  initial_num_glob_coords(0),
1648  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1649  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1650  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1651  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1652  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1653  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1654  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1655  mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
1656  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1657  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1658  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1659  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1660  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1661  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1662  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1663  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1664  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1665  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1666  thread_part_weights(NULL), thread_part_weight_work(NULL),
1667  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1668  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1669  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1670  global_total_part_weight_left_right_closests(NULL),
1671  kept_boxes(),global_box(),
1672  myRank(0), myActualRank(0)
1673 {
1674  this->fEpsilon = std::numeric_limits<float>::epsilon();
1675  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;
1676 
1677  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1678  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1679 
1680 }
1681 
1682 
1686 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1687  typename mj_part_t>
1688 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1690 {
1691  return this->global_box;
1692 }
1693 
1697 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1698  typename mj_part_t>
1700  this->mj_keep_part_boxes = 1;
1701 }
1702 
1703 
1704 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
1705  * the input. part_no_array takes
1706  * precedence if both are provided.
1707  * Depending on these parameters, total cut/part number,
1708  * maximum part/cut number along a dimension, estimated number of reduceAlls,
1709  * and the number of parts before the last dimension is calculated.
1710  * */
1711 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1712  typename mj_part_t>
1714 
1715  this->total_num_cut = 0; //how many cuts will be totally
1716  this->total_num_part = 1; //how many parts will be totally
1717  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
1718  this->total_dim_num_reduce_all = 0; //estimate on #reduceAlls can be done.
1719  this->last_dim_num_part = 1; //max no of parts that might occur
1720  //during the partition before the
1721  //last partitioning dimension.
1722  this->max_num_cut_along_dim = 0;
1723  this->max_num_total_part_along_dim = 0;
1724 
1725  if (this->part_no_array){
1726  //if user provided part array, traverse the array and set variables.
1727  for (int i = 0; i < this->recursion_depth; ++i){
1728  this->total_dim_num_reduce_all += this->total_num_part;
1729  this->total_num_part *= this->part_no_array[i];
1730  if(this->part_no_array[i] > this->max_num_part_along_dim) {
1731  this->max_num_part_along_dim = this->part_no_array[i];
1732  }
1733  }
1734  this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1735  this->num_global_parts = this->total_num_part;
1736  } else {
1737  mj_part_t future_num_parts = this->num_global_parts;
1738 
1739  //we need to calculate the part numbers now, to determine the maximum along the dimensions.
1740  for (int i = 0; i < this->recursion_depth; ++i){
1741 
1742  mj_part_t maxNoPartAlongI = this->get_part_count(
1743  future_num_parts, 1.0f / (this->recursion_depth - i));
1744 
1745  if (maxNoPartAlongI > this->max_num_part_along_dim){
1746  this->max_num_part_along_dim = maxNoPartAlongI;
1747  }
1748 
1749  mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1750  if (future_num_parts % maxNoPartAlongI){
1751  ++nfutureNumParts;
1752  }
1753  future_num_parts = nfutureNumParts;
1754  }
1755  this->total_num_part = this->num_global_parts;
1756  //estimate reduceAll Count here.
1757  //we find the upperbound instead.
1758  mj_part_t p = 1;
1759  for (int i = 0; i < this->recursion_depth; ++i){
1760  this->total_dim_num_reduce_all += p;
1761  p *= this->max_num_part_along_dim;
1762  }
1763 
1764  this->last_dim_num_part = p / this->max_num_part_along_dim;
1765  }
1766 
1767  this->total_num_cut = this->total_num_part - 1;
1768  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1769  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1770  //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
1771 
1772  //refine the concurrent part count, if it is given bigger than the maximum possible part count.
1773  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1774  if(this->mj_problemComm->getRank() == 0){
1775  std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1776  ") has been set bigger than maximum amount that can be used." <<
1777  " Setting to:" << this->last_dim_num_part << "." << std::endl;
1778  }
1779  this->max_concurrent_part_calculation = this->last_dim_num_part;
1780  }
1781 
1782 }
/* \brief Tries to determine the part count for the current dimension,
 * trying to make the partitioning as square as possible.
 * \param num_total_future how many parts this part must eventually be split into.
 * \param root the fractional exponent used to take the root,
 *        i.e., 1 / (number of remaining recursion levels).
 */
1788 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1789  typename mj_part_t>
1791  mj_part_t num_total_future,
1792  double root)
1793 {
1794  double fp = pow(num_total_future, root);
1795  mj_part_t ip = mj_part_t (fp);
1796  if (fp - ip < this->fEpsilon * 100){
1797  return ip;
1798  }
1799  else {
1800  return ip + 1;
1801  }
1802 }
1803 
1804 /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
1805  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
1806  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
 * If part boxes are kept, then initializes the output_part_boxes from their ancestors.
1808  *
1809  * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
1810  * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
1811  * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
1812  * \param future_num_parts: output, max number of future parts that will be obtained from a single
1813  * \param current_num_parts: input, how many parts are there currently.
1814  * \param current_iteration: input, current dimension iteration number.
1815  * \param input_part_boxes: input, if boxes are kept, current boxes.
1816  * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
1817  */
1818 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1819  typename mj_part_t>
1821  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
1822  std::vector<mj_part_t> *future_num_part_in_parts,
1823  std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
1824  mj_part_t &future_num_parts,
1825  mj_part_t current_num_parts,
1826  int current_iteration,
1827  RCP<mj_partBoxVector_t> input_part_boxes,
1828  RCP<mj_partBoxVector_t> output_part_boxes
1829 ){
1830  //how many parts that will be obtained after this dimension.
1831  mj_part_t output_num_parts = 0;
1832  if(this->part_no_array){
1833  //when the partNo array is provided as input,
1834  //each current partition will be partition to the same number of parts.
1835  //we dont need to use the future_num_part_in_parts vector in this case.
1836 
1837  mj_part_t p = this->part_no_array[current_iteration];
1838  if (p < 1){
1839  std::cout << "i:" << current_iteration << " p is given as:" << p << std::endl;
1840  exit(1);
1841  }
1842  if (p == 1){
1843  return current_num_parts;
1844  }
1845 
1846  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1847  num_partitioning_in_current_dim.push_back(p);
1848 
1849  }
1850  //cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
1851  //" current_num_parts:" << current_num_parts << std::endl;
1852  //cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << std::endl;
1853  //set the new value of future_num_parts.
1854 
1855  /*
1856  cout << "\tfuture_num_parts:" << future_num_parts
1857  << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
1858  << future_num_parts/ num_partitioning_in_current_dim[0] << std::endl;
1859  */
1860 
1861  future_num_parts /= num_partitioning_in_current_dim[0];
1862  output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
1863 
1864  if (this->mj_keep_part_boxes){
1865  for (mj_part_t k = 0; k < current_num_parts; ++k){
1866  //initialized the output boxes as its ancestor.
1867  for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
1868  output_part_boxes->push_back((*input_part_boxes)[k]);
1869  }
1870  }
1871  }
1872 
1873  //set the how many more parts each part will be divided.
1874  //this is obvious when partNo array is provided as input.
1875  //however, fill this so that weights will be calculated according to this array.
1876  for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
1877  next_future_num_parts_in_parts->push_back(future_num_parts);
1878  }
1879  }
1880  else {
1881  //if partNo array is not provided as input,
1882  //future_num_part_in_parts holds how many parts each part should be divided.
1883  //initially it holds a single number equal to the total number of global parts.
1884 
1885  //calculate the future_num_parts from beginning,
1886  //since each part might be divided into different number of parts.
1887  future_num_parts = 1;
1888 
1889  //cout << "i:" << i << std::endl;
1890 
1891  for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1892  //get how many parts a part should be divided.
1893  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
1894 
1895  //get the ideal number of parts that is close to the
1896  //(recursion_depth - i) root of the future_num_parts_of_part_ii.
1897  mj_part_t num_partitions_in_current_dim =
1898  this->get_part_count(
1899  future_num_parts_of_part_ii,
1900  1.0 / (this->recursion_depth - current_iteration)
1901  );
1902 
1903  if (num_partitions_in_current_dim > this->max_num_part_along_dim){
1904  std::cerr << "ERROR: maxPartNo calculation is wrong." << std::endl;
1905  exit(1);
1906  }
1907  //add this number to num_partitioning_in_current_dim vector.
1908  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
1909 
1910 
1911  //increase the output number of parts.
1912  output_num_parts += num_partitions_in_current_dim;
1913 
1914  //ideal number of future partitions for each part.
1915  mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
1916  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
1917  mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
1918 
1919  //if there is a remainder in the part increase the part weight.
1920  if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
1921  //if not uniform, add 1 for the extra parts.
1922  ++num_future_parts_for_part_iii;
1923  }
1924  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
1925 
1926  //if part boxes are stored, initialize the box of the parts as the ancestor.
1927  if (this->mj_keep_part_boxes){
1928  output_part_boxes->push_back((*input_part_boxes)[ii]);
1929  }
1930 
1931  //set num future_num_parts to maximum in this part.
1932  if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
1933  }
1934  }
1935  }
1936  return output_num_parts;
1937 }
1938 
1939 
1940 /* \brief Allocates and initializes the work memory that will be used by MJ.
1941  *
1942  * */
1943 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1944  typename mj_part_t>
1946 
1947  //points to process that initially owns the coordinate.
1948  this->owner_of_coordinate = NULL;
1949 
1950  //Throughout the partitioning execution,
1951  //instead of the moving the coordinates, hold a permutation array for parts.
1952  //coordinate_permutations holds the current permutation.
1953  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
1954  //initial configuration, set each pointer-i to i.
1955 #ifdef HAVE_ZOLTAN2_OMP
1956 #pragma omp parallel for
1957 #endif
1958  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
1959  this->coordinate_permutations[i] = i;
1960  }
1961 
1962  //new_coordinate_permutations holds the current permutation.
1963  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
1964 
1965  this->assigned_part_ids = NULL;
1966  if(this->num_local_coords > 0){
1967  this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
1968  }
1969 
1970  //single partition starts at index-0, and ends at numLocalCoords
1971  //inTotalCounts array holds the end points in coordinate_permutations array
1972  //for each partition. Initially sized 1, and single element is set to numLocalCoords.
1973  this->part_xadj = allocMemory<mj_lno_t>(1);
1974  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
1975  //the ends points of the output, this is allocated later.
1976  this->new_part_xadj = NULL;
1977 
1978  // only store this much if cuts are needed to be stored.
1979  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);
1980 
1981 
1982  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
1983 
1984  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
1985 
1986  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage should a MPI put left side of the each cutline
1987  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage should each thread in MPI put left side of the each outline
1988  //distribute_points_on_cut_lines = false;
1989  if(this->distribute_points_on_cut_lines){
1990  this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
1991  this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
1992  for(int i = 0; i < this->num_threads; ++i){
1993  this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
1994  }
1995  this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
1996  this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
1997  }
1998 
1999 
2000  // work array to manipulate coordinate of cutlines in different iterations.
2001  //necessary because previous cut line information is used for determining
2002  //the next cutline information. therefore, cannot update the cut work array
2003  //until all cutlines are determined.
2004  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2005  this->max_concurrent_part_calculation);
2006 
2007 
2008  //cumulative part weight array.
2009  this->target_part_weights = allocMemory<mj_scalar_t>(
2010  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2011  // the weight from left to write.
2012 
2013  this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line
2014  this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line
2015  this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line
2016  this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line
2017 
2018  this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
2019  this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.
2020 
2021  //is_cut_line_determined is used to determine if a cutline is determined already.
2022  //If a cut line is already determined, the next iterations will skip this cut line.
2023  this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2024  //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
2025  //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
2026  this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2027  //local part weights of each thread.
2028  this->thread_part_weights = allocMemory<double *>(this->num_threads);
2029  //the work manupulation array for partweights.
2030  this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2031 
2032  //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
2033  this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2034  //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
2035  this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2036 
2037  //to store how many points in each part a thread has.
2038  this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2039 
2040  for(int i = 0; i < this->num_threads; ++i){
2041  //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
2042  this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2043  this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2044  this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2045  this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2046  }
2047  //for faster communication, concatanation of
2048  //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2049  //leftClosest distances sized P-1, since P-1 cut lines
2050  //rightClosest distances size P-1, since P-1 cut lines.
2051  this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2052  this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2053 
2054 
2055  mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2056  for (int i=0; i < this->coord_dim; i++){
2057  coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2058 #ifdef HAVE_ZOLTAN2_OMP
2059 #pragma omp parallel for
2060 #endif
2061  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2062  coord[i][j] = this->mj_coordinates[i][j];
2063  }
2064  this->mj_coordinates = coord;
2065 
2066 
2067  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2068  mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2069 
2070  for (int i=0; i < criteria_dim; i++){
2071  weights[i] = NULL;
2072  }
2073  for (int i=0; i < this->num_weights_per_coord; i++){
2074  weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2075 #ifdef HAVE_ZOLTAN2_OMP
2076 #pragma omp parallel for
2077 #endif
2078  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2079  weights[i][j] = this->mj_weights[i][j];
2080 
2081  }
2082  this->mj_weights = weights;
2083  this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2084 #ifdef HAVE_ZOLTAN2_OMP
2085 #pragma omp parallel for
2086 #endif
2087  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2088  this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2089 
2090  this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2091 
2092 #ifdef HAVE_ZOLTAN2_OMP
2093 #pragma omp parallel for
2094 #endif
2095  for (mj_lno_t j=0; j < this->num_local_coords; j++)
2096  this->owner_of_coordinate[j] = this->myActualRank;
2097 }
2098 
2099 /* \brief Computes the global bounding box of the whole coordinate set:
2100  * finds this rank's per-dimension min/max, then reduces them (REDUCE_MIN /
2101  * REDUCE_MAX) over the communicator and stores the result in this->global_box.
 */
2101 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2102  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2103, presumably
// "void AlgMJ<...>::compute_global_box()") was lost in extraction --
// confirm against the original source file.
2104 {
2105  //local min coords
2106  mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2107  //global min coords
2108  mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2109  //local max coords
2110  mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2111  //global max coords
2112  mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2113 
// Scan all local coordinates once per dimension to find the local extrema.
// NOTE(review): seeds from element [0], so this assumes num_local_coords >= 1
// on every rank -- confirm callers guarantee a nonempty local coordinate set.
2114  for (int i = 0; i < this->coord_dim; ++i){
2115  mj_scalar_t localMin = this->mj_coordinates[i][0];
2116  mj_scalar_t localMax = this->mj_coordinates[i][0];
2117  for (mj_lno_t j = 1; j < this->num_local_coords; ++j){
2118  if (this->mj_coordinates[i][j] < localMin){
2119  localMin = this->mj_coordinates[i][j];
2120  }
2121  if (this->mj_coordinates[i][j] > localMax){
2122  localMax = this->mj_coordinates[i][j];
2123  }
2124  }
2125  //cout << " localMin:" << localMin << endl;
2126  //cout << " localMax:" << localMax << endl;
2127  mins[i] = localMin;
2128  maxs[i] = localMax;
2129  }
// Element-wise global reductions over all ranks: mins -> gmins, maxs -> gmaxs.
2130  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2131  this->coord_dim, mins, gmins
2132  );
2133 
2134  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2135  this->coord_dim, maxs, gmaxs
2136  );
2137 
2138  //create single box with all areas.
2139  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2140  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
// Release the scratch arrays; the box above copies what it needs.
2141  freeArray<mj_scalar_t>(mins);
2142  freeArray<mj_scalar_t>(gmins);
2143  freeArray<mj_scalar_t>(maxs);
2144  freeArray<mj_scalar_t>(gmaxs);
2145 }
2146 
2147 /* \brief for part communication we keep track of the box boundaries.
2148  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
2149  * This function initializes a single box with all global min and max coordinates.
2150  * \param initial_partitioning_boxes the input and output vector for boxes.
2151  */
2152 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2153  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2154, the
// "void AlgMJ<...>::init_part_boxes(" portion of the signature) was lost in
// extraction -- confirm against the original source file.
2155  RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2156 )
2157 {
// Seed the box vector with a copy of the precomputed global bounding box;
// subsequent partitioning refines/splits this single box.
2158  mj_partBox_t tmp_box(*global_box);
2159  initial_partitioning_boxes->push_back(tmp_box);
2160 }
2161 
// Computes, for the part spanning [coordinate_begin_index, coordinate_end_index)
// of the permutation array, the local minimum and maximum coordinate along the
// current dimension and the part's total weight; results are returned through
// the three reference parameters. Executes as one OpenMP parallel region:
// per-thread extrema are staged in this->max_min_coords (mins in slots
// [0, num_threads), maxes in [num_threads, 2*num_threads)) and then reduced.
2172 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2173  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2174, the
// "void AlgMJ<...>::mj_get_local_min_max_coord_totW(" portion) was lost in
// extraction -- confirm against the original source file.
2175  mj_lno_t coordinate_begin_index,
2176  mj_lno_t coordinate_end_index,
2177  mj_lno_t *mj_current_coordinate_permutations,
2178  mj_scalar_t *mj_current_dim_coords,
2179  mj_scalar_t &min_coordinate,
2180  mj_scalar_t &max_coordinate,
2181  mj_scalar_t &total_weight){
2182 
2183  //if the part is empty.
2184  //set the min and max coordinates as reverse.
// (min > max acts as an "empty part" sentinel for downstream consumers.)
2185  if(coordinate_begin_index >= coordinate_end_index)
2186  {
2187  min_coordinate = this->maxScalar_t;
2188  max_coordinate = this->minScalar_t;
2189  total_weight = 0;
2190  }
2191  else {
2192  mj_scalar_t my_total_weight = 0;
2193 #ifdef HAVE_ZOLTAN2_OMP
2194 #pragma omp parallel
2195 #endif
2196  {
2197  //if uniform weights are used, then weight is equal to count.
2198  if (this->mj_uniform_weights[0]) {
2199 #ifdef HAVE_ZOLTAN2_OMP
2200 #pragma omp single
2201 #endif
2202  {
2203  my_total_weight = coordinate_end_index - coordinate_begin_index;
2204  }
2205 
2206  }
2207  else {
2208  //if not uniform, then weights are reducted from threads.
2209 #ifdef HAVE_ZOLTAN2_OMP
2210 #pragma omp for reduction(+:my_total_weight)
2211 #endif
2212  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2213  int i = mj_current_coordinate_permutations[ii];
2214  my_total_weight += this->mj_weights[0][i];
2215  }
2216  }
2217 
2218  int my_thread_id = 0;
2219 #ifdef HAVE_ZOLTAN2_OMP
2220  my_thread_id = omp_get_thread_num();
2221 #endif
// Every thread seeds its private extrema from the part's first coordinate,
// then scans its share of the remaining ones.
2222  mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2223  my_thread_min_coord=my_thread_max_coord
2224  =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2225 
2226 
2227 #ifdef HAVE_ZOLTAN2_OMP
2228 #pragma omp for
2229 #endif
2230  for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2231  int i = mj_current_coordinate_permutations[j];
2232  if(mj_current_dim_coords[i] > my_thread_max_coord)
2233  my_thread_max_coord = mj_current_dim_coords[i];
2234  if(mj_current_dim_coords[i] < my_thread_min_coord)
2235  my_thread_min_coord = mj_current_dim_coords[i];
2236  }
// Publish the thread-local extrema to the shared staging array.
2237  this->max_min_coords[my_thread_id] = my_thread_min_coord;
2238  this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2239 
2240 #ifdef HAVE_ZOLTAN2_OMP
2241 //we need a barrier here, because max_min_array might not be filled by some of the threads.
2242 #pragma omp barrier
2243 #pragma omp single nowait
2244 #endif
// One thread reduces the mins; "nowait" lets another thread start the max
// reduction below concurrently -- the two singles touch disjoint slots.
2245  {
2246  min_coordinate = this->max_min_coords[0];
2247  for(int i = 1; i < this->num_threads; ++i){
2248  if(this->max_min_coords[i] < min_coordinate)
2249  min_coordinate = this->max_min_coords[i];
2250  }
2251  }
2252 
2253 #ifdef HAVE_ZOLTAN2_OMP
2254 #pragma omp single nowait
2255 #endif
2256  {
2257  max_coordinate = this->max_min_coords[this->num_threads];
2258  for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2259  if(this->max_min_coords[i] > max_coordinate)
2260  max_coordinate = this->max_min_coords[i];
2261  }
2262  }
2263  }
// Safe to read here: the implicit barrier at the end of the parallel region
// guarantees both single blocks (and the weight reduction) have completed.
2264  total_weight = my_total_weight;
2265  }
2266 }
2267 
2268 
// Global (MPI) counterpart of the local min/max/total-weight computation:
// reduces the packed local_min_max_total array of 3 * current_concurrent_num_parts
// entries into global_min_max_total using one combined reduction
// (MIN for the first block, MAX for the second, SUM for the third).
2276 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2277  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2278) and the
// line declaring the reduction-op's type (original line 2287, presumably a
// Teuchos combined min/max/sum reduction op) were lost in extraction --
// confirm against the original source file.
2279  mj_part_t current_concurrent_num_parts,
2280  mj_scalar_t *local_min_max_total,
2281  mj_scalar_t *global_min_max_total){
2282 
2283  //reduce min for first current_concurrent_num_parts elements, reduce max for next
2284  //concurrentPartCount elements,
2285  //reduce sum for the last concurrentPartCount elements.
2286  if(this->comm->getSize() > 1){
2288  reductionOp(
2289  current_concurrent_num_parts,
2290  current_concurrent_num_parts,
2291  current_concurrent_num_parts);
2292  try{
2293  reduceAll<int, mj_scalar_t>(
2294  *(this->comm),
2295  reductionOp,
2296  3 * current_concurrent_num_parts,
2297  local_min_max_total,
2298  global_min_max_total);
2299  }
// Rethrows any Teuchos/MPI exception through the Zoltan2 environment.
2300  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2301  }
2302  else {
// Single-rank run: the local values are already global; just copy them.
2303  mj_part_t s = 3 * current_concurrent_num_parts;
2304  for (mj_part_t i = 0; i < s; ++i){
2305  global_min_max_total[i] = local_min_max_total[i];
2306  }
2307  }
2308 }
2309 
2310 
2311 
// Computes, for one concurrent part, the initial cut-line coordinates
// (uniformly spaced across [min_coord, max_coord], weighted by how many
// future parts fall left of each cut) and the cumulative target weight
// that should lie to the left of each cut. Only uniform part sizes are
// supported; non-uniform part weights abort the run.
2330 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2331  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2332, the
// "void AlgMJ<...>::mj_get_initial_cut_coords_target_weights(" portion)
// was lost in extraction -- confirm against the original source file.
2333  mj_scalar_t min_coord,
2334  mj_scalar_t max_coord,
2335  mj_part_t num_cuts/*p-1*/ ,
2336  mj_scalar_t global_weight,
2337  mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
2338  mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,
2339 
2340  std::vector <mj_part_t> *future_num_part_in_parts, //future number of parts in each current part
2341  std::vector <mj_part_t> *next_future_num_parts_in_parts, //future part counts for the parts produced by these cuts
2342  mj_part_t concurrent_current_part,
2343  mj_part_t obtained_part_index
2344 ){
2345 
2346  mj_scalar_t coord_range = max_coord - min_coord;
2347  if(this->mj_uniform_parts[0]){
2348  {
2349  mj_part_t cumulative = 0;
2350  //how many total future parts the part will be partitioned into.
2351  mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2352 
2353 
2354  //how much each part should weigh in ideal case.
2355  mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2356  /*
2357  cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl;
2358  cout << "global_weight:" << global_weight << endl;
2359  cout << "unit_part_weight" << unit_part_weight <<endl;
2360  */
// Each cut's target weight and position are proportional to the cumulative
// number of future parts on its left, not simply (i+1)/p -- parts that will
// later be split further get proportionally more room.
2361  for(mj_part_t i = 0; i < num_cuts; ++i){
2362  cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2363 
2364  /*
2365  cout << "obtained_part_index:" << obtained_part_index <<
2366  " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
2367  " cumulative:" << cumulative << endl;
2368  */
2369  //set target part weight.
2370  current_target_part_weights[i] = cumulative * unit_part_weight;
2371  //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl;
2372  //set initial cut coordinate.
2373  initial_cut_coords[i] = min_coord + (coord_range *
2374  cumulative) / total_future_part_count_in_part;
2375  }
// NOTE(review): the last slot is set to 1 rather than global_weight --
// presumably unused (only the num_cuts cut entries drive the search) or
// overwritten downstream; confirm against the consumers of this array.
2376  current_target_part_weights[num_cuts] = 1;
2377  }
2378 
2379  //round the target part weights.
// With uniform (unit) weights the targets are coordinate counts, so rounding
// to the nearest integer avoids chasing fractional targets.
2380  if (this->mj_uniform_weights[0]){
2381  for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2382  current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2383  }
2384  }
2385  }
2386  else {
2387  std::cerr << "MJ does not support non uniform part weights" << std::endl;
2388  exit(1);
2389  }
2390 }
2391 
2392 
// Assigns an initial (estimated) part id to every coordinate of the current
// part, assuming a uniform spatial distribution: the coordinate range is
// divided into partition_count equal slices and each point gets id 2*slice.
// Part ids are doubled because odd ids are reserved to mark coordinates that
// sit exactly on a cut line (see mj_1D_part_get_thread_part_weights, which
// divides assigned_part_ids by 2 to recover the cut index).
2405 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2406  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2407, the
// "void AlgMJ<...>::set_initial_coordinate_parts(" portion) was lost in
// extraction -- confirm against the original source file.
2408  mj_scalar_t &max_coordinate,
2409  mj_scalar_t &min_coordinate,
2410  mj_part_t &concurrent_current_part_index,
2411  mj_lno_t coordinate_begin_index,
2412  mj_lno_t coordinate_end_index,
2413  mj_lno_t *mj_current_coordinate_permutations,
2414  mj_scalar_t *mj_current_dim_coords,
2415  mj_part_t *mj_part_ids,
2416  mj_part_t &partition_count
2417 ){
2418  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2419 
2420  //if there is single point, or if all points are along a line.
2421  //set initial part to 0 for all.
2422  if(ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2423 #ifdef HAVE_ZOLTAN2_OMP
2424 #pragma omp parallel for
2425 #endif
2426  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2427  mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2428  }
2429  }
2430  else{
2431 
2432  //otherwise estimate an initial part for each coordinate.
2433  //assuming uniform distribution of points.
2434  mj_scalar_t slice = coordinate_range / partition_count;
2435 
2436 #ifdef HAVE_ZOLTAN2_OMP
2437 #pragma omp parallel for
2438 #endif
2439  for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2440 
2441  mj_lno_t iii = mj_current_coordinate_permutations[ii];
// NOTE(review): the point at max_coordinate yields pp == partition_count,
// i.e. an id one slice past the last part; the consumer clamps indices
// >= num_cuts, which presumably makes this benign -- confirm.
2442  mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2443  mj_part_ids[iii] = 2 * pp;
2444  }
2445  }
2446 }
2447 
2448 
// Core 1D partitioning driver: iteratively refines the cut-line coordinates
// along the current dimension for all concurrently processed parts, using a
// weighted bisection search. Each iteration (a) recomputes per-thread part
// weights and left/right closest points against the current cuts, (b) reduces
// those across threads and then across MPI ranks with a custom combined
// reduction op, (c) moves each unconverged cut toward its target weight, and
// (d) swaps the cut array with a work array. Terminates when
// total_incomplete_cut_count reaches zero; finally guarantees the converged
// cuts end up in the caller's current_cut_coordinates array.
2459 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2460  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2461) and the
// declared type of "reductionOp" (original line 2475; the assignment below
// shows it is a Teuchos::MultiJaggedCombinedReductionOp<mj_part_t,
// mj_scalar_t>*) were lost in extraction -- confirm against the original
// source file.
2462  mj_scalar_t *mj_current_dim_coords,
2463  mj_scalar_t used_imbalance_tolerance,
2464  mj_part_t current_work_part,
2465  mj_part_t current_concurrent_num_parts,
2466  mj_scalar_t *current_cut_coordinates,
2467  mj_part_t total_incomplete_cut_count,
2468  std::vector <mj_part_t> &num_partitioning_in_current_dim
2469 ){
2470 
2471 
2472  mj_part_t rectilinear_cut_count = 0;
2473  mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2474 
2476  *reductionOp = NULL;
2477  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
2478  <mj_part_t, mj_scalar_t>(
2479  &num_partitioning_in_current_dim ,
2480  current_work_part ,
2481  current_concurrent_num_parts);
2482 
2483  size_t total_reduction_size = 0;
// The whole search runs inside one parallel region; the loop-termination
// counter and the rectilinear counter are explicitly shared across threads.
2484 #ifdef HAVE_ZOLTAN2_OMP
2485 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count)
2486 #endif
2487  {
2488  int me = 0;
2489 #ifdef HAVE_ZOLTAN2_OMP
2490  me = omp_get_thread_num();
2491 #endif
// Each thread works on its own pre-allocated weight/closest-point buffers.
2492  double *my_thread_part_weights = this->thread_part_weights[me];
2493  mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2494  mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2495 
2496 #ifdef HAVE_ZOLTAN2_OMP
2497 #pragma omp single
2498 #endif
2499  {
2500  //initialize the lower and upper bounds of the cuts.
// For every cut of every concurrent part: bisection bounds start at the
// part's global min/max coordinate and [0, total weight]. Also accumulates
// the size of the flattened reduction buffer (2p-1 weights + 2(p-1)
// closest-point slots = 4*num_cuts + 1 per part).
2501  mj_part_t next = 0;
2502  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2503 
2504  mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2505  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2506  total_reduction_size += (4 * num_cut_in_dim + 1);
2507 
2508  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2509  this->is_cut_line_determined[next] = false;
2510  this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
2511  this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate
2512 
2513  this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
2514  this->cut_lower_bound_weights[next] = 0;
2515 
2516  if(this->distribute_points_on_cut_lines){
2517  this->process_cut_line_weight_to_put_left[next] = 0;
2518  }
2519  ++next;
2520  }
2521  }
2522  }
2523 
2524  //no need to have barrier here.
2525  //pragma omp single have implicit barrier.
2526 
2527  int iteration = 0;
2528  while (total_incomplete_cut_count != 0){
2529  iteration += 1;
2530  //cout << "\niteration:" << iteration << " ";
2531  mj_part_t concurrent_cut_shifts = 0;
2532  size_t total_part_shift = 0;
2533 
// Pass 1: every thread tallies part weights / closest points for each
// concurrent part that still has undetermined cuts.
2534  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2535  mj_part_t num_parts = -1;
2536  num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2537 
2538  mj_part_t num_cuts = num_parts - 1;
2539  size_t total_part_count = num_parts + size_t (num_cuts) ;
2540  if (this->my_incomplete_cut_count[kk] > 0){
2541 
2542  //although isDone shared, currentDone is private and same for all.
2543  bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2544  double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2545  mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2546  mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2547 
// part_xadj holds the (prefix) end offsets of each part in the
// permutation array.
2548  mj_part_t conccurent_current_part = current_work_part + kk;
2549  mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2550  mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2551  mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2552 
2553  mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2554  mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2555 
2556  // compute part weights using existing cuts
2557  this->mj_1D_part_get_thread_part_weights(
2558  total_part_count,
2559  num_cuts,
2560  max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
2561  min_coord,//globalMinMaxTotal[kk]//minScalar,
2562  coordinate_begin_index,
2563  coordinate_end_index,
2564  mj_current_dim_coords,
2565  temp_current_cut_coords,
2566  current_cut_status,
2567  my_current_part_weights,
2568  my_current_left_closest,
2569  my_current_right_closest);
2570 
2571  }
2572 
2573  concurrent_cut_shifts += num_cuts;
2574  total_part_shift += total_part_count;
2575  }
2576 
2577  //sum up the results of threads
2578  this->mj_accumulate_thread_results(
2579  num_partitioning_in_current_dim,
2580  current_work_part,
2581  current_concurrent_num_parts);
2582 
2583  //now sum up the results of mpi processors.
2584 #ifdef HAVE_ZOLTAN2_OMP
2585 #pragma omp single
2586 #endif
2587  {
2588  if(this->comm->getSize() > 1){
2589  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2590  total_reduction_size,
2591  this->total_part_weight_left_right_closests,
2592  this->global_total_part_weight_left_right_closests);
2593 
2594  }
2595  else {
2596  memcpy(
2597  this->global_total_part_weight_left_right_closests,
2598  this->total_part_weight_left_right_closests,
2599  total_reduction_size * sizeof(mj_scalar_t));
2600  }
2601  }
2602 
2603  //how much cut will be shifted for the next part in the concurrent part calculation.
2604  mj_part_t cut_shift = 0;
2605 
2606  //how much the concantaneted array will be shifted for the next part in concurrent part calculation.
2607  size_t tlr_shift = 0;
// Pass 2: with globally reduced weights available, move each part's
// unconverged cuts toward their target weights.
2608  for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2609  mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2610  mj_part_t num_cuts = num_parts - 1;
2611  size_t num_total_part = num_parts + size_t (num_cuts) ;
2612 
2613  //if the cuts of this cut has already been completed.
2614  //nothing to do for this part.
2615  //just update the shift amount and proceed.
2616  if (this->my_incomplete_cut_count[kk] == 0) {
2617  cut_shift += num_cuts;
2618  tlr_shift += (num_total_part + 2 * num_cuts);
2619  continue;
2620  }
2621 
// The reduced buffer layout per part is: [2p-1 part weights |
// p-1 left-closest | p-1 right-closest].
2622  mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2623  mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2624  mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
2625  mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
2626  mj_scalar_t *current_global_part_weights = current_global_tlr;
2627  bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2628 
// target_part_weights is sized per part count (cuts + 1), hence the
// extra "+ kk" beyond the cut-based shift.
2629  mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2630  mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2631 
2632  mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2633  mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2634  mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2635  mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2636  mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2637  mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2638  mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2639 
2640  mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2641 
2642  // Now compute the new cut coordinates.
2643  this->mj_get_new_cut_coordinates(
2644  num_total_part,
2645  num_cuts,
2646  max_coordinate,
2647  min_coordinate,
2648  global_total_weight,
2649  used_imbalance_tolerance,
2650  current_global_part_weights,
2651  current_local_part_weights,
2652  current_part_target_weights,
2653  current_cut_line_determined,
2654  temp_cut_coords + cut_shift,
2655  current_cut_upper_bounds,
2656  current_cut_lower_bounds,
2657  current_global_left_closest_points,
2658  current_global_right_closest_points,
2659  current_cut_lower_bound_weights,
2660  current_cut_upper_weights,
2661  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
2662  current_part_cut_line_weight_to_put_left,
2663  &rectilinear_cut_count,
2664  this->my_incomplete_cut_count[kk]);
2665 
2666  cut_shift += num_cuts;
2667  tlr_shift += (num_total_part + 2 * num_cuts);
// One thread decrements the shared loop-termination counter by the number
// of cuts this part converged during this iteration.
2668  mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2669 #ifdef HAVE_ZOLTAN2_OMP
2670 #pragma omp single
2671 #endif
2672  {
2673  total_incomplete_cut_count -= iteration_complete_cut_count;
2674  }
2675 
2676  }
2677 #ifdef HAVE_ZOLTAN2_OMP
2678 #pragma omp barrier
2679 #pragma omp single
2680 #endif
2681  {
2682  //swap the cut coordinates for next iteration.
2683  mj_scalar_t *t = temp_cut_coords;
2684  temp_cut_coords = this->cut_coordinates_work_array;
2685  this->cut_coordinates_work_array = t;
2686  }
2687  }
2688 
2689  // Needed only if keep_cuts; otherwise can simply swap array pointers
2690  // cutCoordinates and cutCoordinatesWork.
2691  // (at first iteration, cutCoordinates == cutCoorindates_tmp).
2692  // computed cuts must be in cutCoordinates.
// If an odd number of swaps left the final cuts in the work array, copy
// them back into the caller's array and repoint the work array.
2693  if (current_cut_coordinates != temp_cut_coords){
2694 #ifdef HAVE_ZOLTAN2_OMP
2695 #pragma omp single
2696 #endif
2697  {
2698  mj_part_t next = 0;
2699  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2700  mj_part_t num_parts = -1;
2701  num_parts = num_partitioning_in_current_dim[current_work_part + i];
2702  mj_part_t num_cuts = num_parts - 1;
2703 
2704  for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2705  current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2706  }
2707  next += num_cuts;
2708  }
2709  }
2710 
2711 #ifdef HAVE_ZOLTAN2_OMP
2712 #pragma omp single
2713 #endif
2714  {
2715  this->cut_coordinates_work_array = temp_cut_coords;
2716  }
2717  }
2718  }
2719  delete reductionOp;
2720 }
2721 
2722 
// Per-thread weight-accumulation kernel of the 1D cut search. For every
// coordinate of the current part (the thread's static-scheduled share of
// [coordinate_begin_index, coordinate_end_index)), a binary search over the
// current cut positions finds where the coordinate falls. Weights are
// accumulated into a (2p-1)-sized array where even indices are the open
// intervals between cuts and odd index 2k+1 is weight sitting exactly ON cut
// k; the closest coordinate on each side of every cut is also tracked (used
// later to snap cut lines onto actual points). Finally the part weights are
// converted to a prefix sum, taking care not to double-count cuts that share
// the same position. The starting guess for the search comes from
// this->assigned_part_ids (written by set_initial_coordinate_parts /
// previous iterations), which stores 2*part (or 2*cut+1 for on-cut points).
2742 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2743  typename mj_part_t>
// NOTE(review): the qualified definition line (original line 2744, the
// "void AlgMJ<...>::mj_1D_part_get_thread_part_weights(" portion) was lost
// in extraction -- confirm against the original source file.
2745  size_t total_part_count,
2746  mj_part_t num_cuts,
2747  mj_scalar_t max_coord,
2748  mj_scalar_t min_coord,
2749  mj_lno_t coordinate_begin_index,
2750  mj_lno_t coordinate_end_index,
2751  mj_scalar_t *mj_current_dim_coords,
2752  mj_scalar_t *temp_current_cut_coords,
2753  bool *current_cut_status,
2754  double *my_current_part_weights,
2755  mj_scalar_t *my_current_left_closest,
2756  mj_scalar_t *my_current_right_closest){
2757 
2758  // initializations for part weights, left/right closest
2759  for (size_t i = 0; i < total_part_count; ++i){
2760  my_current_part_weights[i] = 0;
2761  }
2762 
2763  //initialize the left and right closest coordinates
2764  //to their max value.
// Sentinels strictly outside [min_coord, max_coord] so any real coordinate
// replaces them on first comparison.
2765  for(mj_part_t i = 0; i < num_cuts; ++i){
2766  my_current_left_closest[i] = min_coord - 1;
2767  my_current_right_closest[i] = max_coord + 1;
2768  }
2769  //mj_lno_t comparison_count = 0;
2770  mj_scalar_t minus_EPSILON = -this->sEpsilon;
2771 #ifdef HAVE_ZOLTAN2_OMP
2772  //no need for the barrier as all threads uses their local memories.
2773  //dont change the static scheduling here, as it is assumed when the new
2774  //partitions are created later.
2775 #pragma omp for
2776 #endif
2777  for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2778  int i = this->coordinate_permutations[ii];
2779 
2780  //the accesses to assigned_part_ids are thread safe
2781  //since each coordinate is assigned to only a single thread.
// Start the binary search at the previously assigned (or estimated) cut.
2782  mj_part_t j = this->assigned_part_ids[i] / 2;
2783 
2784  if(j >= num_cuts){
2785  j = num_cuts - 1;
2786  }
2787 
2788  mj_part_t lower_cut_index = 0;
2789  mj_part_t upper_cut_index = num_cuts - 1;
2789 
2791  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
2792  bool is_inserted = false;
2793  bool is_on_left_of_cut = false;
2794  bool is_on_right_of_cut = false;
2795  mj_part_t last_compared_part = -1;
2796 
2797  mj_scalar_t coord = mj_current_dim_coords[i];
2798 
2799  while(upper_cut_index >= lower_cut_index)
2800  {
2801  //comparison_count++;
2802  last_compared_part = -1;
2803  is_on_left_of_cut = false;
2804  is_on_right_of_cut = false;
2805  mj_scalar_t cut = temp_current_cut_coords[j];
2806  mj_scalar_t distance_to_cut = coord - cut;
2807  mj_scalar_t abs_distance_to_cut = ZOLTAN2_ABS(distance_to_cut);
2808 
2809  //if it is on the line.
2810  if(abs_distance_to_cut < this->sEpsilon){
2811 
// Credit the on-cut slot (odd index) and tag the coordinate as on cut j.
2812  my_current_part_weights[j * 2 + 1] += w;
2813  this->assigned_part_ids[i] = j * 2 + 1;
2814 
2815  //assign left and right closest point to cut as the point is on the cut.
2816  my_current_left_closest[j] = coord;
2817  my_current_right_closest[j] = coord;
2818  //now we need to check if there are other cuts on the same cut coordinate.
2819  //if there are, then we add the weight of the cut to all cuts in the same coordinate.
2820  mj_part_t kk = j + 1;
2821  while(kk < num_cuts){
2822  // Needed when cuts shared the same position
2823  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2824  if(distance_to_cut < this->sEpsilon){
2825  my_current_part_weights[2 * kk + 1] += w;
2826  my_current_left_closest[kk] = coord;
2827  my_current_right_closest[kk] = coord;
2828  kk++;
2829  }
2830  else{
2831  //cut is far away.
2832  //just check the left closest point for the next cut.
2833  if(coord - my_current_left_closest[kk] > this->sEpsilon){
2834  my_current_left_closest[kk] = coord;
2835  }
2836  break;
2837  }
2838  }
2839 
2840 
2841  kk = j - 1;
2842  //continue checking for the cuts on the left if they share the same coordinate.
2843  while(kk >= 0){
2844  distance_to_cut =ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2845  if(distance_to_cut < this->sEpsilon){
2846  my_current_part_weights[2 * kk + 1] += w;
2847  //try to write the partId as the leftmost cut.
2848  this->assigned_part_ids[i] = kk * 2 + 1;
2849  my_current_left_closest[kk] = coord;
2850  my_current_right_closest[kk] = coord;
2851  kk--;
2852  }
2853  else{
2854  //if cut is far away on the left of the point.
2855  //then just compare for right closest point.
2856  if(my_current_right_closest[kk] - coord > this->sEpsilon){
2857  my_current_right_closest[kk] = coord;
2858  }
2859  break;
2860  }
2861  }
2862 
2863  is_inserted = true;
2864  break;
2865  }
2866  else {
2867  //if point is on the left of the cut.
2868  if (distance_to_cut < 0) {
2869  bool _break = false;
2870  if(j > 0){
2871  //check distance to the cut on the left the current cut compared.
2872  //if point is on the right, then we find the part of the point.
2873  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
2874  if(distance_to_next_cut > this->sEpsilon){
2875  _break = true;
2876  }
2877  }
2878  //if point is not on the right of the next cut, then
2879  //set the upper bound to this cut.
2880  upper_cut_index = j - 1;
2881  //set the last part, and mark it as on the left of the last part.
2882  is_on_left_of_cut = true;
2883  last_compared_part = j;
// _break means the point was bracketed between cuts j-1 and j:
// its part is known, so the search ends here.
2884  if(_break) break;
2885  }
2886  else {
2887  //if point is on the right of the cut.
2888  bool _break = false;
2889  if(j < num_cuts - 1){
2890  //check distance to the cut on the left the current cut compared.
2891  //if point is on the right, then we find the part of the point.
2892  mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
2893  if(distance_to_next_cut < minus_EPSILON){
2894  _break = true;
2895  }
2896  }
2897 
2898  //if point is not on the left of the next cut, then
2899  //set the upper bound to this cut.
2900  lower_cut_index = j + 1;
2901  //set the last part, and mark it as on the right of the last part.
2902  is_on_right_of_cut = true;
2903  last_compared_part = j;
2904  if(_break) break;
2905  }
2906  }
2907 
// Standard bisection step on the cut index.
2908  j = (upper_cut_index + lower_cut_index) / 2;
2909  }
// Point was not exactly on a cut: credit the interval on the side of the
// last compared cut and refresh that cut's (and its neighbor's) closest
// points.
2910  if(!is_inserted){
2911  if(is_on_right_of_cut){
2912 
2913  //add it to the right of the last compared part.
2914  my_current_part_weights[2 * last_compared_part + 2] += w;
2915  this->assigned_part_ids[i] = 2 * last_compared_part + 2;
2916 
2917  //update the right closest point of last compared cut.
2918  if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
2919  my_current_right_closest[last_compared_part] = coord;
2920  }
2921  //update the left closest point of the cut on the right of the last compared cut.
2922  if(last_compared_part+1 < num_cuts){
2923 
2924  if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
2925  my_current_left_closest[last_compared_part + 1] = coord;
2926  }
2927  }
2928 
2929  }
2930  else if(is_on_left_of_cut){
2931 
2932  //add it to the left of the last compared part.
2933  my_current_part_weights[2 * last_compared_part] += w;
2934  this->assigned_part_ids[i] = 2 * last_compared_part;
2935 
2936 
2937  //update the left closest point of last compared cut.
2938  if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
2939  my_current_left_closest[last_compared_part] = coord;
2940  }
2941 
2942  //update the right closest point of the cut on the left of the last compared cut.
2943  if(last_compared_part-1 >= 0){
2944  if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
2945  my_current_right_closest[last_compared_part -1] = coord;
2946  }
2947  }
2948  }
2949  }
2950  }
2951 
2952  // prefix sum computation.
2953  //we need prefix sum for each part to determine cut positions.
2954  for (size_t i = 1; i < total_part_count; ++i){
2955  // check for cuts sharing the same position; all cuts sharing a position
2956  // have the same weight == total weight for all cuts sharing the position.
2957  // don't want to accumulate that total weight more than once.
2958  if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
2959  ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
2960  < this->sEpsilon){
2961  //i % 2 = 0 when part i represents the cut coordinate.
2962  //if it is a cut, and if the next cut also have the same coordinate, then
2963  //dont addup.
2964  my_current_part_weights[i] = my_current_part_weights[i-2];
2965  continue;
2966  }
2967  //otherwise do the prefix sum.
2968  my_current_part_weights[i] += my_current_part_weights[i-1];
2969  }
2970 }
2971 
2972 
// NOTE(review): Doxygen listing artifact — each line starts with its original
// source line number, and the extraction dropped line 2982, the declaration
// line carrying the qualified function name. From the parameter list and body
// this is the thread-result accumulation step that follows
// mj_1D_part_get_thread_part_weights (mj_accumulate_thread_results in
// Zoltan2) — TODO confirm against the repository header.
//
// Runs inside an existing OpenMP parallel region: after a barrier, a single
// thread reduces the per-thread arrays into
// this->total_part_weight_left_right_closests. Per concurrent part the output
// layout is (grounded in the index math at 3022-3028):
//   [ num_total_part_in_part weights | num_cuts left-closest | num_cuts right-closest ]
// where num_total_part_in_part = num_parts_in_part + num_cuts_in_part,
// i.e. 2*num_cuts_in_part + 1 interleaved part/cut weights.
2980 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2981  typename mj_part_t>
2983  const std::vector <mj_part_t> &num_partitioning_in_current_dim,
2984  mj_part_t current_work_part,
2985  mj_part_t current_concurrent_num_parts){
2986 
2987 #ifdef HAVE_ZOLTAN2_OMP
2988  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
2989  //using parallel region here reduces the performance because of the cache invalidates.
2990 #pragma omp barrier
2991 #pragma omp single
2992 #endif
2993  {
// tlr_array_shift walks the concatenated per-part output blocks;
// cut_shift walks the flat, per-cut arrays (closest points, determined flags).
2994  size_t tlr_array_shift = 0;
2995  mj_part_t cut_shift = 0;
2996 
2997  //iterate for all concurrent parts to find the left and right closest points in the process.
2998  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2999 
3000  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3001  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3002  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3003 
3004  //iterate for cuts in a single part.
3005  for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3006  mj_part_t next = tlr_array_shift + ii;
3007  mj_part_t cut_index = cut_shift + ii;
// already-settled cuts keep their previously stored values.
3008  if(this->is_cut_line_determined[cut_index]) continue;
// seed the min/max reduction with thread 0's closest points.
3009  mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3010  right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3011 
3012  //find the closest points from left and right for the cut in the process.
3013  for (int j = 1; j < this->num_threads; ++j){
3014  if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3015  right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3016  }
3017  if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3018  left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3019  }
3020  }
3021  //store the left and right closes points.
// index = tlr_array_shift + num_total_part_in_part + ii (left block),
// plus num_cuts_in_part more for the right block.
3022  this->total_part_weight_left_right_closests[num_total_part_in_part +
3023  next] = left_closest_in_process;
3024  this->total_part_weight_left_right_closests[num_total_part_in_part +
3025  num_cuts_in_part + next] = right_closest_in_process;
3026  }
3027  //set the shift position in the arrays
3028  tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3029  cut_shift += num_cuts_in_part;
3030  }
3031 
// Second pass: sum per-thread part weights into the weight block of each
// concurrent part. total_part_array_shift walks the per-thread weight
// arrays, which have no closest-point blocks.
3032  tlr_array_shift = 0;
3033  cut_shift = 0;
3034  size_t total_part_array_shift = 0;
3035 
3036  //iterate for all concurrent parts to find the total weight in the process.
3037  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3038 
3039  mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3040  mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3041  size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3042 
3043  for(size_t j = 0; j < num_total_part_in_part; ++j){
3044 
3045  mj_part_t cut_ind = j / 2 + cut_shift;
3046 
3047  //need to check j != num_total_part_in_part - 1
3048  // which is same as j/2 != num_cuts_in_part.
3049  //we cannot check it using cut_ind, because of the concurrent part concantanetion.
3050  if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
// accumulate in double regardless of mj_scalar_t (matches the
// per-thread weight arrays' storage type).
3051  double pwj = 0;
3052  for (int k = 0; k < this->num_threads; ++k){
3053  pwj += this->thread_part_weights[k][total_part_array_shift + j];
3054  }
3055  //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
3056  this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3057  }
3058  cut_shift += num_cuts_in_part;
3059  tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3060  total_part_array_shift += num_total_part_in_part;
3061  }
3062  }
3063  //the other threads needs to wait here.
3064  //but we don't need a pragma omp barrier.
3065  //as omp single has already have implicit barrier.
3066 }
3067 
3068 
3078 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3079  typename mj_part_t>
3081  mj_scalar_t cut_upper_bound,
3082  mj_scalar_t cut_lower_bound,
3083  mj_scalar_t cut_upper_weight,
3084  mj_scalar_t cut_lower_weight,
3085  mj_scalar_t expected_weight,
3086  mj_scalar_t &new_cut_position){
3087 
3088  if(ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3089  new_cut_position = cut_upper_bound; //or lower bound does not matter.
3090  }
3091 
3092 
3093  if(ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3094  new_cut_position = cut_lower_bound;
3095  }
3096 
3097  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3098  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3099  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3100 
3101  mj_scalar_t required_shift = (my_weight_diff / weight_range);
3102  int scale_constant = 20;
3103  int shiftint= int (required_shift * scale_constant);
3104  if (shiftint == 0) shiftint = 1;
3105  required_shift = mj_scalar_t (shiftint) / scale_constant;
3106  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3107 }
3108 
3109 
// NOTE(review): Doxygen listing artifact — the extraction dropped line 3122,
// the declaration line with the qualified function name. From the parameter
// list this is the step that splits one part's coordinates into num_parts new
// parts once the cut coordinates are final (mj_create_new_partitions in
// Zoltan2) — TODO confirm against the repository header.
//
// Inside an OpenMP parallel region each thread:
//  1. (if distribute_points_on_cut_lines) converts the process-level "weight
//     to put left of each cut" into per-thread quotas, relying on the static
//     schedule matching the one used when weights were computed;
//  2. classifies its coordinates: points strictly inside a part keep their
//     part id, points sitting on a cut (odd assigned_part_ids) are placed
//     left/right by consuming the thread's quota, walking across cuts that
//     share the same coordinate;
//  3. counts points per (thread, part), prefix-sums the counts to give each
//     thread a private write window, and fills
//     this->new_coordinate_permutations; out_part_xadj receives the
//     prefix-summed part boundaries.
//
// NOTE(review): mj_current_dim_coords is never referenced in this body —
// presumably kept for signature compatibility; verify against callers.
3120 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3121  typename mj_part_t>
3123  mj_part_t num_parts,
3124  mj_scalar_t *mj_current_dim_coords,
3125  mj_scalar_t *current_concurrent_cut_coordinate,
3126  mj_lno_t coordinate_begin,
3127  mj_lno_t coordinate_end,
3128  mj_scalar_t *used_local_cut_line_weight_to_left,
3129  double **used_thread_part_weight_work,
3130  mj_lno_t *out_part_xadj){
3131 
3132  mj_part_t num_cuts = num_parts - 1;
3133 
3134 #ifdef HAVE_ZOLTAN2_OMP
3135 #pragma omp parallel
3136 #endif
3137  {
3138  int me = 0;
3139 #ifdef HAVE_ZOLTAN2_OMP
3140  me = omp_get_thread_num();
3141 #endif
3142 
3143  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3144  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3145 
3146  //now if the rectilinear partitioning is allowed we decide how
3147  //much weight each thread should put to left and right.
3148  if (this->distribute_points_on_cut_lines){
3149  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3150  // this for assumes the static scheduling in mj_1D_part calculation.
3151 #ifdef HAVE_ZOLTAN2_OMP
3152 #pragma omp for
3153 #endif
3154  for (mj_part_t i = 0; i < num_cuts; ++i){
3155  //the left to be put on the left of the cut.
3156  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
// hand out the cut's left-side budget to threads in thread order,
// each thread taking at most its own weight sitting on the cut.
3157  for(int ii = 0; ii < this->num_threads; ++ii){
3158  if(left_weight > this->sEpsilon){
3159  //the weight of thread ii on cut.
3160  mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3161  if(thread_ii_weight_on_cut < left_weight){
3162  //if left weight is bigger than threads weight on cut.
3163  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3164  }
3165  else {
3166  //if thread's weight is bigger than space, then put only a portion.
3167  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3168  }
3169  left_weight -= thread_ii_weight_on_cut;
3170  }
3171  else {
3172  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3173  }
3174  }
3175  }
3176 
3177  if(num_cuts > 0){
3178  //this is a special case. If cutlines share the same coordinate, their weights are equal.
3179  //we need to adjust the ratio for that.
3180  for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3181  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3182  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3183  }
// round the quota to a fixed number of significant digits so
// floating-point noise cannot flip the quota comparisons below.
3184  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
3185  / mj_scalar_t(SIGNIFICANCE_MUL);
3186  }
3187  }
3188  }
3189 
// reset this thread's per-part point counters before classification.
3190  for(mj_part_t ii = 0; ii < num_parts; ++ii){
3191  thread_num_points_in_parts[ii] = 0;
3192  }
3193 
3194 
3195 #ifdef HAVE_ZOLTAN2_OMP
3196  //dont change static scheduler. the static partitioner used later as well.
3197 #pragma omp for
3198 #endif
3199  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3200 
3201  mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3202  mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
// assigned_part_ids encodes 2*part for interior points and an odd
// value (2*cut+1) for points sitting exactly on a cut line.
3203  mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3204  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3205  if(coordinate_assigned_place % 2 == 1){
3206  //if it is on the cut.
3207  if(this->distribute_points_on_cut_lines
3208  && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3209  //if the rectilinear partitioning is allowed,
3210  //and the thread has still space to put on the left of the cut
3211  //then thread puts the vertex to left.
3212  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3213  //if putting the vertex to left increased the weight more than expected.
3214  //and if the next cut is on the same coordinate,
3215  //then we need to adjust how much weight next cut puts to its left as well,
3216  //in order to take care of the imbalance.
3217  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3218  && coordinate_assigned_part < num_cuts - 1
3219  && ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3220  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3221  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3222  }
3223  ++thread_num_points_in_parts[coordinate_assigned_part];
3224  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3225  }
3226  else{
3227  //if there is no more space on the left, put the coordinate to the right of the cut.
3228  ++coordinate_assigned_part;
3229  //this while loop is necessary when a line is partitioned into more than 2 parts.
3230  while(this->distribute_points_on_cut_lines &&
3231  coordinate_assigned_part < num_cuts){
3232  //traverse all the cut lines having the same partitiong
3233  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3234  current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3235  < this->sEpsilon){
3236  //if line has enough space on left, put it there.
3237  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3238  this->sEpsilon &&
3239  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3240  ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3241  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3242  //Again if it put too much on left of the cut,
3243  //update how much the next cut sharing the same coordinate will put to its left.
3244  if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3245  coordinate_assigned_part < num_cuts - 1 &&
3246  ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3247  current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3248  my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3249  }
3250  break;
3251  }
3252  }
3253  else {
3254  break;
3255  }
3256  ++coordinate_assigned_part;
3257  }
3258  ++thread_num_points_in_parts[coordinate_assigned_part];
3259  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3260  }
3261  }
3262  else {
3263  //if it is already assigned to a part, then just put it to the corresponding part.
3264  ++thread_num_points_in_parts[coordinate_assigned_part];
3265  this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3266  }
3267  }
3268 
3269 
3270 
3271  //now we calculate where each thread will write in new_coordinate_permutations array.
3272  //first we find the out_part_xadj, by marking the begin and end points of each part found.
3273  //the below loop find the number of points in each part, and writes it to out_part_xadj
3274 #ifdef HAVE_ZOLTAN2_OMP
3275 #pragma omp for
3276 #endif
3277  for(mj_part_t j = 0; j < num_parts; ++j){
3278  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3279  for (int i = 0; i < this->num_threads; ++i){
3280  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3281  //prefix sum to thread point counts, so that each will have private space to write.
3282  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3283  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3284 
3285  }
3286  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
3287  }
3288 
3289  //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
3290 #ifdef HAVE_ZOLTAN2_OMP
3291 #pragma omp single
3292 #endif
3293  {
3294  //perform prefix sum for num_points in parts.
3295  for(mj_part_t j = 1; j < num_parts; ++j){
3296  out_part_xadj[j] += out_part_xadj[j - 1];
3297  }
3298  }
3299 
3300  //shift the num points in threads thread to obtain the
3301  //beginning index of each thread's private space.
3302  for(mj_part_t j = 1; j < num_parts; ++j){
3303  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3304  }
3305 
3306 
3307  //now thread gets the coordinate and writes the index of coordinate to the permutation array
3308  //using the part index we calculated.
3309 #ifdef HAVE_ZOLTAN2_OMP
3310 #pragma omp for
3311 #endif
3312  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3313  mj_lno_t i = this->coordinate_permutations[ii];
3314  mj_part_t p = this->assigned_part_ids[i];
3315  this->new_coordinate_permutations[coordinate_begin +
3316  thread_num_points_in_parts[p]++] = i;
3317  }
3318  }
3319 }
3320 
3321 
3322 
// NOTE(review): Doxygen listing artifact — the extraction dropped line 3353,
// the declaration line with the qualified function name. From the parameter
// list this is the cut-refinement step of the 1D partitioner
// (mj_get_new_cut_coordinates in Zoltan2) — TODO confirm against the
// repository header.
//
// For every cut not yet determined, this compares the accumulated global
// weight left of the cut against its target weight, then either
//  (a) finalizes the cut when the imbalance on both sides is within
//      used_imbalance_tolerance,
//  (b) marks the cut "rectilinear" when distribute_points_on_cut_lines is
//      enabled and the weight sitting exactly on the cut can complete the
//      part (the split ratio is settled later via Teuchos::scan), or
//  (c) tightens the [lower, upper] bound interval using the weights of
//      neighboring cuts and asks mj_calculate_new_cut_position for the next
//      trial coordinate.
// Executed inside an OpenMP parallel region: the cut loops are work-shared
// (#pragma omp for), counters shared across threads are updated with
// #pragma omp atomic, and the cross-rank scan runs under #pragma omp single.
3351 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3352  typename mj_part_t>
3354  const size_t &num_total_part,
3355  const mj_part_t &num_cuts,
3356  const mj_scalar_t &max_coordinate,
3357  const mj_scalar_t &min_coordinate,
3358  const mj_scalar_t &global_total_weight,
3359  const mj_scalar_t &used_imbalance_tolerance,
3360  mj_scalar_t * current_global_part_weights,
3361  const mj_scalar_t * current_local_part_weights,
3362  const mj_scalar_t *current_part_target_weights,
3363  bool *current_cut_line_determined,
3364  mj_scalar_t *current_cut_coordinates,
3365  mj_scalar_t *current_cut_upper_bounds,
3366  mj_scalar_t *current_cut_lower_bounds,
3367  mj_scalar_t *current_global_left_closest_points,
3368  mj_scalar_t *current_global_right_closest_points,
3369  mj_scalar_t * current_cut_lower_bound_weights,
3370  mj_scalar_t * current_cut_upper_weights,
3371  mj_scalar_t *new_current_cut_coordinates,
3372  mj_scalar_t *current_part_cut_line_weight_to_put_left,
3373  mj_part_t *rectilinear_cut_count,
3374  mj_part_t &my_num_incomplete_cut){
3375 
3376  //seen weight in the part
3377  mj_scalar_t seen_weight_in_part = 0;
3378  //expected weight for part.
3379  mj_scalar_t expected_weight_in_part = 0;
3380  //imbalance for the left and right side of the cut.
3381  mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3382 
3383 
3384 #ifdef HAVE_ZOLTAN2_OMP
3385 #pragma omp for
3386 #endif
3387  for (mj_part_t i = 0; i < num_cuts; i++){
3388  //if left and right closest points are not set yet,
3389  //set it to the cut itself.
// sentinel check: closest points outside [min, max] mean no point was seen.
3390  if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3391  current_global_left_closest_points[i] = current_cut_coordinates[i];
3392  if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3393  current_global_right_closest_points[i] = current_cut_coordinates[i];
3394 
3395  }
3396 #ifdef HAVE_ZOLTAN2_OMP
3397 #pragma omp for
3398 #endif
3399  for (mj_part_t i = 0; i < num_cuts; i++){
3400 
3401  if(this->distribute_points_on_cut_lines){
3402  //init the weight on the cut.
3403  this->global_rectilinear_cut_weight[i] = 0;
3404  this->process_rectilinear_cut_weight[i] = 0;
3405  }
3406  //if already determined at previous iterations,
3407  //then just write the coordinate to new array, and proceed.
3408  if(current_cut_line_determined[i]) {
3409  new_current_cut_coordinates[i] = current_cut_coordinates[i];
3410  continue;
3411  }
3412 
3413  //current weight of the part at the left of the cut line.
// layout: even index 2*i = weight strictly left of cut i,
// odd index 2*i+1 = weight left of cut i including the cut itself.
3414  seen_weight_in_part = current_global_part_weights[i * 2];
3415 
3416  /*
3417  cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part << endl;
3418  cout << "\tcut:" << current_cut_coordinates[i]
3419  << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
3420  << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << endl;
3421  */
3422  //expected ratio
3423  expected_weight_in_part = current_part_target_weights[i];
3424  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
3425  imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3426  //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
3427  imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3428 
3429  bool is_left_imbalance_valid = ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3430  bool is_right_imbalance_valid = ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3431 
3432  //if the cut line reaches to desired imbalance.
3433  if(is_left_imbalance_valid && is_right_imbalance_valid){
3434  current_cut_line_determined[i] = true;
3435 #ifdef HAVE_ZOLTAN2_OMP
3436 #pragma omp atomic
3437 #endif
3438  my_num_incomplete_cut -= 1;
3439  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3440  continue;
3441  }
3442  else if(imbalance_on_left < 0){
3443  //if left imbalance < 0 then we need to move the cut to right.
3444 
3445  if(this->distribute_points_on_cut_lines){
3446  //if it is okay to distribute the coordinate on
3447  //the same coordinate to left and right.
3448  //then check if we can reach to the target weight by including the
3449  //coordinates in the part.
3450  if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3451  //if it is we are done.
3452  current_cut_line_determined[i] = true;
3453 #ifdef HAVE_ZOLTAN2_OMP
3454 #pragma omp atomic
3455 #endif
3456  my_num_incomplete_cut -= 1;
3457 
3458  //then assign everything on the cut to the left of the cut.
3459  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3460 
3461  //for this cut all the weight on cut will be put to left.
3462 
3463  current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3464  continue;
3465  }
3466  else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3467 
3468  //if the weight is larger than the expected weight,
3469  //then we need to distribute some points to left, some to right.
3470  current_cut_line_determined[i] = true;
3471 #ifdef HAVE_ZOLTAN2_OMP
3472 #pragma omp atomic
3473 #endif
3474  *rectilinear_cut_count += 1;
3475  //increase the num cuts to be determined with rectilinear partitioning.
3476 
3477 #ifdef HAVE_ZOLTAN2_OMP
3478 #pragma omp atomic
3479 #endif
3480  my_num_incomplete_cut -= 1;
3481  new_current_cut_coordinates [i] = current_cut_coordinates[i];
// this process's share of the on-cut weight; the split ratio is
// decided after the Teuchos::scan in the omp single block below.
3482  this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3483  current_local_part_weights[i * 2];
3484  continue;
3485  }
3486  }
3487  //we need to move further right,so set lower bound to current line, and shift it to the closes point from right.
3488  current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3489  //set the lower bound weight to the weight we have seen.
3490  current_cut_lower_bound_weights[i] = seen_weight_in_part;
3491 
3492  //compare the upper bound with what has been found in the last iteration.
3493  //we try to make more strict bounds for the cut here.
3494  for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3495  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3496  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3497 
3498  if(p_weight >= expected_weight_in_part){
3499  //if a cut on the right has the expected weight, then we found
3500  //our cut position. Set up and low coordiantes to this new cut coordinate.
3501  //but we need one more iteration to finalize the cut position,
3502  //as wee need to update the part ids.
3503  if(p_weight == expected_weight_in_part){
3504  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3505  current_cut_upper_weights[i] = p_weight;
3506  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3507  current_cut_lower_bound_weights[i] = p_weight;
3508  } else if (p_weight < current_cut_upper_weights[i]){
3509  //if a part weight is larger then my expected weight,
3510  //but lower than my upper bound weight, update upper bound.
3511  current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3512  current_cut_upper_weights[i] = p_weight;
3513  }
3514  break;
3515  }
3516  //if comes here then pw < ew
3517  //then compare the weight against line weight.
3518  if(line_weight >= expected_weight_in_part){
3519  //if the line is larger than the expected weight,
3520  //then we need to reach to the balance by distributing coordinates on this line.
3521  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3522  current_cut_upper_weights[i] = line_weight;
3523  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3524  current_cut_lower_bound_weights[i] = p_weight;
3525  break;
3526  }
3527  //if a stricter lower bound is found,
3528  //update the lower bound.
3529  if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3530  current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3531  current_cut_lower_bound_weights[i] = p_weight;
3532  }
3533  }
3534 
3535 
3536  mj_scalar_t new_cut_position = 0;
3537  this->mj_calculate_new_cut_position(
3538  current_cut_upper_bounds[i],
3539  current_cut_lower_bounds[i],
3540  current_cut_upper_weights[i],
3541  current_cut_lower_bound_weights[i],
3542  expected_weight_in_part, new_cut_position);
3543 
3544  //if cut line does not move significantly.
3545  //then finalize the search.
3546  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3547  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
3548  ){
3549  current_cut_line_determined[i] = true;
3550 #ifdef HAVE_ZOLTAN2_OMP
3551 #pragma omp atomic
3552 #endif
3553  my_num_incomplete_cut -= 1;
3554 
3555  //set the cut coordinate and proceed.
3556  new_current_cut_coordinates [i] = current_cut_coordinates[i];
3557  } else {
3558  new_current_cut_coordinates [i] = new_cut_position;
3559  }
3560  } else {
3561 
3562  //need to move the cut line to left.
3563  //set upper bound to current line.
3564  current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3565  current_cut_upper_weights[i] = seen_weight_in_part;
3566 
3567  // compare the current cut line weights with previous upper and lower bounds.
// mirror of the rightward search above, walking cuts to the left.
3568  for (int ii = i - 1; ii >= 0; --ii){
3569  mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3570  mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3571  if(p_weight <= expected_weight_in_part){
3572  if(p_weight == expected_weight_in_part){
3573  //if the weight of the part is my expected weight
3574  //then we find the solution.
3575  current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3576  current_cut_upper_weights[i] = p_weight;
3577  current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3578  current_cut_lower_bound_weights[i] = p_weight;
3579  }
3580  else if (p_weight > current_cut_lower_bound_weights[i]){
3581  //if found weight is bigger than the lower bound
3582  //then update the lower bound.
3583  current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3584  current_cut_lower_bound_weights[i] = p_weight;
3585 
3586  //at the same time, if weight of line is bigger than the
3587  //expected weight, then update the upper bound as well.
3588  //in this case the balance will be obtained by distributing weightss
3589  //on this cut position.
3590  if(line_weight > expected_weight_in_part){
3591  current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3592  current_cut_upper_weights[i] = line_weight;
3593  }
3594  }
3595  break;
3596  }
3597  //if the weight of the cut on the left is still bigger than my weight,
3598  //and also if the weight is smaller than the current upper weight,
3599  //or if the weight is equal to current upper weight, but on the left of
3600  // the upper weight, then update upper bound.
3601  if (p_weight >= expected_weight_in_part &&
3602  (p_weight < current_cut_upper_weights[i] ||
3603  (p_weight == current_cut_upper_weights[i] &&
3604  current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3605  )
3606  )
3607  ){
3608  current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3609  current_cut_upper_weights[i] = p_weight;
3610  }
3611  }
3612  mj_scalar_t new_cut_position = 0;
3613  this->mj_calculate_new_cut_position(
3614  current_cut_upper_bounds[i],
3615  current_cut_lower_bounds[i],
3616  current_cut_upper_weights[i],
3617  current_cut_lower_bound_weights[i],
3618  expected_weight_in_part,
3619  new_cut_position);
3620 
3621  //if cut line does not move significantly.
3622  if (ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3623  /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
3624  current_cut_line_determined[i] = true;
3625 #ifdef HAVE_ZOLTAN2_OMP
3626 #pragma omp atomic
3627 #endif
3628  my_num_incomplete_cut -= 1;
3629  //set the cut coordinate and proceed.
3630  new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3631  } else {
3632  new_current_cut_coordinates [ i] = new_cut_position;
3633  }
3634  }
3635  }
3636 
3637  //communication to determine the ratios of processors for the distribution
3638  //of coordinates on the cut lines.
3639 #ifdef HAVE_ZOLTAN2_OMP
3640  //no need barrier here as it is implicit.
3641 #pragma omp single
3642 #endif
3643  {
3644  if(*rectilinear_cut_count > 0){
3645 
// inclusive prefix sum of the on-cut weights over the ranks; gives each
// rank how much on-cut weight precedes it (itself included).
3646  try{
3647  Teuchos::scan<int,mj_scalar_t>(
3648  *comm, Teuchos::REDUCE_SUM,
3649  num_cuts,
3650  this->process_rectilinear_cut_weight,
3651  this->global_rectilinear_cut_weight
3652  );
3653  }
3654  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3655 
3656  for (mj_part_t i = 0; i < num_cuts; ++i){
3657  //if cut line weight to be distributed.
3658  if(this->global_rectilinear_cut_weight[i] > 0) {
3659  //expected weight to go to left of the cut.
3660  mj_scalar_t expected_part_weight = current_part_target_weights[i];
3661  //the weight that should be put to left of the cut.
3662  mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3663  //the weight of the cut in the process
3664  mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3665  //the sum of the cut weights upto this process, including the weight of this process.
3666  mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3667  //the space on the left side of the cut after all processes before this process (including this process)
3668  //puts their weights on cut to left.
3669  mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3670  //add my weight to this space to find out how much space is left to me.
3671  mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3672 
3673  /*
3674  cout << "expected_part_weight:" << expected_part_weight
3675  << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
3676  << " my_weight_on_line" << my_weight_on_line
3677  << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
3678  << " space_to_put_left:" << space_to_put_left
3679  << " space_left_to_me" << space_left_to_me << endl;
3680  */
3681  if(space_left_to_me < 0){
3682  //space_left_to_me is negative and i dont need to put anything to left.
3683  current_part_cut_line_weight_to_put_left[i] = 0;
3684  }
3685  else if(space_left_to_me >= my_weight_on_line){
3686  //space left to me is bigger than the weight of the processor on cut.
3687  //so put everything to left.
3688  current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3689  //cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << endl;
3690  }
3691  else {
3692  //put only the weight as much as the space.
3693  current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
3694 
3695  //cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << endl;
3696  }
3697 
3698  }
3699  }
3700  *rectilinear_cut_count = 0;
3701  }
3702  }
3703 }
3704 
3714 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3715  typename mj_part_t>
3717  mj_part_t num_procs,
3718  mj_part_t num_parts,
3719  mj_gno_t *&num_points_in_all_processor_parts){
3720 
3721  //initially allocation_size is num_parts
3722  size_t allocation_size = num_parts * (num_procs + 1);
3723 
3724  //this will be output
3725  //holds how many each processor has in each part.
3726  //last portion is the sum of all processor points in each part.
3727 
3728  //allocate memory for the local num coordinates in each part.
3729  mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3730 
3731 
3732  //this is the portion of the memory which will be used
3733  //at the summation to obtain total number of processors' points in each part.
3734  mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3735  //this is the portion of the memory where each stores its local number.
3736  //this information is needed by other processors.
3737  mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3738 
3739  //initialize the array with 0's.
3740  memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
3741 
3742  //write the number of coordinates in each part.
3743  for (mj_part_t i = 0; i < num_parts; ++i){
3744  mj_lno_t part_begin_index = 0;
3745  if (i > 0){
3746  part_begin_index = this->new_part_xadj[i - 1];
3747  }
3748  mj_lno_t part_end_index = this->new_part_xadj[i];
3749  my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3750  }
3751 
3752  //copy the local num parts to the last portion of array,
3753  //so that this portion will represent the global num points in each part after the reduction.
3754  memcpy (my_local_point_counts_in_each_art,
3755  my_local_points_to_reduce_sum,
3756  sizeof(mj_gno_t) * (num_parts) );
3757 
3758 
3759  //reduceAll operation.
3760  //the portion that belongs to a processor with index p
3761  //will start from myRank * num_parts.
3762  //the global number of points will be held at the index
3763  try{
3764  reduceAll<int, mj_gno_t>(
3765  *(this->comm),
3766  Teuchos::REDUCE_SUM,
3767  allocation_size,
3768  num_local_points_in_each_part_to_reduce_sum,
3769  num_points_in_all_processor_parts);
3770  }
3771  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
3772  freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
3773 }
3774 
3775 
3776 
3789 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3790  typename mj_part_t>
3792  size_t migration_reduce_all_population,
3793  mj_lno_t num_coords_for_last_dim_part,
3794  mj_part_t num_procs,
3795  mj_part_t num_parts,
3796  mj_gno_t *num_points_in_all_processor_parts){
3797 
3798  //if reduce all count and population in the last dim is too high
3799  if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
3800  //if the work in a part per processor in the last dim is too low.
3801  if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
3802 
3803  //if migration is to be checked and the imbalance is too high
3804  if (this->check_migrate_avoid_migration_option == 0){
3805  double global_imbalance = 0;
3806  //global shift to reach the sum of coordiante count in each part.
3807  size_t global_shift = num_procs * num_parts;
3808 
3809  for (mj_part_t ii = 0; ii < num_procs; ++ii){
3810  for (mj_part_t i = 0; i < num_parts; ++i){
3811  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
3812  / double(num_procs);
3813 
3814  global_imbalance += ZOLTAN2_ABS(ideal_num -
3815  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
3816  }
3817  }
3818  global_imbalance /= num_parts;
3819  global_imbalance /= num_procs;
3820 
3821  /*
3822  if (this->myRank == 0) {
3823  cout << "imbalance for next iteration:" << global_imbalance << endl;
3824  }
3825  */
3826 
3827  if(global_imbalance <= this->minimum_migration_imbalance){
3828  return false;
3829  }
3830  else {
3831  return true;
3832  }
3833  }
3834  else {
3835  //if migration is forced
3836  return true;
3837  }
3838 }
3839 
3840 
3850 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3851  typename mj_part_t>
3853  mj_part_t num_parts,
3854  mj_part_t *part_assignment_proc_begin_indices,
3855  mj_part_t *processor_chains_in_parts,
3856  mj_lno_t *send_count_to_each_proc,
3857  int *coordinate_destinations){
3858 
3859  for (mj_part_t p = 0; p < num_parts; ++p){
3860  mj_lno_t part_begin = 0;
3861  if (p > 0) part_begin = this->new_part_xadj[p - 1];
3862  mj_lno_t part_end = this->new_part_xadj[p];
3863 
3864  //get the first part that current processor will send its part-p.
3865  mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
3866  //initialize how many point I sent to this processor.
3867  mj_lno_t num_total_send = 0;
3868  for (mj_lno_t j=part_begin; j < part_end; j++){
3869  mj_lno_t local_ind = this->new_coordinate_permutations[j];
3870  while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
3871  //then get the next processor to send the points in part p.
3872  num_total_send = 0;
3873  //assign new processor to part_assign_begin[p]
3874  part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
3875  //remove the previous processor
3876  processor_chains_in_parts[proc_to_sent] = -1;
3877  //choose the next processor as the next one to send.
3878  proc_to_sent = part_assignment_proc_begin_indices[p];
3879  }
3880  //write the gno index to corresponding position in sendBuf.
3881  coordinate_destinations[local_ind] = proc_to_sent;
3882  ++num_total_send;
3883  }
3884  }
3885 }
3886 
3901 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3902  typename mj_part_t>
3904  mj_gno_t * num_points_in_all_processor_parts,
3905  mj_part_t num_parts,
3906  mj_part_t num_procs,
3907  mj_lno_t *send_count_to_each_proc,
3908  std::vector<mj_part_t> &processor_ranks_for_subcomm,
3909  std::vector<mj_part_t> *next_future_num_parts_in_parts,
3910  mj_part_t &out_part_index,
3911  mj_part_t &output_part_numbering_begin_index,
3912  int *coordinate_destinations){
3913 
3914 
3915  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
3916  mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
3917 
3918  //boolean variable if the process finds its part to be assigned.
3919  bool did_i_find_my_group = false;
3920 
3921  mj_part_t num_free_procs = num_procs;
3922  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
3923 
3924  double max_imbalance_difference = 0;
3925  mj_part_t max_differing_part = 0;
3926 
3927  //find how many processor each part requires.
3928  for (mj_part_t i=0; i < num_parts; i++){
3929 
3930  //scalar portion of the required processors
3931  double scalar_required_proc = num_procs *
3932  (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
3933 
3934  //round it to closest integer.
3935  mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
3936 
3937  //if assigning the required num procs, creates problems for the rest of the parts.
3938  //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
3939  if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
3940  required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
3941  }
3942 
3943  //reduce the free processor count
3944  num_free_procs -= required_proc;
3945  //reduce the free minimum processor count required for the rest of the part by 1.
3946  --minimum_num_procs_required_for_rest_of_parts;
3947 
3948  //part (i) is assigned to (required_proc) processors.
3949  num_procs_assigned_to_each_part[i] = required_proc;
3950 
3951  //because of the roundings some processors might be left as unassigned.
3952  //we want to assign those processors to the part with most imbalance.
3953  //find the part with the maximum imbalance here.
3954  double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
3955  if (imbalance_wrt_ideal > max_imbalance_difference){
3956  max_imbalance_difference = imbalance_wrt_ideal;
3957  max_differing_part = i;
3958  }
3959  }
3960 
3961  //assign extra processors to the part with maximum imbalance than the ideal.
3962  if (num_free_procs > 0){
3963  num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
3964  }
3965 
3966  //now find what are the best processors with least migration for each part.
3967 
3968  //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning
3969  //index of a processor that processor sends its data for part - i
3970  mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
3971  //the next processor send is found in processor_chains_in_parts, in linked list manner.
3972  mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
3973  mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
3974 
3975  //initialize the assignment of each processor.
3976  //this has a linked list implementation.
3977  //the beginning of processors assigned
3978  //to each part is hold at part_assignment_proc_begin_indices[part].
3979  //then the next processor assigned to that part is located at
3980  //proc_part_assignments[part_assign_begins[part]], this is a chain
3981  //until the value of -1 is reached.
3982  for (int i = 0; i < num_procs; ++i ){
3983  processor_part_assignments[i] = -1;
3984  processor_chains_in_parts[i] = -1;
3985  }
3986  for (int i = 0; i < num_parts; ++i ){
3987  part_assignment_proc_begin_indices[i] = -1;
3988  }
3989 
3990 
3991  //Allocate memory for sorting data structure.
3992  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_part_points_in_procs = allocMemory <uSortItem<mj_part_t, mj_gno_t> > (num_procs);
3993  for(mj_part_t i = 0; i < num_parts; ++i){
3994  //the algorithm tries to minimize the cost of migration,
3995  //by assigning the processors with highest number of coordinates on that part.
3996  //here we might want to implement a maximum weighted bipartite matching algorithm.
3997  for(mj_part_t ii = 0; ii < num_procs; ++ii){
3998  sort_item_num_part_points_in_procs[ii].id = ii;
3999  //if processor is not assigned yet.
4000  //add its num points to the sort data structure.
4001  if (processor_part_assignments[ii] == -1){
4002  sort_item_num_part_points_in_procs[ii].val =
4003  num_points_in_all_processor_parts[ii * num_parts + i];
4004  }
4005  else {
4006  //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
4007  //would be same if we simply set it to -1,
4008  //but more information with no extra cost (which is used later) is provided.
4009  sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4010  }
4011  }
4012  //sort the processors in the part.
4013  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_part_points_in_procs);
4014 
4015  mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4016  mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4017  mj_gno_t ideal_num_points_in_a_proc =
4018  Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
4019 
4020  //starts sending to least heaviest part.
4021  mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4022  mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4023  mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4024 
4025  //find the processors that will be assigned to this part, which are the heaviest
4026  //non assigned processors.
4027  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4028  mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4029  //assign processor to part - i.
4030  processor_part_assignments[proc_id] = i;
4031  }
4032 
4033  bool did_change_sign = false;
4034  //if processor has a minus count, reverse it.
4035  for(mj_part_t ii = 0; ii < num_procs; ++ii){
4036  // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
4037  // TODO: SEE BUG 6194
4038  if (sort_item_num_part_points_in_procs[ii].val < 0){
4039  did_change_sign = true;
4040  sort_item_num_part_points_in_procs[ii].val = -sort_item_num_part_points_in_procs[ii].val - 1;
4041  }
4042  else {
4043  break;
4044  }
4045  }
4046  if(did_change_sign){
4047  //resort the processors in the part for the rest of the processors that is not assigned.
4048  uqsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4049  }
4050 
4051  //check if this processors is one of the procs assigned to this part.
4052  //if it is, then get the group.
4053  if (!did_i_find_my_group){
4054  for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4055 
4056  mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4057  //add the proc to the group.
4058  processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4059 
4060  if(proc_id_to_assign == this->myRank){
4061  //if the assigned process is me, then I find my group.
4062  did_i_find_my_group = true;
4063  //set the beginning of part i to my rank.
4064  part_assignment_proc_begin_indices[i] = this->myRank;
4065  processor_chains_in_parts[this->myRank] = -1;
4066 
4067  //set send count to myself to the number of points that I have in part i.
4068  send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4069 
4070  //calculate the shift required for the output_part_numbering_begin_index
4071  for (mj_part_t in = 0; in < i; ++in){
4072  output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4073  }
4074  out_part_index = i;
4075  }
4076  }
4077  //if these was not my group,
4078  //clear the subcomminicator processor array.
4079  if (!did_i_find_my_group){
4080  processor_ranks_for_subcomm.clear();
4081  }
4082  }
4083 
4084  //send points of the nonassigned coordinates to the assigned coordinates.
4085  //starts from the heaviest nonassigned processor.
4086  //TODO we might want to play with this part, that allows more computational imbalance
4087  //but having better communication balance.
4088  for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4089  mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4090  mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4091 
4092  //we set number of points to -to_sent - 1 for the assigned processors.
4093  //we reverse it here. This should not happen, as we have already reversed them above.
4094 #ifdef MJ_DEBUG
4095  if (num_points_to_sent < 0) {
4096  cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
4097  exit(1);
4098  }
4099 #endif
4100 
4101  //now sends the points to the assigned processors.
4102  while (num_points_to_sent > 0){
4103  //if the processor has enough space.
4104  if (num_points_to_sent <= space_left_in_sent_proc){
4105  //reduce the space left in the processor.
4106  space_left_in_sent_proc -= num_points_to_sent;
4107  //if my rank is the one that is sending the coordinates.
4108  if (this->myRank == nonassigned_proc_id){
4109  //set my sent count to the sent processor.
4110  send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4111  //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
4112  //that the processor will send its point in part-i.
4113  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4114  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4115  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4116  }
4117  num_points_to_sent = 0;
4118  }
4119  else {
4120  //there might be no space left in the processor.
4121  if(space_left_in_sent_proc > 0){
4122  num_points_to_sent -= space_left_in_sent_proc;
4123 
4124  //send as the space left in the processor.
4125  if (this->myRank == nonassigned_proc_id){
4126  //send as much as the space in this case.
4127  send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4128  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4129  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4130  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4131 
4132  }
4133  }
4134  //change the sent part
4135  ++next_proc_to_send_index;
4136 
4137 #ifdef MJ_DEBUG
4138  if(next_part_to_send_index < nprocs - required_proc_count ){
4139  cout << "Migration - processor assignments - for part:"
4140  << i
4141  << " next_part_to_send :" << next_part_to_send_index
4142  << " nprocs:" << nprocs
4143  << " required_proc_count:" << required_proc_count
4144  << " Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4145  exit(1)l
4146 
4147  }
4148 #endif
4149  //send the new id.
4150  next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4151  //set the new space in the processor.
4152  space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4153  }
4154  }
4155  }
4156  }
4157 
4158 
4159 
4160  this->assign_send_destinations(
4161  num_parts,
4162  part_assignment_proc_begin_indices,
4163  processor_chains_in_parts,
4164  send_count_to_each_proc,
4165  coordinate_destinations);
4166 
4167  freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4168  freeArray<mj_part_t>(processor_chains_in_parts);
4169  freeArray<mj_part_t>(processor_part_assignments);
4170  freeArray<uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_part_points_in_procs);
4171  freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4172 
4173 }
4174 
4175 
4188 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4189  typename mj_part_t>
4191  mj_part_t num_parts,
4192  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
4193  int *coordinate_destinations,
4194  mj_part_t &output_part_numbering_begin_index,
4195  std::vector<mj_part_t> *next_future_num_parts_in_parts){
4196 
4197  mj_part_t part_shift_amount = output_part_numbering_begin_index;
4198  mj_part_t previous_processor = -1;
4199  for(mj_part_t i = 0; i < num_parts; ++i){
4200  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4201  //assigned processors are sorted.
4202  mj_lno_t part_begin_index = 0;
4203  if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4204  mj_lno_t part_end_index = this->new_part_xadj[p];
4205 
4206  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4207  if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4208  output_part_numbering_begin_index = part_shift_amount;
4209  }
4210  previous_processor = assigned_proc;
4211  part_shift_amount += (*next_future_num_parts_in_parts)[p];
4212 
4213  for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4214  mj_lno_t localInd = this->new_coordinate_permutations[j];
4215  coordinate_destinations[localInd] = assigned_proc;
4216  }
4217  }
4218 }
4219 
4220 
4237 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4238  typename mj_part_t>
4240  mj_gno_t * num_points_in_all_processor_parts,
4241  mj_part_t num_parts,
4242  mj_part_t num_procs,
4243  mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
4244  std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
4245  mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
4246  std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
4247  mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
4248  int *coordinate_destinations){
4249  out_num_part = 0;
4250 
4251  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4252  out_part_indices.clear();
4253 
4254  //to sort the parts that is assigned to the processors.
4255  //id is the part number, sort value is the assigned processor id.
4256  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4257  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4258 
4259 
4260  //calculate the optimal number of coordinates that should be assigned to each processor.
4261  mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
4262  //to hold the left space as the number of coordinates to the optimal number in each proc.
4263  mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4264  //initialize left space in each.
4265  for (mj_part_t i = 0; i < num_procs; ++i){
4266  space_in_each_processor[i] = work_each;
4267  }
4268 
4269  //we keep track of how many parts each processor is assigned to.
4270  //because in some weird inputs, it might be possible that some
4271  //processors is not assigned to any part. Using these variables,
4272  //we force each processor to have at least one part.
4273  mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4274  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
4275  int empty_proc_count = num_procs;
4276 
4277  //to sort the parts with decreasing order of their coordiantes.
4278  //id are the part numbers, sort value is the number of points in each.
4279  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4280 
4281  //initially we will sort the parts according to the number of coordinates they have.
4282  //so that we will start assigning with the part that has the most number of coordinates.
4283  for (mj_part_t i = 0; i < num_parts; ++i){
4284  sort_item_point_counts_in_parts[i].id = i;
4285  sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4286  }
4287  //sort parts with increasing order of loads.
4288  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4289 
4290 
4291  //assigning parts to the processors
4292  //traverse the part win decreasing order of load.
4293  //first assign the heaviest part.
4294  for (mj_part_t j = 0; j < num_parts; ++j){
4295  //sorted with increasing order, traverse inverse.
4296  mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4297  //load of the part
4298  mj_gno_t load = global_num_points_in_parts[i];
4299 
4300  //assigned processors
4301  mj_part_t assigned_proc = -1;
4302  //if not fit best processor.
4303  mj_part_t best_proc_to_assign = 0;
4304 
4305 
4306  //sort processors with increasing number of points in this part.
4307  for (mj_part_t ii = 0; ii < num_procs; ++ii){
4308  sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4309 
4310  //if there are still enough parts to fill empty processors, than proceed normally.
4311  //but if empty processor count is equal to the number of part, then
4312  //we force to part assignments only to empty processors.
4313  if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4314  //how many points processor ii has in part i?
4315  sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4316  }
4317  else {
4318  sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4319  }
4320  }
4321  uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4322 
4323  //traverse all processors with decreasing load.
4324  for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4325  mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4326  mj_lno_t left_space = space_in_each_processor[ii] - load;
4327  //if enought space, assign to this part.
4328  if(left_space >= 0 ){
4329  assigned_proc = ii;
4330  break;
4331  }
4332  //if space is not enough, store the best candidate part.
4333  if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4334  best_proc_to_assign = ii;
4335  }
4336  }
4337 
4338  //if none had enough space, then assign it to best part.
4339  if (assigned_proc == -1){
4340  assigned_proc = best_proc_to_assign;
4341  }
4342 
4343  if (num_parts_proc_assigned[assigned_proc]++ == 0){
4344  --empty_proc_count;
4345  }
4346  space_in_each_processor[assigned_proc] -= load;
4347  //to sort later, part-i is assigned to the proccessor - assignment.
4348  sort_item_part_to_proc_assignment[j].id = i; //part i
4349  sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.
4350 
4351 
4352  //if assigned processor is me, increase the number.
4353  if (assigned_proc == this->myRank){
4354  out_num_part++;//assigned_part_count;
4355  out_part_indices.push_back(i);
4356  }
4357  //increase the send to that processor by the number of points in that part.
4358  //as everyone send their coordiantes in this part to the processor assigned to this part.
4359  send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4360  }
4361  freeArray<mj_part_t>(num_parts_proc_assigned);
4362  freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4363  freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4364  freeArray<mj_lno_t >(space_in_each_processor);
4365 
4366 
4367  //sort assignments with respect to the assigned processors.
4368  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4369  //fill sendBuf.
4370 
4371 
4372  this->assign_send_destinations2(
4373  num_parts,
4374  sort_item_part_to_proc_assignment,
4375  coordinate_destinations,
4376  output_part_numbering_begin_index,
4377  next_future_num_parts_in_parts);
4378 
4379  freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4380 }
4381 
4382 
4400 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4401  typename mj_part_t>
4403  mj_gno_t * num_points_in_all_processor_parts,
4404  mj_part_t num_parts,
4405  mj_part_t num_procs,
4406  mj_lno_t *send_count_to_each_proc,
4407  std::vector<mj_part_t> &processor_ranks_for_subcomm,
4408  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4409  mj_part_t &out_num_part,
4410  std::vector<mj_part_t> &out_part_indices,
4411  mj_part_t &output_part_numbering_begin_index,
4412  int *coordinate_destinations){
4413 
4414 
4415 
4416  processor_ranks_for_subcomm.clear();
4417  // if (this->num_local_coords > 0)
4418  if (num_procs > num_parts){
4419  //if there are more processors than the number of current part
4420  //then processors share the existing parts.
4421  //at the end each processor will have a single part,
4422  //but a part will be shared by a group of processors.
4423  mj_part_t out_part_index = 0;
4424  this->mj_assign_proc_to_parts(
4425  num_points_in_all_processor_parts,
4426  num_parts,
4427  num_procs,
4428  send_count_to_each_proc,
4429  processor_ranks_for_subcomm,
4430  next_future_num_parts_in_parts,
4431  out_part_index,
4432  output_part_numbering_begin_index,
4433  coordinate_destinations
4434  );
4435 
4436  out_num_part = 1;
4437  out_part_indices.clear();
4438  out_part_indices.push_back(out_part_index);
4439  }
4440  else {
4441 
4442  //there are more parts than the processors.
4443  //therefore a processor will be assigned multiple parts,
4444  //the subcommunicators will only have a single processor.
4445  processor_ranks_for_subcomm.push_back(this->myRank);
4446 
4447  //since there are more parts then procs,
4448  //assign multiple parts to processors.
4449  this->mj_assign_parts_to_procs(
4450  num_points_in_all_processor_parts,
4451  num_parts,
4452  num_procs,
4453  send_count_to_each_proc,
4454  next_future_num_parts_in_parts,
4455  out_num_part,
4456  out_part_indices,
4457  output_part_numbering_begin_index,
4458  coordinate_destinations);
4459  }
4460 }
4461 
4474 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4475  typename mj_part_t>
4477  mj_part_t num_procs,
4478  mj_lno_t &num_new_local_points,
4479  std::string iteration,
4480  int *coordinate_destinations,
4481  mj_part_t num_parts)
4482 {
4483 #ifdef ENABLE_ZOLTAN_MIGRATION
4484  if (sizeof(mj_lno_t) <= sizeof(int)) {
4485 
4486  // Cannot use Zoltan_Comm with local ordinals larger than ints.
4487  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
4488  // may overflow.
4489 
4490  ZOLTAN_COMM_OBJ *plan = NULL;
4491  MPI_Comm mpi_comm = Teuchos2MPI (this->comm);
4492  int num_incoming_gnos = 0;
4493  int message_tag = 7859;
4494 
4495  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4496  int ierr = Zoltan_Comm_Create(
4497  &plan,
4498  int(this->num_local_coords),
4499  coordinate_destinations,
4500  mpi_comm,
4501  message_tag,
4502  &num_incoming_gnos);
4503  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4504  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
4505 
4506  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4507  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4508 
4509  //migrate gnos.
4510  message_tag++;
4511  ierr = Zoltan_Comm_Do(
4512  plan,
4513  message_tag,
4514  (char *) this->current_mj_gnos,
4515  sizeof(mj_gno_t),
4516  (char *) incoming_gnos);
4517  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4518 
4519  freeArray<mj_gno_t>(this->current_mj_gnos);
4520  this->current_mj_gnos = incoming_gnos;
4521 
4522 
4523  //migrate coordinates
4524  for (int i = 0; i < this->coord_dim; ++i){
4525  message_tag++;
4526  mj_scalar_t *coord = this->mj_coordinates[i];
4527 
4528  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4529  ierr = Zoltan_Comm_Do(
4530  plan,
4531  message_tag,
4532  (char *) coord,
4533  sizeof(mj_scalar_t),
4534  (char *) this->mj_coordinates[i]);
4535  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4536  freeArray<mj_scalar_t>(coord);
4537  }
4538 
4539  //migrate weights.
4540  for (int i = 0; i < this->num_weights_per_coord; ++i){
4541  message_tag++;
4542  mj_scalar_t *weight = this->mj_weights[i];
4543 
4544  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4545  ierr = Zoltan_Comm_Do(
4546  plan,
4547  message_tag,
4548  (char *) weight,
4549  sizeof(mj_scalar_t),
4550  (char *) this->mj_weights[i]);
4551  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4552  freeArray<mj_scalar_t>(weight);
4553  }
4554 
4555 
4556  //migrate owners.
4557  int *coord_own = allocMemory<int>(num_incoming_gnos);
4558  message_tag++;
4559  ierr = Zoltan_Comm_Do(
4560  plan,
4561  message_tag,
4562  (char *) this->owner_of_coordinate,
4563  sizeof(int), (char *) coord_own);
4564  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4565  freeArray<int>(this->owner_of_coordinate);
4566  this->owner_of_coordinate = coord_own;
4567 
4568 
4569  //if num procs is less than num parts,
4570  //we need the part assigment arrays as well, since
4571  //there will be multiple parts in processor.
4572  mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4573  if(num_procs < num_parts){
4574  message_tag++;
4575  ierr = Zoltan_Comm_Do(
4576  plan,
4577  message_tag,
4578  (char *) this->assigned_part_ids,
4579  sizeof(mj_part_t),
4580  (char *) new_parts);
4581  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4582  }
4583  freeArray<mj_part_t>(this->assigned_part_ids);
4584  this->assigned_part_ids = new_parts;
4585 
4586  ierr = Zoltan_Comm_Destroy(&plan);
4587  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
4588  num_new_local_points = num_incoming_gnos;
4589  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
4590  }
4591 
4592  else
4593 
4594 #endif // ENABLE_ZOLTAN_MIGRATION
4595  {
4596  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4597  Tpetra::Distributor distributor(this->comm);
4598  ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4599  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4600  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
4601 
4602  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4603  {
4604  //migrate gnos.
4605  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4606  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4607  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4608  freeArray<mj_gno_t>(this->current_mj_gnos);
4609  this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4610  memcpy(
4611  this->current_mj_gnos,
4612  received_gnos.getRawPtr(),
4613  num_incoming_gnos * sizeof(mj_gno_t));
4614  }
4615  //migrate coordinates
4616  for (int i = 0; i < this->coord_dim; ++i){
4617 
4618  ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4619  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4620  distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4621  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4622  this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4623  memcpy(
4624  this->mj_coordinates[i],
4625  received_coord.getRawPtr(),
4626  num_incoming_gnos * sizeof(mj_scalar_t));
4627  }
4628 
4629  //migrate weights.
4630  for (int i = 0; i < this->num_weights_per_coord; ++i){
4631 
4632  ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4633  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4634  distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4635  freeArray<mj_scalar_t>(this->mj_weights[i]);
4636  this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4637  memcpy(
4638  this->mj_weights[i],
4639  received_weight.getRawPtr(),
4640  num_incoming_gnos * sizeof(mj_scalar_t));
4641  }
4642 
4643  {
4644  //migrate the owners of the coordinates
4645  ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4646  ArrayRCP<int> received_owners(num_incoming_gnos);
4647  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
4648  freeArray<int>(this->owner_of_coordinate);
4649  this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4650  memcpy(
4651  this->owner_of_coordinate,
4652  received_owners.getRawPtr(),
4653  num_incoming_gnos * sizeof(int));
4654  }
4655 
4656  //if num procs is less than num parts,
4657  //we need the part assigment arrays as well, since
4658  //there will be multiple parts in processor.
4659  if(num_procs < num_parts){
4660  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4661  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4662  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4663  freeArray<mj_part_t>(this->assigned_part_ids);
4664  this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4665  memcpy(
4666  this->assigned_part_ids,
4667  received_partids.getRawPtr(),
4668  num_incoming_gnos * sizeof(mj_part_t));
4669  }
4670  else {
4671  mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
4672  freeArray<mj_part_t>(this->assigned_part_ids);
4673  this->assigned_part_ids = new_parts;
4674  }
4675  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
4676  num_new_local_points = num_incoming_gnos;
4677 
4678  }
4679 }
4680 
4687 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4688  typename mj_part_t>
4689 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
4690  mj_part_t group_size = processor_ranks_for_subcomm.size();
4691  mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4692  for(mj_part_t i = 0; i < group_size; ++i) {
4693  ids[i] = processor_ranks_for_subcomm[i];
4694  }
4695  ArrayView<const mj_part_t> idView(ids, group_size);
4696  this->comm = this->comm->createSubcommunicator(idView);
4697  freeArray<mj_part_t>(ids);
4698 }
4699 
4700 
4706 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4707  typename mj_part_t>
4709  mj_part_t output_num_parts,
4710  mj_part_t num_parts){
4711  //if there is single output part, then simply fill the permutation array.
4712  if (output_num_parts == 1){
4713  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4714  this->new_coordinate_permutations[i] = i;
4715  }
4716  this->new_part_xadj[0] = this->num_local_coords;
4717  }
4718  else {
4719 
4720  //otherwise we need to count how many points are there in each part.
4721  //we allocate here as num_parts, because the sent partids are up to num_parts,
4722  //although there are outout_num_parts different part.
4723  mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4724  //part shift holds the which part number an old part number corresponds to.
4725  mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4726 
4727  memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
4728 
4729  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4730  mj_part_t ii = this->assigned_part_ids[i];
4731  ++num_points_in_parts[ii];
4732  }
4733 
4734  //write the end points of the parts.
4735  mj_part_t p = 0;
4736  mj_lno_t prev_index = 0;
4737  for(mj_part_t i = 0; i < num_parts; ++i){
4738  if(num_points_in_parts[i] > 0) {
4739  this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4740  prev_index += num_points_in_parts[i];
4741  part_shifts[i] = p++;
4742  }
4743  }
4744 
4745  //for the rest of the parts write the end index as end point.
4746  mj_part_t assigned_num_parts = p - 1;
4747  for (;p < num_parts; ++p){
4748  this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4749  }
4750  for(mj_part_t i = 0; i < output_num_parts; ++i){
4751  num_points_in_parts[i] = this->new_part_xadj[i];
4752  }
4753 
4754  //write the permutation array here.
4755  //get the part of the coordinate i, shift it to obtain the new part number.
4756  //assign it to the end of the new part numbers pointer.
4757  for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
4758  mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
4759  this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
4760  }
4761 
4762  freeArray<mj_lno_t>(num_points_in_parts);
4763  freeArray<mj_part_t>(part_shifts);
4764  }
4765 }
4766 
4767 
4790 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4791  typename mj_part_t>
4793  mj_part_t input_num_parts, //current umb parts
4794  mj_part_t &output_num_parts, //output umb parts.
4795  std::vector<mj_part_t> *next_future_num_parts_in_parts,
4796  mj_part_t &output_part_begin_index,
4797  size_t migration_reduce_all_population,
4798  mj_lno_t num_coords_for_last_dim_part,
4799  std::string iteration,
4800  RCP<mj_partBoxVector_t> &input_part_boxes,
4801  RCP<mj_partBoxVector_t> &output_part_boxes
4802 )
4803 {
4804  mj_part_t num_procs = this->comm->getSize();
4805  this->myRank = this->comm->getRank();
4806 
4807 
4808  //this array holds how many points each processor has in each part.
4809  //to access how many points processor i has on part j,
4810  //num_points_in_all_processor_parts[i * num_parts + j]
4811  mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
4812 
4813  //get the number of coordinates in each part in each processor.
4814  this->get_processor_num_points_in_parts(
4815  num_procs,
4816  input_num_parts,
4817  num_points_in_all_processor_parts);
4818 
4819 
4820  //check if migration will be performed or not.
4821  if (!this->mj_check_to_migrate(
4822  migration_reduce_all_population,
4823  num_coords_for_last_dim_part,
4824  num_procs,
4825  input_num_parts,
4826  num_points_in_all_processor_parts)){
4827  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
4828  return false;
4829  }
4830 
4831 
4832  mj_lno_t *send_count_to_each_proc = NULL;
4833  int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
4834  send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
4835  for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
4836 
4837  std::vector<mj_part_t> processor_ranks_for_subcomm;
4838  std::vector<mj_part_t> out_part_indices;
4839 
4840  //determine which processors are assigned to which parts
4841  this->mj_migration_part_proc_assignment(
4842  num_points_in_all_processor_parts,
4843  input_num_parts,
4844  num_procs,
4845  send_count_to_each_proc,
4846  processor_ranks_for_subcomm,
4847  next_future_num_parts_in_parts,
4848  output_num_parts,
4849  out_part_indices,
4850  output_part_begin_index,
4851  coordinate_destinations);
4852 
4853 
4854 
4855 
4856  freeArray<mj_lno_t>(send_count_to_each_proc);
4857  std::vector <mj_part_t> tmpv;
4858 
4859  std::sort (out_part_indices.begin(), out_part_indices.end());
4860  mj_part_t outP = out_part_indices.size();
4861 
4862  mj_gno_t new_global_num_points = 0;
4863  mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
4864 
4865  if (this->mj_keep_part_boxes){
4866  input_part_boxes->clear();
4867  }
4868 
4869  //now we calculate the new values for next_future_num_parts_in_parts.
4870  //same for the part boxes.
4871  for (mj_part_t i = 0; i < outP; ++i){
4872  mj_part_t ind = out_part_indices[i];
4873  new_global_num_points += global_num_points_in_parts[ind];
4874  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
4875  if (this->mj_keep_part_boxes){
4876  input_part_boxes->push_back((*output_part_boxes)[ind]);
4877  }
4878  }
4879  //swap the input and output part boxes.
4880  if (this->mj_keep_part_boxes){
4881  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
4882  input_part_boxes = output_part_boxes;
4883  output_part_boxes = tmpPartBoxes;
4884  }
4885  next_future_num_parts_in_parts->clear();
4886  for (mj_part_t i = 0; i < outP; ++i){
4887  mj_part_t p = tmpv[i];
4888  next_future_num_parts_in_parts->push_back(p);
4889  }
4890 
4891  freeArray<mj_gno_t>(num_points_in_all_processor_parts);
4892 
4893  mj_lno_t num_new_local_points = 0;
4894 
4895 
4896  //perform the actual migration operation here.
4897  this->mj_migrate_coords(
4898  num_procs,
4899  num_new_local_points,
4900  iteration,
4901  coordinate_destinations,
4902  input_num_parts);
4903 
4904 
4905  freeArray<int>(coordinate_destinations);
4906 
4907  if(this->num_local_coords != num_new_local_points){
4908  freeArray<mj_lno_t>(this->new_coordinate_permutations);
4909  freeArray<mj_lno_t>(this->coordinate_permutations);
4910 
4911  this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
4912  this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
4913  }
4914  this->num_local_coords = num_new_local_points;
4915  this->num_global_coords = new_global_num_points;
4916 
4917 
4918 
4919  //create subcommunicator.
4920  this->create_sub_communicator(processor_ranks_for_subcomm);
4921  processor_ranks_for_subcomm.clear();
4922 
4923  //fill the new permutation arrays.
4924  this->fill_permutation_array(
4925  output_num_parts,
4926  input_num_parts);
4927  return true;
4928 }
4929 
4930 
4944 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4945  typename mj_part_t>
4947  mj_part_t num_parts,
4948  mj_scalar_t *mj_current_dim_coords,
4949  mj_scalar_t *current_concurrent_cut_coordinate,
4950  mj_lno_t coordinate_begin,
4951  mj_lno_t coordinate_end,
4952  mj_scalar_t *used_local_cut_line_weight_to_left,
4953  mj_lno_t *out_part_xadj,
4954  int coordInd){
4955 
4956  //mj_lno_t numCoordsInPart = coordinateEnd - coordinateBegin;
4957  mj_part_t no_cuts = num_parts - 1;
4958 
4959 
4960 
4961  int me = 0;
4962  mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
4963  mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
4964 
4965 
4966  //now if the rectilinear partitioning is allowed we decide how
4967  //much weight each thread should put to left and right.
4968  if (this->distribute_points_on_cut_lines){
4969 
4970  my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
4971  for (mj_part_t i = 0; i < no_cuts; ++i){
4972  //the left to be put on the left of the cut.
4973  mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
4974  //cout << "i:" << i << " left_weight:" << left_weight << endl;
4975  for(int ii = 0; ii < this->num_threads; ++ii){
4976  if(left_weight > this->sEpsilon){
4977  //the weight of thread ii on cut.
4978  mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
4979  if(thread_ii_weight_on_cut < left_weight){
4980  this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
4981  }
4982  else {
4983  this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
4984  }
4985  left_weight -= thread_ii_weight_on_cut;
4986  }
4987  else {
4988  this->thread_cut_line_weight_to_put_left[ii][i] = 0;
4989  }
4990  }
4991  }
4992 
4993  if(no_cuts > 0){
4994  //this is a special case. If cutlines share the same coordinate, their weights are equal.
4995  //we need to adjust the ratio for that.
4996  for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
4997  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
4998  my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
4999  }
5000  my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
5001  / mj_scalar_t(SIGNIFICANCE_MUL);
5002  }
5003  }
5004  }
5005 
5006  for(mj_part_t ii = 0; ii < num_parts; ++ii){
5007  thread_num_points_in_parts[ii] = 0;
5008  }
5009 
5010  //for this specific case we dont want to distribute the points along the cut position
5011  //randomly, as we need a specific ordering of them. Instead,
5012  //we put the coordinates into a sort item, where we sort those
5013  //using the coordinates of points on other dimensions and the index.
5014 
5015 
5016  //some of the cuts might share the same position.
5017  //in this case, if cut i and cut j share the same position
5018  //cut_map[i] = cut_map[j] = sort item index.
5019  mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5020 
5021 
5022  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5023  typedef std::vector< multiSItem > multiSVector;
5024  typedef std::vector<multiSVector> multiS2Vector;
5025 
5026  //to keep track of the memory allocated.
5027  std::vector<mj_scalar_t *>allocated_memory;
5028 
5029  //vector for which the coordinates will be sorted.
5030  multiS2Vector sort_vector_points_on_cut;
5031 
5032  //the number of cuts that have different coordinates.
5033  mj_part_t different_cut_count = 1;
5034  cut_map[0] = 0;
5035 
5036  //now we insert 1 sort vector for all cuts on the different
5037  //positins.if multiple cuts are on the same position, they share sort vectors.
5038  multiSVector tmpMultiSVector;
5039  sort_vector_points_on_cut.push_back(tmpMultiSVector);
5040 
5041  for (mj_part_t i = 1; i < no_cuts ; ++i){
5042  //if cuts share the same cut coordinates
5043  //set the cutmap accordingly.
5044  if(ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5045  cut_map[i] = cut_map[i-1];
5046  }
5047  else {
5048  cut_map[i] = different_cut_count++;
5049  multiSVector tmp2MultiSVector;
5050  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5051  }
5052  }
5053 
5054 
5055  //now the actual part assigment.
5056  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5057 
5058  mj_lno_t i = this->coordinate_permutations[ii];
5059 
5060  mj_part_t pp = this->assigned_part_ids[i];
5061  mj_part_t p = pp / 2;
5062  //if the coordinate is on a cut.
5063  if(pp % 2 == 1 ){
5064  mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5065  allocated_memory.push_back(vals);
5066 
5067  //we insert the coordinates to the sort item here.
5068  int val_ind = 0;
5069  for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5070  vals[val_ind++] = this->mj_coordinates[dim][i];
5071  }
5072  for(int dim = 0; dim < coordInd; ++dim){
5073  vals[val_ind++] = this->mj_coordinates[dim][i];
5074  }
5075  multiSItem tempSortItem(i, this->coord_dim -1, vals);
5076  //inser the point to the sort vector pointed by the cut_map[p].
5077  mj_part_t cmap = cut_map[p];
5078  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5079  }
5080  else {
5081  //if it is not on the cut, simple sorting.
5082  ++thread_num_points_in_parts[p];
5083  this->assigned_part_ids[i] = p;
5084  }
5085  }
5086 
5087  //sort all the sort vectors.
5088  for (mj_part_t i = 0; i < different_cut_count; ++i){
5089  std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5090  }
5091 
5092  //we do the part assignment for the points on cuts here.
5093  mj_part_t previous_cut_map = cut_map[0];
5094 
5095  //this is how much previous part owns the weight of the current part.
5096  //when target part weight is 1.6, and the part on the left is given 2,
5097  //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
5098  //this parameter is used to balance this issues.
5099  //in the above example weight_stolen_from_previous_part will be 0.4.
5100  //if the left part target is 2.2 but it is given 2,
5101  //then weight_stolen_from_previous_part will be -0.2.
5102  mj_scalar_t weight_stolen_from_previous_part = 0;
5103  for (mj_part_t p = 0; p < no_cuts; ++p){
5104 
5105  mj_part_t mapped_cut = cut_map[p];
5106 
5107  //if previous cut map is done, and it does not have the same index,
5108  //then assign all points left on that cut to its right.
5109  if (previous_cut_map != mapped_cut){
5110  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5111  for (; sort_vector_end >= 0; --sort_vector_end){
5112  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5113  mj_lno_t i = t.index;
5114  ++thread_num_points_in_parts[p];
5115  this->assigned_part_ids[i] = p;
5116  }
5117  sort_vector_points_on_cut[previous_cut_map].clear();
5118  }
5119  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5120 
5121  for (; sort_vector_end >= 0; --sort_vector_end){
5122  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5123  mj_lno_t i = t.index;
5124  mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5125 
5126 
5127  //part p has enough space for point i, then put it to point i.
5128  if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5129  my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5130  > this->sEpsilon){
5131 
5132  my_local_thread_cut_weights_to_put_left[p] -= w;
5133  sort_vector_points_on_cut[mapped_cut].pop_back();
5134  ++thread_num_points_in_parts[p];
5135  this->assigned_part_ids[i] = p;
5136  //if putting this weight to left overweights the left cut, then
5137  //increase the space for the next cut using weight_stolen_from_previous_part.
5138  if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5139  if(mapped_cut == cut_map[p + 1] ){
5140  //if the cut before the cut indexed at p was also at the same position
5141  //special case, as we handle the weight differently here.
5142  if (previous_cut_map != mapped_cut){
5143  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5144  }
5145  else {
5146  //if the cut before the cut indexed at p was also at the same position
5147  //we assign extra weights cumulatively in this case.
5148  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5149  }
5150  }
5151  else{
5152  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5153  }
5154  //end assignment for part p
5155  break;
5156  }
5157  } else {
5158  //if part p does not have enough space for this point
5159  //and if there is another cut sharing the same positon,
5160  //again increase the space for the next
5161  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5162  if (previous_cut_map != mapped_cut){
5163  weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5164  }
5165  else {
5166  weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5167  }
5168  }
5169  else{
5170  weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5171  }
5172  //end assignment for part p
5173  break;
5174  }
5175  }
5176  previous_cut_map = mapped_cut;
5177  }
5178  //put everything left on the last cut to the last part.
5179  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5180  for (; sort_vector_end >= 0; --sort_vector_end){
5181  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5182  mj_lno_t i = t.index;
5183  ++thread_num_points_in_parts[no_cuts];
5184  this->assigned_part_ids[i] = no_cuts;
5185  }
5186  sort_vector_points_on_cut[previous_cut_map].clear();
5187  freeArray<mj_part_t> (cut_map);
5188 
5189  //free the memory allocated for vertex sort items .
5190  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5191  for(mj_lno_t i = 0; i < vSize; ++i){
5192  freeArray<mj_scalar_t> (allocated_memory[i]);
5193  }
5194 
5195  //creation of part_xadj as in usual case.
5196  for(mj_part_t j = 0; j < num_parts; ++j){
5197  mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5198  for (int i = 0; i < this->num_threads; ++i){
5199  mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5200  this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5201  num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5202 
5203  }
5204  out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
5205  }
5206 
5207  //perform prefix sum for num_points in parts.
5208  for(mj_part_t j = 1; j < num_parts; ++j){
5209  out_part_xadj[j] += out_part_xadj[j - 1];
5210  }
5211 
5212 
5213  //shift the num points in threads thread to obtain the
5214  //beginning index of each thread's private space.
5215  for(mj_part_t j = 1; j < num_parts; ++j){
5216  thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5217  }
5218 
5219  //now thread gets the coordinate and writes the index of coordinate to the permutation array
5220  //using the part index we calculated.
5221  for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5222  mj_lno_t i = this->coordinate_permutations[ii];
5223  mj_part_t p = this->assigned_part_ids[i];
5224  this->new_coordinate_permutations[coordinate_begin +
5225  thread_num_points_in_parts[p]++] = i;
5226  }
5227 }
5228 
5229 
5230 
5240 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5241  typename mj_part_t>
5243  mj_part_t current_num_parts,
5244  mj_part_t output_part_begin_index,
5245  RCP<mj_partBoxVector_t> &output_part_boxes,
5246  bool is_data_ever_migrated)
5247 {
5248  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5249 
5250 #ifdef HAVE_ZOLTAN2_OMP
5251 #pragma omp parallel for
5252 #endif
5253  for(mj_part_t i = 0; i < current_num_parts;++i){
5254 
5255  mj_lno_t begin = 0;
5256  mj_lno_t end = this->part_xadj[i];
5257 
5258  if(i > 0) begin = this->part_xadj[i -1];
5259  mj_part_t part_to_set_index = i + output_part_begin_index;
5260  if (this->mj_keep_part_boxes){
5261  (*output_part_boxes)[i].setpId(part_to_set_index);
5262  }
5263  for (mj_lno_t ii = begin; ii < end; ++ii){
5264  mj_lno_t k = this->coordinate_permutations[ii];
5265  this->assigned_part_ids[k] = part_to_set_index;
5266  }
5267  }
5268 
5269  //ArrayRCP<const mj_gno_t> gnoList;
5270  if(!is_data_ever_migrated){
5271  //freeArray<mj_gno_t>(this->current_mj_gnos);
5272  //if(this->num_local_coords > 0){
5273  // gnoList = arcpFromArrayView(this->mj_gnos);
5274  //}
5275  }
5276  else {
5277 #ifdef ENABLE_ZOLTAN_MIGRATION
5278  if (sizeof(mj_lno_t) <= sizeof(int)) {
5279 
5280  // Cannot use Zoltan_Comm with local ordinals larger than ints.
5281  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
5282  // may overflow.
5283 
5284  //if data is migrated, then send part numbers to the original owners.
5285  ZOLTAN_COMM_OBJ *plan = NULL;
5286  MPI_Comm mpi_comm = Teuchos2MPI (this->mj_problemComm);
5287 
5288  int incoming = 0;
5289  int message_tag = 7856;
5290 
5291  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
5292  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
5293  this->owner_of_coordinate, mpi_comm, message_tag,
5294  &incoming);
5295  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5296  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
5297 
5298  mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5299 
5300  message_tag++;
5301  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5302  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
5303  sizeof(mj_gno_t), (char *) incoming_gnos);
5304  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5305 
5306  freeArray<mj_gno_t>(this->current_mj_gnos);
5307  this->current_mj_gnos = incoming_gnos;
5308 
5309  mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5310 
5311  message_tag++;
5312  ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
5313  sizeof(mj_part_t), (char *) incoming_partIds);
5314  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5315  freeArray<mj_part_t>(this->assigned_part_ids);
5316  this->assigned_part_ids = incoming_partIds;
5317 
5318  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
5319  ierr = Zoltan_Comm_Destroy(&plan);
5320  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
5321 
5322  this->num_local_coords = incoming;
5323  //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
5324  }
5325  else
5326 
5327 #endif // !ENABLE_ZOLTAN_MIGRATION
5328  {
5329  //if data is migrated, then send part numbers to the original owners.
5330  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
5331  Tpetra::Distributor distributor(this->mj_problemComm);
5332  ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5333  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5334  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
5335 
5336  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5337  //migrate gnos to actual owners.
5338  ArrayRCP<mj_gno_t> received_gnos(incoming);
5339  ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5340  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5341  freeArray<mj_gno_t>(this->current_mj_gnos);
5342  this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5343  memcpy( this->current_mj_gnos,
5344  received_gnos.getRawPtr(),
5345  incoming * sizeof(mj_gno_t));
5346 
5347  //migrate part ids to actual owners.
5348  ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5349  ArrayRCP<mj_part_t> received_partids(incoming);
5350  distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5351  freeArray<mj_part_t>(this->assigned_part_ids);
5352  this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5353  memcpy( this->assigned_part_ids,
5354  received_partids.getRawPtr(),
5355  incoming * sizeof(mj_part_t));
5356 
5357  this->num_local_coords = incoming;
5358  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
5359 
5360  }
5361  }
5362 
5363  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
5364 
5365  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5366 
5367  //ArrayRCP<mj_part_t> partId;
5368  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
5369 
5370  if (this->mj_keep_part_boxes){
5371  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
5372 
5373  }
5374 
5375  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
5376 }
5377 
5380 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5381  typename mj_part_t>
5383  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");
5384 
5385  for (int i=0; i < this->coord_dim; i++){
5386  freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5387  }
5388  freeArray<mj_scalar_t *>(this->mj_coordinates);
5389 
5390  for (int i=0; i < this->num_weights_per_coord; i++){
5391  freeArray<mj_scalar_t>(this->mj_weights[i]);
5392  }
5393  freeArray<mj_scalar_t *>(this->mj_weights);
5394 
5395  freeArray<int>(this->owner_of_coordinate);
5396 
5397  for(int i = 0; i < this->num_threads; ++i){
5398  freeArray<mj_lno_t>(this->thread_point_counts[i]);
5399  }
5400 
5401  freeArray<mj_lno_t *>(this->thread_point_counts);
5402  freeArray<double *> (this->thread_part_weight_work);
5403 
5404  if(this->distribute_points_on_cut_lines){
5405  freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5406  for(int i = 0; i < this->num_threads; ++i){
5407  freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5408  }
5409  freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5410  freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5411  freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5412  }
5413 
5414  freeArray<mj_part_t>(this->my_incomplete_cut_count);
5415 
5416  freeArray<mj_scalar_t>(this->max_min_coords);
5417 
5418  freeArray<mj_lno_t>(this->new_part_xadj);
5419 
5420  freeArray<mj_lno_t>(this->coordinate_permutations);
5421 
5422  freeArray<mj_lno_t>(this->new_coordinate_permutations);
5423 
5424  freeArray<mj_scalar_t>(this->all_cut_coordinates);
5425 
5426  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5427 
5428  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5429 
5430  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5431 
5432  freeArray<mj_scalar_t>(this->target_part_weights);
5433 
5434  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5435 
5436  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5437 
5438  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5439  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5440  freeArray<bool>(this->is_cut_line_determined);
5441  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5442  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5443 
5444  for(int i = 0; i < this->num_threads; ++i){
5445  freeArray<double>(this->thread_part_weights[i]);
5446  freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5447  freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5448  }
5449 
5450  freeArray<double *>(this->thread_part_weights);
5451  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5452  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5453 
5454  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
5455 }
5456 
/// \brief Stores the tuning knobs for a subsequent MJ run.
/// Simple member-wise setter; no validation is performed here.
/// NOTE(review): the function-name line (original line 5466) is not visible
/// in this rendering — presumably AlgMJ::set_partitioning_parameters; the
/// parameters below match the members assigned in the body.
/// \param distribute_points_on_cut_lines_  whether coordinates lying exactly
///        on a cut line may be split between the two neighboring parts.
/// \param max_concurrent_part_calculation_  how many parts are partitioned
///        concurrently in one 1-D pass.
/// \param check_migrate_avoid_migration_option_  migration policy flag
///        (see check_migrate_avoid_migration_option member).
/// \param minimum_migration_imbalance_  imbalance threshold below which
///        migration is not attempted.
5464 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5465  typename mj_part_t>
5467  bool distribute_points_on_cut_lines_,
5468  int max_concurrent_part_calculation_,
5469  int check_migrate_avoid_migration_option_,
5470  mj_scalar_t minimum_migration_imbalance_){
5471  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5472  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5473  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5474  this->minimum_migration_imbalance = minimum_migration_imbalance_;
5475 
5476 }
5477 
5478 
/// \brief Top-level driver of the Multi-Jagged (MJ) partitioning.
/// Runs `recursion_depth` rounds; each round does a concurrent 1-D
/// partitioning along one coordinate dimension (dimension = round index
/// modulo coord_dim), optionally migrates the data, and feeds the resulting
/// parts into the next round.  On return, result_assigned_part_ids_ and
/// result_mj_gnos_ point to parallel arrays: the final part id of each
/// (possibly migrated) coordinate and its global id.
/// NOTE(review): the function-name line (original line 5509) is not visible
/// in this rendering — presumably AlgMJ::multi_jagged_part; confirm against
/// the actual header.
5507 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5508  typename mj_part_t>
5510 
5511  const RCP<const Environment> &env,
5512  RCP<Comm<int> > &problemComm,
5513 
5514  double imbalance_tolerance_,
5515  size_t num_global_parts_,
5516  mj_part_t *part_no_array_,
5517  int recursion_depth_,
5518 
5519  int coord_dim_,
5520  mj_lno_t num_local_coords_,
5521  mj_gno_t num_global_coords_,
5522  const mj_gno_t *initial_mj_gnos_,
5523  mj_scalar_t **mj_coordinates_,
5524 
5525  int num_weights_per_coord_,
5526  bool *mj_uniform_weights_,
5527  mj_scalar_t **mj_weights_,
5528  bool *mj_uniform_parts_,
5529  mj_scalar_t **mj_part_sizes_,
5530 
5531  mj_part_t *&result_assigned_part_ids_,
5532  mj_gno_t *&result_mj_gnos_
5533 )
5534 {
5535 
5536 #ifdef print_debug
5537  if(comm->getRank() == 0){
5538  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
5539  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
5540  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
5541  }
5542 #endif
5543 
// Cache the environment and communicator; rank is used for debug output and
// (later) for migration decisions.
5544  this->mj_env = env;
5545  this->mj_problemComm = problemComm;
5546  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5547 
5548  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
5549  this->mj_env->debug(3, "In MultiJagged Jagged");
5550 
// Copy the caller-supplied sizes and raw pointers into member state.  The
// pointers are stored as-is here; the comments below note which ones are
// deep-copied later by allocate_set_work_memory.
5551  {
5552  this->imbalance_tolerance = imbalance_tolerance_;
5553  this->num_global_parts = num_global_parts_;
5554  this->part_no_array = part_no_array_;
5555  this->recursion_depth = recursion_depth_;
5556 
5557  this->coord_dim = coord_dim_;
5558  this->num_local_coords = num_local_coords_;
5559  this->num_global_coords = num_global_coords_;
5560  this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
5561  this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
5562 
5563  this->num_weights_per_coord = num_weights_per_coord_;
5564  this->mj_uniform_weights = mj_uniform_weights_;
5565  this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
5566  this->mj_uniform_parts = mj_uniform_parts_;
5567  this->mj_part_sizes = mj_part_sizes_;
5568 
// Thread count defaults to 1; with OpenMP it is taken from the runtime so
// the per-thread work arrays are sized correctly.
5569  this->num_threads = 1;
5570 #ifdef HAVE_ZOLTAN2_OMP
5571 #pragma omp parallel
5572 
5573  {
5574  this->num_threads = omp_get_num_threads();
5575  }
5576 #endif
5577  }
5578  //this->set_input_data();
5579  this->set_part_specifications();
5580 
5581  this->allocate_set_work_memory();
5582 
5583  //We duplicate the comm as we create subcommunicators during migration.
5584  //We keep the problemComm as it is, while comm changes after each migration.
5585  this->comm = this->mj_problemComm->duplicate();
5586 
5587  //initially there is a single partition
5588  mj_part_t current_num_parts = 1;
5589  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5590 
5591  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
5592 
5593  mj_part_t output_part_begin_index = 0;
5594  mj_part_t future_num_parts = this->total_num_part;
5595  bool is_data_ever_migrated = false;
5596 
// Two vectors are swapped each round: how many final parts each current
// part must still be split into (input), and the same for the parts
// produced this round (output).
5597  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
5598  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
5599  next_future_num_parts_in_parts->push_back(this->num_global_parts);
5600 
5601  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
5602  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
5603 
5604  compute_global_box();
5605  if(this->mj_keep_part_boxes){
5606  this->init_part_boxes(output_part_boxes);
5607  }
5608 
// Main loop: one 1-D partitioning per recursion level.
5609  for (int i = 0; i < this->recursion_depth; ++i){
5610  //partitioning array. size will be as the number of current partitions and this
5611  //holds how many parts that each part will be in the current dimension partitioning.
5612  std::vector <mj_part_t> num_partitioning_in_current_dim;
5613 
5614  //number of parts that will be obtained at the end of this partitioning.
5615  //future_num_part_in_parts is as the size of current number of parts.
5616  //holds how many more parts each should be divided in the further
5617  //iterations. this will be used to calculate num_partitioning_in_current_dim,
5618  //as the number of parts that the part will be partitioned
5619  //in the current dimension partitioning.
5620 
5621  //next_future_num_parts_in_parts will be as the size of outnumParts,
5622  //and this will hold how many more parts that each output part
5623  //should be divided. this array will also be used to determine the weight ratios
5624  //of the parts.
5625  //swap the arrays to use iteratively..
5626  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5627  future_num_part_in_parts = next_future_num_parts_in_parts;
5628  next_future_num_parts_in_parts = tmpPartVect;
5629 
5630  //clear next_future_num_parts_in_parts array as
5631  //getPartitionArrays expects it to be empty.
5632  //it also expects num_partitioning_in_current_dim to be empty as well.
5633  next_future_num_parts_in_parts->clear();
5634 
5635  if(this->mj_keep_part_boxes){
5636  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5637  input_part_boxes = output_part_boxes;
5638  output_part_boxes = tmpPartBoxes;
5639  output_part_boxes->clear();
5640  }
5641 
5642  //returns the total no. of output parts for this dimension partitioning.
5643  mj_part_t output_part_count_in_dimension =
5644  this->update_part_num_arrays(
5645  num_partitioning_in_current_dim,
5646  future_num_part_in_parts,
5647  next_future_num_parts_in_parts,
5648  future_num_parts,
5649  current_num_parts,
5650  i,
5651  input_part_boxes,
5652  output_part_boxes);
5653 
5654  //if the number of obtained parts equal to current number of parts,
5655  //skip this dimension. For example, this happens when 1 is given in the input
5656  //part array is given. P=4,5,1,2
5657  if(output_part_count_in_dimension == current_num_parts) {
5658  //still need to swap the input output arrays.
5659  tmpPartVect= future_num_part_in_parts;
5660  future_num_part_in_parts = next_future_num_parts_in_parts;
5661  next_future_num_parts_in_parts = tmpPartVect;
5662 
5663  if(this->mj_keep_part_boxes){
5664  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5665  input_part_boxes = output_part_boxes;
5666  output_part_boxes = tmpPartBoxes;
5667  }
5668  continue;
5669  }
5670 
5671 
5672  //get the coordinate axis along which the partitioning will be done.
5673  int coordInd = i % this->coord_dim;
5674  mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5675 
5676  //convert i to string to be used for debugging purposes.
5677  std::string istring = toString<int>(i);
5678  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
5679 
5680  //alloc Memory to point the indices
5681  //of the parts in the permutation array.
5682  this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5683 
5684  //the index where in the new_part_xadj will be written.
5685  mj_part_t output_part_index = 0;
5686  //whatever is written to output_part_index will be added with putput_coordinate_end_index
5687  //so that the points will be shifted.
5688  mj_part_t output_coordinate_end_index = 0;
5689 
5690  mj_part_t current_work_part = 0;
5691  mj_part_t current_concurrent_num_parts =
5692  std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5693 
5694  mj_part_t obtained_part_index = 0;
5695 
// Process the current parts in batches of at most
// max_concurrent_part_calculation parts per iteration.
5696  //run for all available parts.
5697  for (; current_work_part < current_num_parts;
5698  current_work_part += current_concurrent_num_parts){
5699 
5700  current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5701  this->max_concurrent_part_calculation);
5702 
5703  mj_part_t actual_work_part_count = 0;
5704  //initialization for 1D partitioning.
5705  //get the min and max coordinates of each part
5706  //together with the part weights of each part.
5707  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5708  mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5709 
5710  //if this part wont be partitioned any further
5711  //dont do any work for this part.
5712  if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5713  continue;
5714  }
5715  ++actual_work_part_count;
// part_xadj is a prefix-sum layout: part k's coordinates occupy
// [part_xadj[k-1], part_xadj[k]) in the permutation array (0 start for k==0).
5716  mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5717  mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5718 
5719 /*
5720  cout << "i:" << i << " j:" << current_work_part + kk
5721  << " coordinate_begin_index:" << coordinate_begin_index
5722  << " coordinate_end_index:" << coordinate_end_index
5723  << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
5724  */
// min/max/total-weight for each concurrent part are packed into one array
// at strides of current_concurrent_num_parts so one reduction covers all.
5725  this->mj_get_local_min_max_coord_totW(
5726  coordinate_begin_index,
5727  coordinate_end_index,
5728  this->coordinate_permutations,
5729  mj_current_dim_coords,
5730  this->process_local_min_max_coord_total_weight[kk], //min_coordinate
5731  this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
5732  this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
5733 
5734  }
5735 
5736  //1D partitioning
5737  if (actual_work_part_count > 0){
5738  //obtain global Min max of the part.
5739  this->mj_get_global_min_max_coord_totW(
5740  current_concurrent_num_parts,
5741  this->process_local_min_max_coord_total_weight,
5742  this->global_min_max_coord_total_weight);
5743 
5744  //represents the total number of cutlines
5745  //whose coordinate should be determined.
5746  mj_part_t total_incomplete_cut_count = 0;
5747 
5748  //Compute weight ratios for parts & cuts:
5749  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
5750  //part0 cut0 part1 cut1 part2 cut2 part3
5751  mj_part_t concurrent_part_cut_shift = 0;
5752  mj_part_t concurrent_part_part_shift = 0;
5753  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5754  mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5755  mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
5756  current_concurrent_num_parts];
5757 
5758  mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
5759  2 * current_concurrent_num_parts];
5760 
5761  mj_part_t concurrent_current_part_index = current_work_part + kk;
5762 
5763  mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
5764 
5765  mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
5766  mj_scalar_t *current_target_part_weights = this->target_part_weights +
5767  concurrent_part_part_shift;
5768  //shift the usedCutCoordinate array as noCuts.
5769  concurrent_part_cut_shift += partition_count - 1;
5770  //shift the partRatio array as noParts.
5771  concurrent_part_part_shift += partition_count;
5772 
5773 
5774  //calculate only if part is not empty,
5775  //and part will be further partitioned.
5776  if(partition_count > 1 && min_coordinate <= max_coordinate){
5777 
5778  //increase num_cuts_do_be_determined by the number of cuts of the current
5779  //part's cut line number.
5780  total_incomplete_cut_count += partition_count - 1;
5781  //set the number of cut lines that should be determined
5782  //for this part.
5783  this->my_incomplete_cut_count[kk] = partition_count - 1;
5784 
5785  //get the target weights of the parts.
5786  this->mj_get_initial_cut_coords_target_weights(
5787  min_coordinate,
5788  max_coordinate,
5789  partition_count - 1,
5790  global_total_weight,
5791  usedCutCoordinate,
5792  current_target_part_weights,
5793  future_num_part_in_parts,
5794  next_future_num_parts_in_parts,
5795  concurrent_current_part_index,
5796  obtained_part_index);
5797 
5798  mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
5799  mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
5799 
5800 
5801  //get the initial estimated part assignments of the
5802  //coordinates.
5803  this->set_initial_coordinate_parts(
5804  max_coordinate,
5805  min_coordinate,
5806  concurrent_current_part_index,
5807  coordinate_begin_index, coordinate_end_index,
5808  this->coordinate_permutations,
5809  mj_current_dim_coords,
5810  this->assigned_part_ids,
5811  partition_count);
5812  }
5813  else {
5814  // e.g., if have fewer coordinates than parts, don't need to do next dim.
5815  this->my_incomplete_cut_count[kk] = 0;
5816  }
5817  obtained_part_index += partition_count;
5818  }
5819 
5820 
5821 
5822  //used imbalance, it is always 0, as it is difficult to
5823  //estimate a range.
5824  mj_scalar_t used_imbalance = 0;
5825 
5826 
5827  // Determine cut lines for all concurrent parts parts here.
5828  this->mj_1D_part(
5829  mj_current_dim_coords,
5830  used_imbalance,
5831  current_work_part,
5832  current_concurrent_num_parts,
5833  current_cut_coordinates,
5834  total_incomplete_cut_count,
5835  num_partitioning_in_current_dim);
5836  }
5837 
5838  //create new part chunks
5839  {
5840  mj_part_t output_array_shift = 0;
5841  mj_part_t cut_shift = 0;
5842  size_t tlr_shift = 0;
5843  size_t partweight_array_shift = 0;
5844 
5845  for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
5846  mj_part_t current_concurrent_work_part = current_work_part + kk;
5847  mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
5848 
// An "empty" part is detected by global min > global max (no coordinates
// contributed to the reduction).
5849  //if the part is empty, skip the part.
5850  if((num_parts != 1 )
5851  &&
5852  this->global_min_max_coord_total_weight[kk] >
5853  this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
5854 
5855  //we still need to write the begin and end point of the
5856  //empty part. simply set it zero, the array indices will be shifted later.
5857  for(mj_part_t jj = 0; jj < num_parts; ++jj){
5858  this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
5859  }
5860  cut_shift += num_parts - 1;
5861  tlr_shift += (4 *(num_parts - 1) + 1);
5862  output_array_shift += num_parts;
5863  partweight_array_shift += (2 * (num_parts - 1) + 1);
5864  continue;
5865  }
5866 
5867  mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
5868  mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
5869  current_concurrent_work_part -1];
5870  mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
5871  mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
5872  cut_shift;
5873 
5874  //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;
5875 
5876  for(int ii = 0; ii < this->num_threads; ++ii){
5877  this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
5878  }
5879 
5880  if(num_parts > 1){
5881  if(this->mj_keep_part_boxes){
5882  //if part boxes are to be stored update the boundaries.
5883  for (mj_part_t j = 0; j < num_parts - 1; ++j){
5884  (*output_part_boxes)[output_array_shift + output_part_index +
5885  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
5886  /*update max*/, coordInd);
5887 
5888  (*output_part_boxes)[output_array_shift + output_part_index + j +
5889  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
5890  /*update min*/, coordInd);
5891  }
5892  }
5893 
5894  // Rewrite the indices based on the computed cuts.
5895  this->mj_create_new_partitions(
5896  num_parts,
5897  mj_current_dim_coords,
5898  current_concurrent_cut_coordinate,
5899  coordinate_begin,
5900  coordinate_end,
5901  used_local_cut_line_weight_to_left,
5902  this->thread_part_weight_work,
5903  this->new_part_xadj + output_part_index + output_array_shift
5904  );
5905 
5906  }
5907  else {
5908  //if this part is partitioned into 1 then just copy
5909  //the old values.
5910  mj_lno_t part_size = coordinate_end - coordinate_begin;
5911  *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
5912  memcpy(
5913  this->new_coordinate_permutations + coordinate_begin,
5914  this->coordinate_permutations + coordinate_begin,
5915  part_size * sizeof(mj_lno_t));
5916  }
5917  cut_shift += num_parts - 1;
5918  tlr_shift += (4 *(num_parts - 1) + 1);
5919  output_array_shift += num_parts;
5920  partweight_array_shift += (2 * (num_parts - 1) + 1);
5921  }
5922 
5923  //shift cut coordinates so that all cut coordinates are stored.
5924  //no shift now because we dont keep the cuts.
5925  //current_cut_coordinates += cut_shift;
5926 
5927  //mj_create_new_partitions from coordinates partitioned the parts and
5928  //write the indices as if there were a single part.
5929  //now we need to shift the beginning indices.
5930  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
5931  mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
5932  for (mj_part_t ii = 0;ii < num_parts ; ++ii){
5933  //shift it by previousCount
5934  this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
5935  }
5936  //increase the previous count by current end.
5937  output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
5938  //increase the current out.
5939  output_part_index += num_parts ;
5940  }
5941  }
5942  }
5943  // end of this partitioning dimension
5944 
5945 
5946  int current_world_size = this->comm->getSize();
5947  long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
5948 
5949 
5950  bool is_migrated_in_current_dimension = false;
5951 
5952  //we migrate if there are more partitionings to be done after this step
5953  //and if the migration is not forced to be avoided.
5954  //and the operation is not sequential.
5955  if (future_num_parts > 1 &&
5956  this->check_migrate_avoid_migration_option >= 0 &&
5957  current_world_size > 1){
5958 
5959  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
5960  mj_part_t num_parts = output_part_count_in_dimension;
// mj_perform_migration returns true only when migration actually happened;
// it also updates current_num_parts and next_future_num_parts_in_parts.
5961  if ( this->mj_perform_migration(
5962  num_parts,
5963  current_num_parts, //output
5964  next_future_num_parts_in_parts, //output
5965  output_part_begin_index,
5966  migration_reduce_all_population,
5967  this->num_local_coords / (future_num_parts * current_num_parts),
5968  istring,
5969  input_part_boxes, output_part_boxes) ) {
5970  is_migrated_in_current_dimension = true;
5971  is_data_ever_migrated = true;
5972  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
5973  istring);
5974  //since data is migrated, we reduce the number of reduceAll operations for the last part.
5975  this->total_dim_num_reduce_all /= num_parts;
5976  }
5977  else {
5978  is_migrated_in_current_dimension = false;
5979  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
5980  }
5981  }
5982 
5983  //swap the coordinate permutations for the next dimension.
5984  mj_lno_t * tmp = this->coordinate_permutations;
5985  this->coordinate_permutations = this->new_coordinate_permutations;
5986  this->new_coordinate_permutations = tmp;
5987 
5988  if(!is_migrated_in_current_dimension){
5989  this->total_dim_num_reduce_all -= current_num_parts;
5990  current_num_parts = output_part_count_in_dimension;
5991  }
// The old part_xadj is replaced by the freshly built new_part_xadj for the
// next round.
5992  freeArray<mj_lno_t>(this->part_xadj);
5993  this->part_xadj = this->new_part_xadj;
5994 
5995  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
5996  }
5997 
5998  // Partitioning is done
5999  delete future_num_part_in_parts;
6000  delete next_future_num_parts_in_parts;
6001 
6002  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
6004 
6005 
6006  //get the final parts of each initial coordinate
6007  //the results will be written to
6008  //this->assigned_part_ids for gnos given in this->current_mj_gnos
6009  this->set_final_parts(
6010  current_num_parts,
6011  output_part_begin_index,
6012  output_part_boxes,
6013  is_data_ever_migrated);
6014 
// Hand the result arrays back to the caller; ownership transfer is implied
// by the reference-to-pointer out-parameters (caller is expected to free —
// see the delete[] calls in Zoltan2_AlgMJ::partition).
6015  result_assigned_part_ids_ = this->assigned_part_ids;
6016  result_mj_gnos_ = this->current_mj_gnos;
6017 
6018  this->free_work_memory();
6019  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
6020  this->mj_env->debug(3, "Out of MultiJagged");
6021 
6022 }
6023 
6024 
/// \brief Zoltan2 Algorithm interface wrapper around the core MJ
/// partitioner.  Extracts coordinates/weights from the CoordinateModel,
/// translates parameter-list options into MJ settings, runs the
/// partitioner, and writes the part assignment back into the
/// PartitioningSolution.
6028 template <typename Adapter>
6029 class Zoltan2_AlgMJ : public Algorithm<Adapter>
6030 {
6031 private:
6032 
6033 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6034 
6035  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
6036  typedef typename Adapter::scalar_t mj_scalar_t;
6037  typedef typename Adapter::gno_t mj_gno_t;
6038  typedef typename Adapter::lno_t mj_lno_t;
6039  typedef typename Adapter::node_t mj_node_t;
6040  typedef typename Adapter::part_t mj_part_t;
// NOTE(review): the typedef of mj_partBox_t (original line 6041) is not
// visible in this rendering — confirm its definition in the actual header.
6042  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6043 #endif
6045 
6046  RCP<const Environment> mj_env; //the environment object
6047  RCP<Comm<int> > mj_problemComm; //initial comm object
6048  RCP<const coordinateModel_t> mj_coords; //coordinate adapter
6049 
6050  //PARAMETERS
6051  double imbalance_tolerance; //input imbalance tolerance.
6052  size_t num_global_parts; //the targeted number of parts
6053  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
6054  int recursion_depth; //the number of steps that partitioning will be solved in.
6055 
6056  int coord_dim; // coordinate dimension.
6057  mj_lno_t num_local_coords; //number of local coords.
6058  mj_gno_t num_global_coords; //number of global coords.
6059  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
6060  mj_scalar_t **mj_coordinates; //two dimension coordinate array
6061 
6062  int num_weights_per_coord; // number of weights per coordinate
6063  bool *mj_uniform_weights; //if the coordinates have uniform weights.
6064  mj_scalar_t **mj_weights; //two dimensional weight array
6065  bool *mj_uniform_parts; //if the target parts are uniform
6066  mj_scalar_t **mj_part_sizes; //target part weight sizes.
6067 
6068  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
6069  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
6070  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
6071  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
6072  int mj_keep_part_boxes; //if the boxes need to be kept.
6073 
6074  int num_threads;
6075 
6076  int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
6077 
6078  ArrayRCP<mj_part_t> comXAdj_; //communication graph xadj
6079  ArrayRCP<mj_part_t> comAdj_; //communication graph adj.
6080 
6081 
6082  //when we have strided data, it returns a unstrided data in RCP form.
6083  //we need to hold on to that data, during the execution of mj, so that the data is not released.
6084  //coordinate_rcp_holder will hold that data, and release it when MJ is deleted.
6085  ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6086 
6087  void set_up_partitioning_data(
6088  const RCP<PartitioningSolution<Adapter> >&solution);
6089 
6090  void set_input_parameters(const Teuchos::ParameterList &p);
6091 
6092  void free_work_memory();
6093 
6094  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
6095 
6096 public:
6097 
// Constructor: stores the environment/communicator/coordinate model and
// initializes every parameter to its documented default.
6098  Zoltan2_AlgMJ(const RCP<const Environment> &env,
6099  RCP<Comm<int> > &problemComm,
6100  const RCP<const coordinateModel_t> &coords) :
6101  mj_partitioner(), mj_env(env),
6102  mj_problemComm(problemComm),
6103  mj_coords(coords),
6104  imbalance_tolerance(0),
6105  num_global_parts(1), part_no_array(NULL),
6106  recursion_depth(0),
6107  coord_dim(0),num_local_coords(0), num_global_coords(0),
6108  initial_mj_gnos(NULL), mj_coordinates(NULL),
6109  num_weights_per_coord(0),
6110  mj_uniform_weights(NULL), mj_weights(NULL),
6111  mj_uniform_parts(NULL),
6112  mj_part_sizes(NULL),
6113  distribute_points_on_cut_lines(true),
6114  max_concurrent_part_calculation(1),
6115  check_migrate_avoid_migration_option(0),
6116  minimum_migration_imbalance(0.30),
6117  mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0),
6118  comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6119  {}
// Destructor body (its declaration line, original 6120, is not visible in
// this rendering): releases the ArrayRCP holder that keeps the unstrided
// coordinate/weight data alive for the lifetime of this object.
6121  if (coordinate_ArrayRCP_holder != NULL){
6122  delete [] this->coordinate_ArrayRCP_holder;
6123  this->coordinate_ArrayRCP_holder = NULL;
6124  }
6125  }
6126 
6133  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
6134 
// Returns a view of the computed part bounding boxes; only meaningful when
// mj_keep_part_boxes was enabled.
6135  mj_partBoxVector_t &getPartBoxesView() const
6136  {
6137  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6138  return *pBoxes;
6139  }
6140 
6141  mj_part_t pointAssign(int dim, mj_scalar_t *point) const;
6142 
6143  void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6144  size_t &nPartsFound, mj_part_t **partsFound) const;
6145 
6146 
6149  void getCommunicationGraph(
6150  const PartitioningSolution<Adapter> *solution,
6151  ArrayRCP<mj_part_t> &comXAdj,
6152  ArrayRCP<mj_part_t> &comAdj);
6153 };
6154 
6155 
/// \brief Runs MJ for the given PartitioningSolution.
/// Sets up input data and parameters, invokes the core partitioner, then
/// reorders the (possibly migrated) results back to the original local
/// coordinate order before writing them into the solution.
/// NOTE(review): the function-name line (original line 6166) is not visible
/// in this rendering — presumably Zoltan2_AlgMJ<Adapter>::partition.
6165 template <typename Adapter>
6167  const RCP<PartitioningSolution<Adapter> > &solution
6168 )
6169 {
6170  this->set_up_partitioning_data(solution);
6171  this->set_input_parameters(this->mj_env->getParameters());
6172  if (this->mj_keep_part_boxes){
6173  this->mj_partitioner.set_to_keep_part_boxes();
6174  }
6175  this->mj_partitioner.set_partitioning_parameters(
6176  this->distribute_points_on_cut_lines,
6177  this->max_concurrent_part_calculation,
6178  this->check_migrate_avoid_migration_option,
6179  this->minimum_migration_imbalance);
6180 
// multi_jagged_part fills these two parallel arrays: part id and global id
// per (possibly migrated) coordinate.
6181  mj_part_t *result_assigned_part_ids = NULL;
6182  mj_gno_t *result_mj_gnos = NULL;
6183  this->mj_partitioner.multi_jagged_part(
6184  this->mj_env,
6185  this->mj_problemComm,
6186 
6187  this->imbalance_tolerance,
6188  this->num_global_parts,
6189  this->part_no_array,
6190  this->recursion_depth,
6191 
6192  this->coord_dim,
6193  this->num_local_coords,
6194  this->num_global_coords,
6195  this->initial_mj_gnos,
6196  this->mj_coordinates,
6197 
6198  this->num_weights_per_coord,
6199  this->mj_uniform_weights,
6200  this->mj_weights,
6201  this->mj_uniform_parts,
6202  this->mj_part_sizes,
6203 
6204  result_assigned_part_ids,
6205  result_mj_gnos
6206  );
6207 
// Build a gno -> original-local-index map, then scatter the returned part
// ids into original input order.  Uses unordered_map when C++11 is
// available, Teuchos::Hashtable otherwise (identical semantics).
6208  // Reorder results so that they match the order of the input
6209 #if defined(__cplusplus) && __cplusplus >= 201103L
6210  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6211  localGidToLid.reserve(this->num_local_coords);
6212  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6213  localGidToLid[this->initial_mj_gnos[i]] = i;
6214 
6215  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6216  0, this->num_local_coords, true);
6217 
6218  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6219  mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6220  partId[origLID] = result_assigned_part_ids[i];
6221  }
6222 
6223 #else
6224  Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6225  localGidToLid(this->num_local_coords);
6226  for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6227  localGidToLid.put(this->initial_mj_gnos[i], i);
6228 
6229  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[this->num_local_coords],
6230  0, this->num_local_coords, true);
6231 
6232  for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6233  mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6234  partId[origLID] = result_assigned_part_ids[i];
6235  }
6236 
6237 #endif // C++11 is enabled
6238 
// These arrays were handed over by multi_jagged_part; this caller owns and
// releases them once their contents have been copied into partId.
6239  delete [] result_mj_gnos;
6240  delete [] result_assigned_part_ids;
6241 
6242  solution->setParts(partId);
6243  this->free_work_memory();
6244 }
6245 
6246 /* \brief Freeing the memory allocated.
6247  * */
// Frees only the top-level pointer arrays allocated in
// set_up_partitioning_data; the underlying coordinate/weight buffers are
// kept alive (and later released) by coordinate_ArrayRCP_holder.
// NOTE(review): the function-name line (original line 6249) is not visible
// in this rendering — presumably Zoltan2_AlgMJ<Adapter>::free_work_memory.
6248 template <typename Adapter>
6250  freeArray<mj_scalar_t *>(this->mj_coordinates);
6251  freeArray<mj_scalar_t *>(this->mj_weights);
6252  freeArray<bool>(this->mj_uniform_parts);
6253  freeArray<mj_scalar_t *>(this->mj_part_sizes);
6254  freeArray<bool>(this->mj_uniform_weights);
6255 
6256 }
6257 
6258 /* \brief Sets the partitioning data for multijagged algorithm.
6259  * */
/// \brief Pulls coordinates, weights, gnos and part-size information out of
/// the CoordinateModel / PartitioningSolution into the raw arrays the core
/// MJ partitioner consumes.
/// NOTE(review): the function-name line (original line 6261) is not visible
/// in this rendering — presumably
/// Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data.
6260 template <typename Adapter>
6262  const RCP<PartitioningSolution<Adapter> > &solution
6263 )
6264 {
6265  this->coord_dim = this->mj_coords->getCoordinateDim();
6266  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6267  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6268  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
// Even with zero weights, one criteria slot is needed for the uniform-weight
// bookkeeping arrays below.
6269  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6270 
6271  // From the Solution we get part information.
6272  // If the part sizes for a given criteria are not uniform,
6273  // then they are values that sum to 1.0.
6274  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6275  //allocate only two dimensional pointer.
6276  //raw pointer addresess will be obtained from multivector.
6277  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6278  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6279 
6280  //if the partitioning results are to be uniform.
6281  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6282  //if in a criteria dimension, uniform part is false this shows ratios of
6283  //the target part weights.
6284  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6285  //if the weights of coordinates are uniform in a criteria dimension.
6286  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6287 
6288  typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
6289  ArrayView<const mj_gno_t> gnos;
6290  ArrayView<input_t> xyz;
6291  ArrayView<input_t> wgts;
6292 
6293 
// One ArrayRCP slot per coordinate dimension plus one per weight dimension;
// holding these keeps the (possibly unstrided-copy) data alive until the
// Zoltan2_AlgMJ destructor runs.
6294  this->coordinate_ArrayRCP_holder = new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6295 
6296  this->mj_coords->getCoordinates(gnos, xyz, wgts);
6297  //obtain global ids.
6298  ArrayView<const mj_gno_t> mj_gnos = gnos;
6299  this->initial_mj_gnos = mj_gnos.getRawPtr();
6300 
6301  //extract coordinates from multivector.
6302  for (int dim=0; dim < this->coord_dim; dim++){
6303  ArrayRCP<const mj_scalar_t> ar;
6304  xyz[dim].getInputArray(ar);
6305  this->coordinate_ArrayRCP_holder[dim] = ar;
6306 
6307  //multiJagged coordinate values assignment
6308  this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6309  }
6310 
6311  //if no weights are provided set uniform weight.
6312  if (this->num_weights_per_coord == 0){
6313  this->mj_uniform_weights[0] = true;
6314  this->mj_weights[0] = NULL;
6315  }
6316  else{
6317  //if weights are provided get weights for all weight indices
6318  for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6319  ArrayRCP<const mj_scalar_t> ar;
6320  wgts[wdim].getInputArray(ar);
6321  this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6322  this->mj_uniform_weights[wdim] = false;
6323  this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6324  }
6325  }
6326 
// MJ only supports uniform target part sizes; any non-uniform request is a
// hard error (process exits).
6327  for (int wdim = 0; wdim < criteria_dim; wdim++){
6328  if (solution->criteriaHasUniformPartSizes(wdim)){
6329  this->mj_uniform_parts[wdim] = true;
6330  this->mj_part_sizes[wdim] = NULL;
6331  }
6332  else{
6333  std::cerr << "MJ does not support non uniform target part weights" << std::endl;
6334  exit(1);
6335  }
6336  }
6337 }
6338 
/** \brief Sets the partitioning parameters for the multijagged algorithm.
 *  \param pl the parameter list provided to the Zoltan2 call.
 */
6342 template <typename Adapter>
6343 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){
6344 
6345  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
6346  if (pe){
6347  double tol;
6348  tol = pe->getValue(&tol);
6349  this->imbalance_tolerance = tol - 1.0;
6350  }
6351 
6352  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
6353  if (this->imbalance_tolerance <= 0)
6354  this->imbalance_tolerance= 10e-4;
6355 
6356  //if an input partitioning array is provided.
6357  this->part_no_array = NULL;
6358  //the length of the input partitioning array.
6359  this->recursion_depth = 0;
6360 
6361  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
6362  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
6363  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
6364  this->mj_env->debug(2, "mj_parts provided by user");
6365  }
6366 
6367  //get mj specific parameters.
6368  this->distribute_points_on_cut_lines = true;
6369  this->max_concurrent_part_calculation = 1;
6370 
6371  this->mj_run_as_rcb = 0;
6372  int mj_user_recursion_depth = -1;
6373  this->mj_keep_part_boxes = 0;
6374  this->check_migrate_avoid_migration_option = 0;
6375  this->minimum_migration_imbalance = 0.35;
6376 
6377  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
6378  if (pe){
6379  double imb;
6380  imb = pe->getValue(&imb);
6381  this->minimum_migration_imbalance = imb - 1.0;
6382  }
6383 
6384  pe = pl.getEntryPtr("mj_migration_option");
6385  if (pe){
6386  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6387  }else {
6388  this->check_migrate_avoid_migration_option = 0;
6389  }
6390  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6391 
6392 
6393  pe = pl.getEntryPtr("mj_concurrent_part_count");
6394  if (pe){
6395  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6396  }else {
6397  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
6398  }
6399 
6400  pe = pl.getEntryPtr("mj_keep_part_boxes");
6401  if (pe){
6402  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6403  }else {
6404  this->mj_keep_part_boxes = 0; // Set to invalid value
6405  }
6406 
6407  // For now, need keep_part_boxes to do pointAssign and boxAssign.
6408  // pe = pl.getEntryPtr("keep_cuts");
6409  // if (pe){
6410  // int tmp = pe->getValue(&tmp);
6411  // if (tmp) this->mj_keep_part_boxes = 1;
6412  // }
6413 
6414  //need to keep part boxes if mapping type is geometric.
6415  if (this->mj_keep_part_boxes == 0){
6416  pe = pl.getEntryPtr("mapping_type");
6417  if (pe){
6418  int mapping_type = -1;
6419  mapping_type = pe->getValue(&mapping_type);
6420  if (mapping_type == 0){
6421  mj_keep_part_boxes = 1;
6422  }
6423  }
6424  }
6425 
6426  //need to keep part boxes if mapping type is geometric.
6427  pe = pl.getEntryPtr("mj_enable_rcb");
6428  if (pe){
6429  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6430  }else {
6431  this->mj_run_as_rcb = 0; // Set to invalid value
6432  }
6433 
6434  pe = pl.getEntryPtr("mj_recursion_depth");
6435  if (pe){
6436  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6437  }else {
6438  mj_user_recursion_depth = -1; // Set to invalid value
6439  }
6440 
6441  int val = 0;
6442  pe = pl.getEntryPtr("rectilinear");
6443  if (pe) val = pe->getValue(&val);
6444  if (val == 1){
6445  this->distribute_points_on_cut_lines = false;
6446  } else {
6447  this->distribute_points_on_cut_lines = true;
6448  }
6449 
6450  if (this->mj_run_as_rcb){
6451  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6452  }
6453  if (this->recursion_depth < 1){
6454  if (mj_user_recursion_depth > 0){
6455  this->recursion_depth = mj_user_recursion_depth;
6456  }
6457  else {
6458  this->recursion_depth = this->coord_dim;
6459  }
6460  }
6461 
6462  this->num_threads = 1;
6463 #ifdef HAVE_ZOLTAN2_OMP
6464 #pragma omp parallel
6465  {
6466  this->num_threads = omp_get_num_threads();
6467  }
6468 #endif
6469 
6470 }
6471 
6473 template <typename Adapter>
6475  int dim,
6476  typename Adapter::scalar_t *lower,
6477  typename Adapter::scalar_t *upper,
6478  size_t &nPartsFound,
6479  typename Adapter::part_t **partsFound) const
6480 {
6481  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6482  // TODO: complexity. Or at least do a search through the boxes, using
6483  // TODO: p x q x r x ... if possible.
6484 
6485  nPartsFound = 0;
6486  *partsFound = NULL;
6487 
6488  if (this->mj_keep_part_boxes) {
6489 
6490  // Get vector of part boxes
6491  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6492 
6493  size_t nBoxes = (*partBoxes).size();
6494  if (nBoxes == 0) {
6495  throw std::logic_error("no part boxes exist");
6496  }
6497 
6498  // Determine whether the box overlaps the globalBox at all
6499  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6500 
6501  if (globalBox->boxesOverlap(dim, lower, upper)) {
6502 
6503  std::vector<typename Adapter::part_t> partlist;
6504 
6505  // box overlaps the global box; find specific overlapping boxes
6506  for (size_t i = 0; i < nBoxes; i++) {
6507  try {
6508  if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6509  nPartsFound++;
6510  partlist.push_back((*partBoxes)[i].getpId());
6511 
6512 // std::cout << "Given box (";
6513 // for (int j = 0; j < dim; j++)
6514 // std::cout << lower[j] << " ";
6515 // std::cout << ") x (";
6516 // for (int j = 0; j < dim; j++)
6517 // std::cout << upper[j] << " ";
6518 // std::cout << ") overlaps PartBox "
6519 // << (*partBoxes)[i].getpId() << " (";
6520 // for (int j = 0; j < dim; j++)
6521 // std::cout << (*partBoxes)[i].getlmins()[j] << " ";
6522 // std::cout << ") x (";
6523 // for (int j = 0; j < dim; j++)
6524 // std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
6525 // std::cout << ")" << std::endl;
6526  }
6527  }
6529  }
6530  if (nPartsFound) {
6531  *partsFound = new mj_part_t[nPartsFound];
6532  for (size_t i = 0; i < nPartsFound; i++)
6533  (*partsFound)[i] = partlist[i];
6534  }
6535  }
6536  else {
6537  // Box does not overlap the domain at all. Find the closest part
6538  // Not sure how to perform this operation for MJ without having the
6539  // cuts. With the RCB cuts, the concept of a part extending to
6540  // infinity was natural. With the boxes, it is much more difficult.
6541  // TODO: For now, return information indicating NO OVERLAP.
6542 
6543  }
6544  }
6545  else {
6546  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
6547  }
6548 }
6549 
6551 template <typename Adapter>
6552 typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
6553  int dim,
6554  typename Adapter::scalar_t *point) const
6555 {
6556 
6557  // TODO: Implement with cuts rather than boxes to reduce algorithmic
6558  // TODO: complexity. Or at least do a search through the boxes, using
6559  // TODO: p x q x r x ... if possible.
6560 
6561  if (this->mj_keep_part_boxes) {
6562  typename Adapter::part_t foundPart = -1;
6563 
6564  // Get vector of part boxes
6565  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6566 
6567  size_t nBoxes = (*partBoxes).size();
6568  if (nBoxes == 0) {
6569  throw std::logic_error("no part boxes exist");
6570  }
6571 
6572  // Determine whether the point is within the global domain
6573  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6574 
6575  if (globalBox->pointInBox(dim, point)) {
6576 
6577  // point is in the global domain; determine in which part it is.
6578  size_t i;
6579  for (i = 0; i < nBoxes; i++) {
6580  try {
6581  if ((*partBoxes)[i].pointInBox(dim, point)) {
6582  foundPart = (*partBoxes)[i].getpId();
6583 // std::cout << "Point (";
6584 // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
6585 // std::cout << ") found in box " << i << " part " << foundPart
6586 // << std::endl;
6587 // (*partBoxes)[i].print();
6588  break;
6589  }
6590  }
6592  }
6593 
6594  if (i == nBoxes) {
6595  // This error should never occur
6596  std::ostringstream oss;
6597  oss << "Point (";
6598  for (int j = 0; j < dim; j++) oss << point[j] << " ";
6599  oss << ") not found in domain";
6600  throw std::logic_error(oss.str());
6601  }
6602  }
6603 
6604  else {
6605  // Point is outside the global domain.
6606  // Determine to which part it is closest.
6607  // TODO: with cuts, would not need this special case
6608 
6609  size_t closestBox = 0;
6610  mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6611  mj_scalar_t *centroid = new mj_scalar_t[dim];
6612  for (size_t i = 0; i < nBoxes; i++) {
6613  (*partBoxes)[i].computeCentroid(centroid);
6614  mj_scalar_t sum = 0.;
6615  mj_scalar_t diff;
6616  for (int j = 0; j < dim; j++) {
6617  diff = centroid[j] - point[j];
6618  sum += diff * diff;
6619  }
6620  if (sum < minDistance) {
6621  minDistance = sum;
6622  closestBox = i;
6623  }
6624  }
6625  foundPart = (*partBoxes)[closestBox].getpId();
6626  delete [] centroid;
6627  }
6628 
6629  return foundPart;
6630  }
6631  else {
6632  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
6633  }
6634 }
6635 
6636 template <typename Adapter>
6638  const PartitioningSolution<Adapter> *solution,
6639  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
6640  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
6641 {
6642  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6643  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6644  mj_part_t ntasks = (*pBoxes).size();
6645  int dim = (*pBoxes)[0].getDim();
6646  GridHash<mj_scalar_t, mj_part_t> grid(pBoxes, ntasks, dim);
6647  grid.getAdjArrays(comXAdj_, comAdj_);
6648  }
6649  comAdj = comAdj_;
6650  comXAdj = comXAdj_;
6651 }
6652 
6653 
6654 template <typename Adapter>
6655 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6657 {
6658  return this->mj_partitioner.get_kept_boxes();
6659 }
6660 
6661 
6662 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6663  typename mj_part_t>
6664 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6666 {
6667  if (this->mj_keep_part_boxes)
6668  return this->kept_boxes;
6669  else
6670  throw std::logic_error("Error: part boxes are not stored.");
6671 }
6672 
6673 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6674  typename mj_part_t>
6675 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6677  RCP<mj_partBoxVector_t> &localPartBoxes
6678 ) const
6679 {
6680  mj_part_t ntasks = this->num_global_parts;
6681  int dim = (*localPartBoxes)[0].getDim();
6682  mj_scalar_t *localPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6683 
6684  memset(localPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6685 
6686  mj_scalar_t *globalPartBoundaries = new mj_scalar_t[ntasks * 2 *dim];
6687  memset(globalPartBoundaries, 0, sizeof(mj_scalar_t) * ntasks * 2 *dim);
6688 
6689  mj_scalar_t *localPartMins = localPartBoundaries;
6690  mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6691 
6692  mj_scalar_t *globalPartMins = globalPartBoundaries;
6693  mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6694 
6695  mj_part_t boxCount = localPartBoxes->size();
6696  for (mj_part_t i = 0; i < boxCount; ++i){
6697  mj_part_t pId = (*localPartBoxes)[i].getpId();
6698  //cout << "me:" << comm->getRank() << " has:" << pId << endl;
6699 
6700  mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6701  mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6702 
6703  for (int j = 0; j < dim; ++j){
6704  localPartMins[dim * pId + j] = lmins[j];
6705  localPartMaxs[dim * pId + j] = lmaxs[j];
6706  /*
6707  cout << "me:" << comm->getRank() <<
6708  " dim * pId + j:"<< dim * pId + j <<
6709  " localMin:" << localPartMins[dim * pId + j] <<
6710  " localMax:" << localPartMaxs[dim * pId + j] << endl;
6711  */
6712  }
6713  }
6714 
6715  Teuchos::Zoltan2_BoxBoundaries<int, mj_scalar_t> reductionOp(ntasks * 2 *dim);
6716 
6717  reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6718  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
6719  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
6720  for (mj_part_t i = 0; i < ntasks; ++i){
6722  globalPartMins + dim * i,
6723  globalPartMaxs + dim * i);
6724 
6725  /*
6726  for (int j = 0; j < dim; ++j){
6727  cout << "me:" << comm->getRank() <<
6728  " dim * pId + j:"<< dim * i + j <<
6729  " globalMin:" << globalPartMins[dim * i + j] <<
6730  " globalMax:" << globalPartMaxs[dim * i + j] << endl;
6731  }
6732  */
6733  pB->push_back(tpb);
6734  }
6735  delete []localPartBoundaries;
6736  delete []globalPartBoundaries;
6737  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
6738  return pB;
6739 }
6740 } // namespace Zoltan2
6741 
6742 #endif
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_)
Multi Jagged coordinate partitioning algorithm.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Defines Parameter related enumerators, declares functions.
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
size_t global_size_t
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
#define imbalanceOf2(Wachieved, wExpected)
RCP< mj_partBoxVector_t > get_kept_boxes() const
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
#define SIGNIFICANCE_MUL
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric partitioning.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
T * allocMemory(size_t size)
Allocates memory for the given size.
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
dictionary vals
Definition: xml2dox.py:186
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
#define ZOLTAN2_ABS(x)
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
void getInputArray(ArrayRCP< const T > &array) const
Create a contiguous array of the required type, perhaps for a TPL.
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
The StridedData class manages lists of weights or coordinates.
Algorithm defines the base class for all algorithms.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
uMultiSortItem(IT index_, CT count_, WT *vals_)
std::string toString(tt obj)
Converts the given object to string.
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
#define epsilon
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Multi Jagged coordinate partitioning algorithm.