49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
57 #include <Teuchos_StandardParameterEntryValidators.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_ParameterList.hpp>
67 #if defined(__cplusplus) && __cplusplus >= 201103L
68 #include <unordered_map>
70 #include <Teuchos_Hashtable.hpp>
73 #ifdef ZOLTAN2_USEZOLTANCOMM
74 #ifdef HAVE_ZOLTAN2_MPI
75 #define ENABLE_ZOLTAN_MIGRATION
76 #include "zoltan_comm_cpp.h"
77 #include "zoltan_types.h"
81 #ifdef HAVE_ZOLTAN2_OMP
85 #define LEAST_SIGNIFICANCE 0.0001
86 #define SIGNIFICANCE_MUL 1000
91 #define FUTURE_REDUCEALL_CUTOFF 1500000
94 #define MIN_WORK_LAST_DIM 1000
99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x))
101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \
102 (Wachieved) / ((totalW) * (expectedRatio)) - 1
103 #define imbalanceOf2(Wachieved, wExpected) \
104 (Wachieved) / (wExpected) - 1
107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const
141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
209 this->index = index_;
210 this->count = count_;
216 this->index = other.
index;
217 this->count = other.
count;
218 this->val = other.
val;
226 void set(IT index_ ,CT count_, WT *vals_){
227 this->index = index_;
228 this->count = count_;
234 this->index = other.
index;
235 this->count = other.
count;
236 this->val = other.
val;
241 assert (this->count == other.
count);
242 for(CT i = 0; i < this->
count; ++i){
248 if(this->val[i] < other.
val[i]){
257 return this->index < other.
index;
260 assert (this->count == other.
count);
261 for(CT i = 0; i < this->
count; ++i){
267 if(this->val[i] > other.
val[i]){
277 return this->index > other.
index;
284 template <
class IT,
class WT>
295 template <
class IT,
class WT>
301 IT i, ir=n, j, k, l=1;
302 IT jstack=0, istack[50];
311 for (j=l+1;j<=ir;j++)
317 if (arr[i].val <= aval)
333 if (arr[l+1].val > arr[ir].val)
337 if (arr[l].val > arr[ir].val)
341 if (arr[l+1].val > arr[l].val)
351 do i++;
while (arr[i].val < aval);
352 do j--;
while (arr[j].val > aval);
359 if (jstack > NSTACK){
360 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
379 template <
class IT,
class WT,
class SIGN>
388 if (this->signbit < rhs.
signbit){
392 else if (this->signbit == rhs.
signbit){
394 if (this->val < rhs.
val){
398 else if (this->val > rhs.
val){
414 if (this->signbit > rhs.
signbit){
418 else if (this->signbit == rhs.
signbit){
420 if (this->val < rhs.
val){
424 else if (this->val > rhs.
val){
438 return !(*
this > rhs);}
440 return !(*
this < rhs);}
446 template <
class IT,
class WT,
class SIGN>
451 IT i, ir=n, j, k, l=1;
452 IT jstack=0, istack[50];
460 for (j=l+1;j<=ir;j++)
482 if (arr[l+1] > arr[ir])
486 if (arr[l] > arr[ir])
490 if (arr[l+1] > arr[l])
499 do i++;
while (arr[i] < a);
500 do j--;
while (arr[j] > a);
507 if (jstack > NSTACK){
508 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
530 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
536 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
538 RCP<const Environment> mj_env;
539 RCP<const Comm<int> > mj_problemComm;
541 double imbalance_tolerance;
542 mj_part_t *part_no_array;
544 int coord_dim, num_weights_per_coord;
546 size_t initial_num_loc_coords;
549 mj_lno_t num_local_coords;
550 mj_gno_t num_global_coords;
552 mj_scalar_t **mj_coordinates;
553 mj_scalar_t **mj_weights;
554 bool *mj_uniform_parts;
555 mj_scalar_t **mj_part_sizes;
556 bool *mj_uniform_weights;
558 ArrayView<const mj_gno_t> mj_gnos;
559 size_t num_global_parts;
561 mj_gno_t *initial_mj_gnos;
562 mj_gno_t *current_mj_gnos;
563 int *owner_of_coordinate;
565 mj_lno_t *coordinate_permutations;
566 mj_lno_t *new_coordinate_permutations;
567 mj_part_t *assigned_part_ids;
570 mj_lno_t *new_part_xadj;
573 bool distribute_points_on_cut_lines;
574 mj_part_t max_concurrent_part_calculation;
577 int mj_user_recursion_depth;
578 bool mj_keep_part_boxes;
580 int check_migrate_avoid_migration_option;
583 double minimum_migration_imbalance;
597 mj_part_t num_first_level_parts;
598 const mj_part_t *first_level_distribution;
600 mj_part_t total_num_cut ;
601 mj_part_t total_num_part;
603 mj_part_t max_num_part_along_dim ;
604 mj_part_t max_num_cut_along_dim;
605 size_t max_num_total_part_along_dim;
607 mj_part_t total_dim_num_reduce_all;
608 mj_part_t last_dim_num_part;
612 RCP<Comm<int> > comm;
614 mj_scalar_t sEpsilon;
616 mj_scalar_t maxScalar_t;
617 mj_scalar_t minScalar_t;
619 mj_scalar_t *all_cut_coordinates;
620 mj_scalar_t *max_min_coords;
621 mj_scalar_t *process_cut_line_weight_to_put_left;
622 mj_scalar_t **thread_cut_line_weight_to_put_left;
628 mj_scalar_t *cut_coordinates_work_array;
631 mj_scalar_t *target_part_weights;
633 mj_scalar_t *cut_upper_bound_coordinates ;
634 mj_scalar_t *cut_lower_bound_coordinates ;
635 mj_scalar_t *cut_lower_bound_weights ;
636 mj_scalar_t *cut_upper_bound_weights ;
638 mj_scalar_t *process_local_min_max_coord_total_weight ;
639 mj_scalar_t *global_min_max_coord_total_weight ;
643 bool *is_cut_line_determined;
646 mj_part_t *my_incomplete_cut_count;
648 double **thread_part_weights;
650 double **thread_part_weight_work;
653 mj_scalar_t **thread_cut_left_closest_point;
655 mj_scalar_t **thread_cut_right_closest_point;
658 mj_lno_t **thread_point_counts;
660 mj_scalar_t *process_rectilinear_cut_weight;
661 mj_scalar_t *global_rectilinear_cut_weight;
667 mj_scalar_t *total_part_weight_left_right_closests ;
668 mj_scalar_t *global_total_part_weight_left_right_closests;
670 RCP<mj_partBoxVector_t> kept_boxes;
673 RCP<mj_partBox_t> global_box;
674 int myRank, myActualRank;
676 bool divide_to_prime_first;
685 void set_part_specifications();
692 inline mj_part_t get_part_count(
693 mj_part_t num_total_future,
699 void allocate_set_work_memory();
706 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
709 void compute_global_box();
725 mj_part_t update_part_num_arrays(
726 std::vector<mj_part_t> &num_partitioning_in_current_dim,
727 std::vector<mj_part_t> *future_num_part_in_parts,
728 std::vector<mj_part_t> *next_future_num_parts_in_parts,
729 mj_part_t &future_num_parts,
730 mj_part_t current_num_parts,
731 int current_iteration,
732 RCP<mj_partBoxVector_t> input_part_boxes,
733 RCP<mj_partBoxVector_t> output_part_boxes,
734 mj_part_t atomic_part_count);
747 void mj_get_local_min_max_coord_totW(
748 mj_lno_t coordinate_begin_index,
749 mj_lno_t coordinate_end_index,
750 mj_lno_t *mj_current_coordinate_permutations,
751 mj_scalar_t *mj_current_dim_coords,
752 mj_scalar_t &min_coordinate,
753 mj_scalar_t &max_coordinate,
754 mj_scalar_t &total_weight);
763 void mj_get_global_min_max_coord_totW(
764 mj_part_t current_concurrent_num_parts,
765 mj_scalar_t *local_min_max_total,
766 mj_scalar_t *global_min_max_total);
795 void mj_get_initial_cut_coords_target_weights(
796 mj_scalar_t min_coord,
797 mj_scalar_t max_coord,
799 mj_scalar_t global_weight,
800 mj_scalar_t *initial_cut_coords ,
801 mj_scalar_t *target_part_weights ,
803 std::vector <mj_part_t> *future_num_part_in_parts,
804 std::vector <mj_part_t> *next_future_num_parts_in_parts,
805 mj_part_t concurrent_current_part,
806 mj_part_t obtained_part_index,
807 mj_part_t num_target_first_level_parts = 1,
808 const mj_part_t *target_first_level_dist = NULL);
822 void set_initial_coordinate_parts(
823 mj_scalar_t &max_coordinate,
824 mj_scalar_t &min_coordinate,
825 mj_part_t &concurrent_current_part_index,
826 mj_lno_t coordinate_begin_index,
827 mj_lno_t coordinate_end_index,
828 mj_lno_t *mj_current_coordinate_permutations,
829 mj_scalar_t *mj_current_dim_coords,
830 mj_part_t *mj_part_ids,
831 mj_part_t &partition_count);
844 mj_scalar_t *mj_current_dim_coords,
845 double imbalanceTolerance,
846 mj_part_t current_work_part,
847 mj_part_t current_concurrent_num_parts,
848 mj_scalar_t *current_cut_coordinates,
849 mj_part_t total_incomplete_cut_count,
850 std::vector <mj_part_t> &num_partitioning_in_current_dim);
871 void mj_1D_part_get_thread_part_weights(
872 size_t total_part_count,
874 mj_scalar_t max_coord,
875 mj_scalar_t min_coord,
876 mj_lno_t coordinate_begin_index,
877 mj_lno_t coordinate_end_index,
878 mj_scalar_t *mj_current_dim_coords,
879 mj_scalar_t *temp_current_cut_coords,
880 bool *current_cut_status,
881 double *my_current_part_weights,
882 mj_scalar_t *my_current_left_closest,
883 mj_scalar_t *my_current_right_closest);
892 void mj_accumulate_thread_results(
893 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
894 mj_part_t current_work_part,
895 mj_part_t current_concurrent_num_parts);
927 void mj_get_new_cut_coordinates(
928 const size_t &num_total_part,
929 const mj_part_t &num_cuts,
930 const mj_scalar_t &max_coordinate,
931 const mj_scalar_t &min_coordinate,
932 const mj_scalar_t &global_total_weight,
933 const double &used_imbalance_tolerance,
934 mj_scalar_t * current_global_part_weights,
935 const mj_scalar_t * current_local_part_weights,
936 const mj_scalar_t *current_part_target_weights,
937 bool *current_cut_line_determined,
938 mj_scalar_t *current_cut_coordinates,
939 mj_scalar_t *current_cut_upper_bounds,
940 mj_scalar_t *current_cut_lower_bounds,
941 mj_scalar_t *current_global_left_closest_points,
942 mj_scalar_t *current_global_right_closest_points,
943 mj_scalar_t * current_cut_lower_bound_weights,
944 mj_scalar_t * current_cut_upper_weights,
945 mj_scalar_t *new_current_cut_coordinates,
946 mj_scalar_t *current_part_cut_line_weight_to_put_left,
947 mj_part_t *rectilinear_cut_count,
948 mj_part_t &my_num_incomplete_cut);
959 void mj_calculate_new_cut_position (
960 mj_scalar_t cut_upper_bound,
961 mj_scalar_t cut_lower_bound,
962 mj_scalar_t cut_upper_weight,
963 mj_scalar_t cut_lower_weight,
964 mj_scalar_t expected_weight,
965 mj_scalar_t &new_cut_position);
977 void mj_create_new_partitions(
979 mj_scalar_t *mj_current_dim_coords,
980 mj_scalar_t *current_concurrent_cut_coordinate,
981 mj_lno_t coordinate_begin,
982 mj_lno_t coordinate_end,
983 mj_scalar_t *used_local_cut_line_weight_to_left,
984 double **used_thread_part_weight_work,
985 mj_lno_t *out_part_xadj);
1009 bool mj_perform_migration(
1010 mj_part_t in_num_parts,
1011 mj_part_t &out_num_parts,
1012 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1013 mj_part_t &output_part_begin_index,
1014 size_t migration_reduce_all_population,
1015 mj_lno_t num_coords_for_last_dim_part,
1016 std::string iteration,
1017 RCP<mj_partBoxVector_t> &input_part_boxes,
1018 RCP<mj_partBoxVector_t> &output_part_boxes);
1029 void get_processor_num_points_in_parts(
1030 mj_part_t num_procs,
1031 mj_part_t num_parts,
1032 mj_gno_t *&num_points_in_all_processor_parts);
1046 bool mj_check_to_migrate(
1047 size_t migration_reduce_all_population,
1048 mj_lno_t num_coords_for_last_dim_part,
1049 mj_part_t num_procs,
1050 mj_part_t num_parts,
1051 mj_gno_t *num_points_in_all_processor_parts);
1071 void mj_migration_part_proc_assignment(
1072 mj_gno_t * num_points_in_all_processor_parts,
1073 mj_part_t num_parts,
1074 mj_part_t num_procs,
1075 mj_lno_t *send_count_to_each_proc,
1076 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1077 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1078 mj_part_t &out_num_part,
1079 std::vector<mj_part_t> &out_part_indices,
1080 mj_part_t &output_part_numbering_begin_index,
1081 int *coordinate_destinations);
1099 void mj_assign_proc_to_parts(
1100 mj_gno_t * num_points_in_all_processor_parts,
1101 mj_part_t num_parts,
1102 mj_part_t num_procs,
1103 mj_lno_t *send_count_to_each_proc,
1104 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1105 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1106 mj_part_t &out_part_index,
1107 mj_part_t &output_part_numbering_begin_index,
1108 int *coordinate_destinations);
1120 void assign_send_destinations(
1121 mj_part_t num_parts,
1122 mj_part_t *part_assignment_proc_begin_indices,
1123 mj_part_t *processor_chains_in_parts,
1124 mj_lno_t *send_count_to_each_proc,
1125 int *coordinate_destinations);
1139 void assign_send_destinations2(
1140 mj_part_t num_parts,
1142 int *coordinate_destinations,
1143 mj_part_t &output_part_numbering_begin_index,
1144 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1162 void mj_assign_parts_to_procs(
1163 mj_gno_t * num_points_in_all_processor_parts,
1164 mj_part_t num_parts,
1165 mj_part_t num_procs,
1166 mj_lno_t *send_count_to_each_proc,
1167 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1168 mj_part_t &out_num_part,
1169 std::vector<mj_part_t> &out_part_indices,
1170 mj_part_t &output_part_numbering_begin_index,
1171 int *coordinate_destinations);
1185 void mj_migrate_coords(
1186 mj_part_t num_procs,
1187 mj_lno_t &num_new_local_points,
1188 std::string iteration,
1189 int *coordinate_destinations,
1190 mj_part_t num_parts);
1198 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1206 void fill_permutation_array(
1207 mj_part_t output_num_parts,
1208 mj_part_t num_parts);
1218 void set_final_parts(
1219 mj_part_t current_num_parts,
1220 mj_part_t output_part_begin_index,
1221 RCP<mj_partBoxVector_t> &output_part_boxes,
1222 bool is_data_ever_migrated);
1225 void free_work_memory();
1239 void create_consistent_chunks(
1240 mj_part_t num_parts,
1241 mj_scalar_t *mj_current_dim_coords,
1242 mj_scalar_t *current_concurrent_cut_coordinate,
1243 mj_lno_t coordinate_begin,
1244 mj_lno_t coordinate_end,
1245 mj_scalar_t *used_local_cut_line_weight_to_left,
1246 mj_lno_t *out_part_xadj,
1253 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1254 mj_part_t largest_factor = 1;
1255 mj_part_t n = num_parts;
1256 mj_part_t divisor = 2;
1258 while (n % divisor == 0){
1260 largest_factor = divisor;
1263 if (divisor * divisor > n){
1270 return largest_factor;
1303 const RCP<const Environment> &env,
1304 RCP<
const Comm<int> > &problemComm,
1306 double imbalance_tolerance,
1307 size_t num_global_parts,
1308 mj_part_t *part_no_array,
1309 int recursion_depth,
1312 mj_lno_t num_local_coords,
1313 mj_gno_t num_global_coords,
1314 const mj_gno_t *initial_mj_gnos,
1315 mj_scalar_t **mj_coordinates,
1317 int num_weights_per_coord,
1318 bool *mj_uniform_weights,
1319 mj_scalar_t **mj_weights,
1320 bool *mj_uniform_parts,
1321 mj_scalar_t **mj_part_sizes,
1323 mj_part_t *&result_assigned_part_ids,
1324 mj_gno_t *&result_mj_gnos);
1336 bool distribute_points_on_cut_lines_,
1337 int max_concurrent_part_calculation_,
1338 int check_migrate_avoid_migration_option_,
1339 double minimum_migration_imbalance_,
int migration_type_ = 0);
1353 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1398 const RCP<const Environment> &env,
1399 mj_lno_t num_total_coords,
1400 mj_lno_t num_selected_coords,
1401 size_t num_target_part,
1403 mj_scalar_t **mj_coordinates,
1404 mj_lno_t *initial_selected_coords_output_permutation,
1405 mj_lno_t *output_xadj,
1406 int recursion_depth,
1407 const mj_part_t *part_no_array,
1408 bool partition_along_longest_dim,
1409 int num_ranks_per_node,
1410 bool divide_to_prime_first_,
1411 mj_part_t num_first_level_parts_ = 1,
1412 const mj_part_t *first_level_distribution_ = NULL);
1458 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1461 const RCP<const Environment> &env,
1462 mj_lno_t num_total_coords,
1463 mj_lno_t num_selected_coords,
1464 size_t num_target_part,
1466 mj_scalar_t **mj_coordinates_,
1467 mj_lno_t *inital_adjList_output_adjlist,
1468 mj_lno_t *output_xadj,
1470 const mj_part_t *part_no_array_,
1471 bool partition_along_longest_dim,
1472 int num_ranks_per_node,
1473 bool divide_to_prime_first_,
1474 mj_part_t num_first_level_parts_,
1475 const mj_part_t *first_level_distribution_) {
1478 const RCP<Comm<int> > commN;
1479 this->mj_problemComm =
1480 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1482 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1483 this->myActualRank = this->myRank = 1;
1485 #ifdef HAVE_ZOLTAN2_OMP
1490 this->divide_to_prime_first = divide_to_prime_first_;
1495 this->imbalance_tolerance = 0;
1496 this->num_global_parts = num_target_part;
1497 this->part_no_array = (mj_part_t *)part_no_array_;
1498 this->recursion_depth = rd;
1502 this->num_first_level_parts = num_first_level_parts_;
1503 this->first_level_distribution = (mj_part_t *)first_level_distribution_;
1505 this->coord_dim = coord_dim_;
1506 this->num_local_coords = num_total_coords;
1507 this->num_global_coords = num_total_coords;
1508 this->mj_coordinates = mj_coordinates_;
1512 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1514 this->num_weights_per_coord = 0;
1515 bool *tmp_mj_uniform_weights =
new bool[1];
1516 this->mj_uniform_weights = tmp_mj_uniform_weights;
1517 this->mj_uniform_weights[0] =
true;
1519 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1520 this->mj_weights = tmp_mj_weights;
1522 bool *tmp_mj_uniform_parts =
new bool[1];
1523 this->mj_uniform_parts = tmp_mj_uniform_parts;
1524 this->mj_uniform_parts[0] =
true;
1526 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1527 this->mj_part_sizes = tmp_mj_part_sizes;
1528 this->mj_part_sizes[0] = NULL;
1530 this->num_threads = 1;
1531 this->set_part_specifications();
1533 this->allocate_set_work_memory();
1535 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1536 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1537 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1540 mj_part_t current_num_parts = 1;
1542 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1544 mj_part_t future_num_parts = this->total_num_part;
1546 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1547 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1548 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1549 RCP<mj_partBoxVector_t> t1;
1550 RCP<mj_partBoxVector_t> t2;
1553 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1555 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1556 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1558 for (
int i = 0; i < this->recursion_depth; ++i) {
1562 std::vector <mj_part_t> num_partitioning_in_current_dim;
1576 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1577 future_num_part_in_parts = next_future_num_parts_in_parts;
1578 next_future_num_parts_in_parts = tmpPartVect;
1583 next_future_num_parts_in_parts->clear();
1587 mj_part_t output_part_count_in_dimension =
1588 this->update_part_num_arrays(
1589 num_partitioning_in_current_dim,
1590 future_num_part_in_parts,
1591 next_future_num_parts_in_parts,
1596 t2, num_ranks_per_node);
1601 if(output_part_count_in_dimension == current_num_parts) {
1602 tmpPartVect= future_num_part_in_parts;
1603 future_num_part_in_parts = next_future_num_parts_in_parts;
1604 next_future_num_parts_in_parts = tmpPartVect;
1609 std::string istring = Teuchos::toString<int>(i);
1613 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1616 mj_part_t output_part_index = 0;
1619 mj_part_t output_coordinate_end_index = 0;
1621 mj_part_t current_work_part = 0;
1622 mj_part_t current_concurrent_num_parts = 1;
1624 mj_part_t obtained_part_index = 0;
1627 int coordInd = i % this->coord_dim;
1628 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1632 for (; current_work_part < current_num_parts;
1633 current_work_part += current_concurrent_num_parts) {
1639 mj_part_t actual_work_part_count = 0;
1643 for (
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1644 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1648 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1651 ++actual_work_part_count;
1652 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1653 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
1654 0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
1664 if (partition_along_longest_dim) {
1666 mj_scalar_t best_weight_coord = 0;
1667 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1668 mj_scalar_t best_min_coord = 0;
1669 mj_scalar_t best_max_coord = 0;
1672 this->mj_get_local_min_max_coord_totW(
1673 coordinate_begin_index,
1674 coordinate_end_index,
1675 this->coordinate_permutations,
1676 this->mj_coordinates[coord_traverse_ind],
1682 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1683 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1684 mj_scalar_t best_range = best_max_coord - best_min_coord;
1685 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1686 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1687 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1691 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1692 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1707 mj_current_dim_coords = this->mj_coordinates[coordInd];
1709 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1710 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1711 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1715 this->mj_get_local_min_max_coord_totW(
1716 coordinate_begin_index,
1717 coordinate_end_index,
1718 this->coordinate_permutations,
1719 mj_current_dim_coords,
1720 this->process_local_min_max_coord_total_weight[kk],
1721 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1722 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1728 if (actual_work_part_count > 0) {
1730 this->mj_get_global_min_max_coord_totW(
1731 current_concurrent_num_parts,
1732 this->process_local_min_max_coord_total_weight,
1733 this->global_min_max_coord_total_weight);
1737 mj_part_t total_incomplete_cut_count = 0;
1742 mj_part_t concurrent_part_cut_shift = 0;
1743 mj_part_t concurrent_part_part_shift = 0;
1746 for (
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1747 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1748 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1749 current_concurrent_num_parts];
1750 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
1751 2 * current_concurrent_num_parts];
1753 mj_part_t concurrent_current_part_index = current_work_part + kk;
1755 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1757 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1758 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1759 concurrent_part_part_shift;
1761 concurrent_part_cut_shift += partition_count - 1;
1763 concurrent_part_part_shift += partition_count;
1767 if(partition_count > 1 && min_coordinate <= max_coordinate){
1771 total_incomplete_cut_count += partition_count - 1;
1774 this->my_incomplete_cut_count[kk] = partition_count - 1;
1780 first_level_distribution != NULL &&
1781 num_first_level_parts > 1) {
1783 this->mj_get_initial_cut_coords_target_weights(
1786 partition_count - 1,
1787 global_total_weight,
1789 current_target_part_weights,
1790 future_num_part_in_parts,
1791 next_future_num_parts_in_parts,
1792 concurrent_current_part_index,
1793 obtained_part_index,
1794 this->num_first_level_parts,
1795 this->first_level_distribution);
1801 this->mj_get_initial_cut_coords_target_weights(
1804 partition_count - 1,
1805 global_total_weight,
1807 current_target_part_weights,
1808 future_num_part_in_parts,
1809 next_future_num_parts_in_parts,
1810 concurrent_current_part_index,
1811 obtained_part_index);
1814 mj_lno_t coordinate_end_index = this->part_xadj[concurrent_current_part_index];
1815 mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
1816 0 : this->part_xadj[concurrent_current_part_index - 1];
1819 this->set_initial_coordinate_parts(
1822 concurrent_current_part_index,
1823 coordinate_begin_index, coordinate_end_index,
1824 this->coordinate_permutations,
1825 mj_current_dim_coords,
1826 this->assigned_part_ids,
1832 this->my_incomplete_cut_count[kk] = 0;
1834 obtained_part_index += partition_count;
1838 double used_imbalance = 0;
1843 mj_current_dim_coords,
1846 current_concurrent_num_parts,
1847 current_cut_coordinates,
1848 total_incomplete_cut_count,
1849 num_partitioning_in_current_dim);
1852 obtained_part_index += current_concurrent_num_parts;
1858 mj_part_t output_array_shift = 0;
1859 mj_part_t cut_shift = 0;
1860 size_t tlr_shift = 0;
1861 size_t partweight_array_shift = 0;
1863 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1864 mj_part_t current_concurrent_work_part = current_work_part + kk;
1865 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1868 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1869 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1871 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1872 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1874 cut_shift += num_parts - 1;
1875 tlr_shift += (4 *(num_parts - 1) + 1);
1876 output_array_shift += num_parts;
1877 partweight_array_shift += (2 * (num_parts - 1) + 1);
1881 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1882 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1884 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1885 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1888 for(
int ii = 0; ii < this->num_threads; ++ii){
1889 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1894 this->create_consistent_chunks(
1896 mj_current_dim_coords,
1897 current_concurrent_cut_coordinate,
1900 used_local_cut_line_weight_to_left,
1901 this->new_part_xadj + output_part_index + output_array_shift,
1903 partition_along_longest_dim,
1904 p_coord_dimension_range_sorted);
1909 mj_lno_t part_size = coordinate_end - coordinate_begin;
1910 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1911 memcpy(this->new_coordinate_permutations + coordinate_begin,
1912 this->coordinate_permutations + coordinate_begin,
1913 part_size *
sizeof(mj_lno_t));
1918 cut_shift += num_parts - 1;
1919 tlr_shift += (4 *(num_parts - 1) + 1);
1920 output_array_shift += num_parts;
1921 partweight_array_shift += (2 * (num_parts - 1) + 1);
1930 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1931 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1932 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1934 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1936 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1937 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1939 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1940 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1942 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1947 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1949 output_part_index += num_parts ;
1956 current_num_parts = output_part_count_in_dimension;
1959 mj_lno_t * tmp = this->coordinate_permutations;
1960 this->coordinate_permutations = this->new_coordinate_permutations;
1961 this->new_coordinate_permutations = tmp;
1963 freeArray<mj_lno_t>(this->part_xadj);
1964 this->part_xadj = this->new_part_xadj;
1965 this->new_part_xadj = NULL;
1968 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1969 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1974 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1975 output_xadj[i+1] = this->part_xadj[i];
1978 delete future_num_part_in_parts;
1979 delete next_future_num_parts_in_parts;
1982 freeArray<mj_part_t>(this->assigned_part_ids);
1983 freeArray<mj_gno_t>(this->initial_mj_gnos);
1984 freeArray<mj_gno_t>(this->current_mj_gnos);
1985 freeArray<bool>(tmp_mj_uniform_weights);
1986 freeArray<bool>(tmp_mj_uniform_parts);
1987 freeArray<mj_scalar_t *>(tmp_mj_weights);
1988 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1990 this->free_work_memory();
1992 #ifdef HAVE_ZOLTAN2_OMP
2000 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2003 mj_env(), mj_problemComm(), imbalance_tolerance(0),
2004 part_no_array(NULL), recursion_depth(0), coord_dim(0),
2005 num_weights_per_coord(0), initial_num_loc_coords(0),
2006 initial_num_glob_coords(0),
2007 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
2008 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
2009 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
2010 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
2011 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
2012 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
2013 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
2014 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
2015 check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
2016 num_threads(1), num_first_level_parts(1), first_level_distribution(NULL),
2017 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
2018 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
2019 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
2020 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
2021 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
2022 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
2023 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
2024 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
2025 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
2026 thread_part_weights(NULL), thread_part_weight_work(NULL),
2027 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
2028 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
2029 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
2030 global_total_part_weight_left_right_closests(NULL),
2031 kept_boxes(),global_box(),
2032 myRank(0), myActualRank(0), divide_to_prime_first(false)
2037 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
2038 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
2046 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2048 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
2051 return this->global_box;
2057 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2060 this->mj_keep_part_boxes =
true;
2071 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2075 this->total_num_cut = 0;
2076 this->total_num_part = 1;
2077 this->max_num_part_along_dim = 0;
2078 this->total_dim_num_reduce_all = 0;
2079 this->last_dim_num_part = 1;
2082 this->max_num_cut_along_dim = 0;
2083 this->max_num_total_part_along_dim = 0;
2085 if (this->part_no_array) {
2087 for (
int i = 0; i < this->recursion_depth; ++i){
2088 this->total_dim_num_reduce_all += this->total_num_part;
2089 this->total_num_part *= this->part_no_array[i];
2090 if(this->part_no_array[i] > this->max_num_part_along_dim) {
2091 this->max_num_part_along_dim = this->part_no_array[i];
2094 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
2095 this->num_global_parts = this->total_num_part;
2098 mj_part_t future_num_parts = this->num_global_parts;
2102 if (this->first_level_distribution != NULL &&
2103 this->num_first_level_parts > 1) {
2104 this->max_num_part_along_dim = this->num_first_level_parts;
2108 for (
int rd = 0; rd < this->recursion_depth; ++rd){
2110 mj_part_t maxNoPartAlongI = 0;
2111 mj_part_t nfutureNumParts = 0;
2116 this->first_level_distribution != NULL &&
2117 this->num_first_level_parts > 1) {
2119 maxNoPartAlongI = this->num_first_level_parts;
2120 this->max_num_part_along_dim = this->num_first_level_parts;
2122 mj_part_t sum_first_level_dist = 0;
2123 mj_part_t max_part = 0;
2126 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2128 sum_first_level_dist += this->first_level_distribution[i];
2130 if (this->first_level_distribution[i] > max_part)
2131 max_part = this->first_level_distribution[i];
2135 nfutureNumParts = this->num_global_parts * max_part / sum_first_level_dist;
2140 maxNoPartAlongI = this->get_part_count(future_num_parts,
2141 1.0f / (this->recursion_depth - rd));
2143 if (maxNoPartAlongI > this->max_num_part_along_dim)
2144 this->max_num_part_along_dim = maxNoPartAlongI;
2147 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2148 if (future_num_parts % maxNoPartAlongI){
2153 future_num_parts = nfutureNumParts;
2155 this->total_num_part = this->num_global_parts;
2157 if (this->divide_to_prime_first){
2158 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2159 this->last_dim_num_part = this->num_global_parts;
2168 for (
int i = 0; i < this->recursion_depth; ++i){
2169 this->total_dim_num_reduce_all += p;
2170 p *= this->max_num_part_along_dim;
2173 if (p / this->max_num_part_along_dim > this->num_global_parts){
2174 this->last_dim_num_part = this->num_global_parts;
2177 this->last_dim_num_part = p / this->max_num_part_along_dim;
2183 this->total_num_cut = this->total_num_part - 1;
2184 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2185 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2189 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2190 if(this->mj_problemComm->getRank() == 0){
2191 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2192 ") has been set bigger than maximum amount that can be used." <<
2193 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2195 this->max_concurrent_part_calculation = this->last_dim_num_part;
2204 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2206 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
2207 mj_part_t num_total_future,
2210 double fp = pow(num_total_future,
root);
2211 mj_part_t ip = mj_part_t (fp);
2212 if (fp - ip < this->fEpsilon * 100){
2234 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2236 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
2237 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2238 std::vector<mj_part_t> *future_num_part_in_parts,
2239 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2240 mj_part_t &future_num_parts,
2241 mj_part_t current_num_parts,
2242 int current_iteration,
2243 RCP<mj_partBoxVector_t> input_part_boxes,
2244 RCP<mj_partBoxVector_t> output_part_boxes,
2245 mj_part_t atomic_part_count) {
2248 mj_part_t output_num_parts = 0;
2250 if(this->part_no_array){
2255 mj_part_t p = this->part_no_array[current_iteration];
2257 std::cout <<
"Current recursive iteration: " << current_iteration
2258 <<
" part_no_array[" << current_iteration <<
"] is given as:" << p << std::endl;
2262 return current_num_parts;
2265 if (this->first_level_distribution != NULL &&
2266 current_iteration == 0 &&
2267 p != this->num_first_level_parts)
2269 std::cout <<
"Current recursive iteration: " << current_iteration
2270 <<
" part_no_array[" << current_iteration <<
"] is given as: " << p
2271 <<
" and contradicts num_first_level_parts: " << this->num_first_level_parts << std::endl;
2275 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2276 num_partitioning_in_current_dim.push_back(p);
2292 future_num_parts /= num_partitioning_in_current_dim[0];
2293 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2295 if (this->mj_keep_part_boxes){
2296 for (mj_part_t k = 0; k < current_num_parts; ++k){
2298 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2299 output_part_boxes->push_back((*input_part_boxes)[k]);
2307 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2308 next_future_num_parts_in_parts->push_back(future_num_parts);
2318 future_num_parts = 1;
2322 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2324 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2328 mj_part_t num_partitions_in_current_dim =
2329 this->get_part_count(future_num_parts_of_part_ii,
2330 1.0 / (this->recursion_depth - current_iteration) );
2332 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2333 std::cerr <<
"ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: "
2334 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2335 << this->max_num_part_along_dim <<
2336 " this->recursion_depth: " << this->recursion_depth <<
2337 " current_iteration: " << current_iteration <<
2338 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2339 " might need to fix max part no calculation for largest_prime_first partitioning." <<
2352 if (current_iteration == 0 &&
2353 this->first_level_distribution != NULL &&
2354 this->num_first_level_parts > 1) {
2358 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2361 output_num_parts = this->num_first_level_parts;
2364 future_num_parts /= this->num_first_level_parts;
2366 mj_part_t max_part = 0;
2367 mj_part_t sum_first_level_dist = 0;
2371 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2372 sum_first_level_dist += this->first_level_distribution[i];
2374 if (this->first_level_distribution[i] > max_part)
2375 max_part = this->first_level_distribution[i];
2379 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2383 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2385 next_future_num_parts_in_parts->push_back(this->first_level_distribution[i] *
2386 this->num_global_parts / sum_first_level_dist);
2389 else if (this->divide_to_prime_first) {
2392 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2394 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2397 output_num_parts += num_partitions_in_current_dim;
2399 if (future_num_parts_of_part_ii == atomic_part_count ||
2400 future_num_parts_of_part_ii % atomic_part_count != 0) {
2401 atomic_part_count = 1;
2404 largest_prime_factor =
2405 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2412 if (largest_prime_factor < num_partitions_in_current_dim) {
2413 largest_prime_factor = num_partitions_in_current_dim;
2417 mj_part_t ideal_num_future_parts_in_part =
2418 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2420 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2428 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2430 mj_part_t my_ideal_primescale = ideal_prime_scale;
2432 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2433 ++my_ideal_primescale;
2436 mj_part_t num_future_parts_for_part_iii =
2437 ideal_num_future_parts_in_part * my_ideal_primescale;
2440 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2442 ++num_future_parts_for_part_iii;
2445 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2448 if (this->mj_keep_part_boxes) {
2449 output_part_boxes->push_back((*input_part_boxes)[ii]);
2453 if (num_future_parts_for_part_iii > future_num_parts)
2454 future_num_parts = num_future_parts_for_part_iii;
2461 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2464 output_num_parts += num_partitions_in_current_dim;
2466 if (future_num_parts_of_part_ii == atomic_part_count ||
2467 future_num_parts_of_part_ii % atomic_part_count != 0) {
2468 atomic_part_count = 1;
2471 mj_part_t ideal_num_future_parts_in_part =
2472 (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2474 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2475 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2478 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2480 ++num_future_parts_for_part_iii;
2483 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2486 if (this->mj_keep_part_boxes){
2487 output_part_boxes->push_back((*input_part_boxes)[ii]);
2491 if (num_future_parts_for_part_iii > future_num_parts)
2492 future_num_parts = num_future_parts_for_part_iii;
2497 return output_num_parts;
2504 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2506 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){
2509 this->owner_of_coordinate = NULL;
2514 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2516 #ifdef HAVE_ZOLTAN2_OMP
2517 #pragma omp parallel for
2519 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2520 this->coordinate_permutations[i] = i;
2524 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2526 this->assigned_part_ids = NULL;
2527 if(this->num_local_coords > 0){
2528 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2534 this->part_xadj = allocMemory<mj_lno_t>(1);
2535 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2537 this->new_part_xadj = NULL;
2543 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2545 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2547 this->process_cut_line_weight_to_put_left = NULL;
2548 this->thread_cut_line_weight_to_put_left = NULL;
2550 if(this->distribute_points_on_cut_lines){
2551 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2552 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2553 for(
int i = 0; i < this->num_threads; ++i){
2554 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2556 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2557 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2565 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2566 this->max_concurrent_part_calculation);
2570 this->target_part_weights = allocMemory<mj_scalar_t>(
2571 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2574 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2575 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2576 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2577 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2579 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2580 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2584 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2587 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2589 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2591 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2594 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2596 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2599 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2601 for(
int i = 0; i < this->num_threads; ++i){
2603 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2604 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2605 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2606 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2612 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2613 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2616 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2617 for (
int i=0; i < this->coord_dim; i++){
2618 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2619 #ifdef HAVE_ZOLTAN2_OMP
2620 #pragma omp parallel for
2622 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2623 coord[i][j] = this->mj_coordinates[i][j];
2625 this->mj_coordinates = coord;
2628 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2629 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2631 for (
int i=0; i < criteria_dim; i++){
2634 for (
int i=0; i < this->num_weights_per_coord; i++){
2635 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2636 #ifdef HAVE_ZOLTAN2_OMP
2637 #pragma omp parallel for
2639 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2640 weights[i][j] = this->mj_weights[i][j];
2644 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2645 #ifdef HAVE_ZOLTAN2_OMP
2646 #pragma omp parallel for
2648 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2649 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2651 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2653 #ifdef HAVE_ZOLTAN2_OMP
2654 #pragma omp parallel for
2656 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2657 this->owner_of_coordinate[j] = this->myActualRank;
2662 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2664 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
2667 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2669 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2671 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2673 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2675 for (
int i = 0; i < this->coord_dim; ++i){
2676 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2677 mj_scalar_t localMax = -localMin;
2678 if (localMax > 0) localMax = 0;
2681 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2682 if (this->mj_coordinates[i][j] < localMin){
2683 localMin = this->mj_coordinates[i][j];
2685 if (this->mj_coordinates[i][j] > localMax){
2686 localMax = this->mj_coordinates[i][j];
2695 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2696 this->coord_dim, mins, gmins
2700 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2701 this->coord_dim, maxs, gmaxs
2707 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2709 freeArray<mj_scalar_t>(mins);
2710 freeArray<mj_scalar_t>(gmins);
2711 freeArray<mj_scalar_t>(maxs);
2712 freeArray<mj_scalar_t>(gmaxs);
2720 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2722 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
2723 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2726 mj_partBox_t tmp_box(*global_box);
2727 initial_partitioning_boxes->push_back(tmp_box);
2740 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2742 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
2743 mj_lno_t coordinate_begin_index,
2744 mj_lno_t coordinate_end_index,
2745 mj_lno_t *mj_current_coordinate_permutations,
2746 mj_scalar_t *mj_current_dim_coords,
2747 mj_scalar_t &min_coordinate,
2748 mj_scalar_t &max_coordinate,
2749 mj_scalar_t &total_weight){
2753 if(coordinate_begin_index >= coordinate_end_index)
2755 min_coordinate = this->maxScalar_t;
2756 max_coordinate = this->minScalar_t;
2760 mj_scalar_t my_total_weight = 0;
2761 #ifdef HAVE_ZOLTAN2_OMP
2762 #pragma omp parallel num_threads(this->num_threads)
2766 if (this->mj_uniform_weights[0]) {
2767 #ifdef HAVE_ZOLTAN2_OMP
2771 my_total_weight = coordinate_end_index - coordinate_begin_index;
2777 #ifdef HAVE_ZOLTAN2_OMP
2778 #pragma omp for reduction(+:my_total_weight)
2780 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781 int i = mj_current_coordinate_permutations[ii];
2782 my_total_weight += this->mj_weights[0][i];
2786 int my_thread_id = 0;
2787 #ifdef HAVE_ZOLTAN2_OMP
2788 my_thread_id = omp_get_thread_num();
2790 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2791 my_thread_min_coord=my_thread_max_coord
2792 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2795 #ifdef HAVE_ZOLTAN2_OMP
2798 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2799 int i = mj_current_coordinate_permutations[j];
2800 if(mj_current_dim_coords[i] > my_thread_max_coord)
2801 my_thread_max_coord = mj_current_dim_coords[i];
2802 if(mj_current_dim_coords[i] < my_thread_min_coord)
2803 my_thread_min_coord = mj_current_dim_coords[i];
2805 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2806 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2808 #ifdef HAVE_ZOLTAN2_OMP
2811 #pragma omp single nowait
2814 min_coordinate = this->max_min_coords[0];
2815 for(
int i = 1; i < this->num_threads; ++i){
2816 if(this->max_min_coords[i] < min_coordinate)
2817 min_coordinate = this->max_min_coords[i];
2821 #ifdef HAVE_ZOLTAN2_OMP
2822 #pragma omp single nowait
2825 max_coordinate = this->max_min_coords[this->num_threads];
2826 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2827 if(this->max_min_coords[i] > max_coordinate)
2828 max_coordinate = this->max_min_coords[i];
2832 total_weight = my_total_weight;
2844 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2846 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
2847 mj_part_t current_concurrent_num_parts,
2848 mj_scalar_t *local_min_max_total,
2849 mj_scalar_t *global_min_max_total){
2854 if(this->comm->getSize() > 1){
2857 current_concurrent_num_parts,
2858 current_concurrent_num_parts,
2859 current_concurrent_num_parts);
2861 reduceAll<int, mj_scalar_t>(
2864 3 * current_concurrent_num_parts,
2865 local_min_max_total,
2866 global_min_max_total);
2871 mj_part_t s = 3 * current_concurrent_num_parts;
2872 for (mj_part_t i = 0; i < s; ++i){
2873 global_min_max_total[i] = local_min_max_total[i];
2907 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2909 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
2910 mj_scalar_t min_coord,
2911 mj_scalar_t max_coord,
2912 mj_part_t num_cuts ,
2913 mj_scalar_t global_weight,
2914 mj_scalar_t *initial_cut_coords ,
2915 mj_scalar_t *current_target_part_weights ,
2917 std::vector <mj_part_t> *future_num_part_in_parts,
2918 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2919 mj_part_t concurrent_current_part,
2920 mj_part_t obtained_part_index,
2921 mj_part_t num_target_first_level_parts,
2922 const mj_part_t *target_first_level_dist) {
2924 mj_scalar_t coord_range = max_coord - min_coord;
2927 if (num_target_first_level_parts <= 1 &&
2928 this->mj_uniform_parts[0]) {
2930 mj_part_t cumulative = 0;
2933 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2936 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2938 for (mj_part_t i = 0; i < num_cuts; ++i) {
2939 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2942 current_target_part_weights[i] = cumulative * unit_part_weight;
2945 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2948 current_target_part_weights[num_cuts] = global_weight;
2952 if (this->mj_uniform_weights[0]) {
2953 for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2954 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2959 else if(num_target_first_level_parts > 1 &&
2960 target_first_level_dist != NULL) {
2963 mj_part_t cumulative = 0.0;
2966 mj_scalar_t sum_target_first_level_dist = 0.0;
2968 for (
int i = 0; i < num_target_first_level_parts; ++i) {
2969 sum_target_first_level_dist += target_first_level_dist[i];
2972 for (mj_part_t i = 0; i < num_cuts; ++i) {
2973 cumulative += global_weight * target_first_level_dist[i] / sum_target_first_level_dist;
2976 current_target_part_weights[i] = cumulative;
2979 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / global_weight;
2982 current_target_part_weights[num_cuts] = global_weight;
2986 for (mj_part_t i = 0; i < num_cuts + 1; ++i) {
2987 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2991 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3009 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3011 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
3012 mj_scalar_t &max_coordinate,
3013 mj_scalar_t &min_coordinate,
3015 mj_lno_t coordinate_begin_index,
3016 mj_lno_t coordinate_end_index,
3017 mj_lno_t *mj_current_coordinate_permutations,
3018 mj_scalar_t *mj_current_dim_coords,
3019 mj_part_t *mj_part_ids,
3020 mj_part_t &partition_count
3022 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3026 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
3027 #ifdef HAVE_ZOLTAN2_OMP
3028 #pragma omp parallel for
3030 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3031 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
3038 mj_scalar_t slice = coordinate_range / partition_count;
3040 #ifdef HAVE_ZOLTAN2_OMP
3041 #pragma omp parallel for
3043 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3045 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3046 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3047 mj_part_ids[iii] = 2 * pp;
3063 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3065 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
3066 mj_scalar_t *mj_current_dim_coords,
3067 double used_imbalance_tolerance,
3068 mj_part_t current_work_part,
3069 mj_part_t current_concurrent_num_parts,
3070 mj_scalar_t *current_cut_coordinates,
3071 mj_part_t total_incomplete_cut_count,
3072 std::vector <mj_part_t> &num_partitioning_in_current_dim
3076 mj_part_t rectilinear_cut_count = 0;
3077 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
3080 *reductionOp = NULL;
3082 <mj_part_t, mj_scalar_t>(
3083 &num_partitioning_in_current_dim ,
3085 current_concurrent_num_parts);
3087 size_t total_reduction_size = 0;
3088 #ifdef HAVE_ZOLTAN2_OMP
3089 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads)
3093 #ifdef HAVE_ZOLTAN2_OMP
3094 me = omp_get_thread_num();
3096 double *my_thread_part_weights = this->thread_part_weights[me];
3097 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
3098 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
3100 #ifdef HAVE_ZOLTAN2_OMP
3106 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3108 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
3109 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3110 total_reduction_size += (4 * num_cut_in_dim + 1);
3112 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
3113 this->is_cut_line_determined[next] =
false;
3114 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
3115 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
3117 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
3118 this->cut_lower_bound_weights[next] = 0;
3120 if(this->distribute_points_on_cut_lines){
3121 this->process_cut_line_weight_to_put_left[next] = 0;
3132 while (total_incomplete_cut_count != 0){
3134 mj_part_t concurrent_cut_shifts = 0;
3135 size_t total_part_shift = 0;
3137 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3138 mj_part_t num_parts = -1;
3139 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3141 mj_part_t num_cuts = num_parts - 1;
3142 size_t total_part_count = num_parts + size_t (num_cuts) ;
3143 if (this->my_incomplete_cut_count[kk] > 0){
3146 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
3147 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
3148 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
3149 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
3151 mj_part_t conccurent_current_part = current_work_part + kk;
3152 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
3153 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
3154 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
3156 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
3157 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3160 this->mj_1D_part_get_thread_part_weights(
3165 coordinate_begin_index,
3166 coordinate_end_index,
3167 mj_current_dim_coords,
3168 temp_current_cut_coords,
3170 my_current_part_weights,
3171 my_current_left_closest,
3172 my_current_right_closest);
3176 concurrent_cut_shifts += num_cuts;
3177 total_part_shift += total_part_count;
3181 this->mj_accumulate_thread_results(
3182 num_partitioning_in_current_dim,
3184 current_concurrent_num_parts);
3187 #ifdef HAVE_ZOLTAN2_OMP
3191 if(this->comm->getSize() > 1){
3192 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3193 total_reduction_size,
3194 this->total_part_weight_left_right_closests,
3195 this->global_total_part_weight_left_right_closests);
3200 this->global_total_part_weight_left_right_closests,
3201 this->total_part_weight_left_right_closests,
3202 total_reduction_size *
sizeof(mj_scalar_t));
3207 mj_part_t cut_shift = 0;
3210 size_t tlr_shift = 0;
3211 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
3212 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
3213 mj_part_t num_cuts = num_parts - 1;
3214 size_t num_total_part = num_parts + size_t (num_cuts) ;
3219 if (this->my_incomplete_cut_count[kk] == 0) {
3220 cut_shift += num_cuts;
3221 tlr_shift += (num_total_part + 2 * num_cuts);
3225 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
3226 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
3227 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
3228 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
3229 mj_scalar_t *current_global_part_weights = current_global_tlr;
3230 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
3232 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
3233 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
3235 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
3236 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
3237 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
3238 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
3239 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
3240 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
3241 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
3243 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
3246 this->mj_get_new_cut_coordinates(
3251 global_total_weight,
3252 used_imbalance_tolerance,
3253 current_global_part_weights,
3254 current_local_part_weights,
3255 current_part_target_weights,
3256 current_cut_line_determined,
3257 temp_cut_coords + cut_shift,
3258 current_cut_upper_bounds,
3259 current_cut_lower_bounds,
3260 current_global_left_closest_points,
3261 current_global_right_closest_points,
3262 current_cut_lower_bound_weights,
3263 current_cut_upper_weights,
3264 this->cut_coordinates_work_array +cut_shift,
3265 current_part_cut_line_weight_to_put_left,
3266 &rectilinear_cut_count,
3267 this->my_incomplete_cut_count[kk]);
3269 cut_shift += num_cuts;
3270 tlr_shift += (num_total_part + 2 * num_cuts);
3271 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3272 #ifdef HAVE_ZOLTAN2_OMP
3276 total_incomplete_cut_count -= iteration_complete_cut_count;
3281 #ifdef HAVE_ZOLTAN2_OMP
3287 mj_scalar_t *t = temp_cut_coords;
3288 temp_cut_coords = this->cut_coordinates_work_array;
3289 this->cut_coordinates_work_array = t;
3300 if (current_cut_coordinates != temp_cut_coords){
3301 #ifdef HAVE_ZOLTAN2_OMP
3306 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3307 mj_part_t num_parts = -1;
3308 num_parts = num_partitioning_in_current_dim[current_work_part + i];
3309 mj_part_t num_cuts = num_parts - 1;
3311 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3312 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3318 #ifdef HAVE_ZOLTAN2_OMP
3322 this->cut_coordinates_work_array = temp_cut_coords;
3349 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3351 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
3352 size_t total_part_count,
3354 mj_scalar_t max_coord,
3355 mj_scalar_t min_coord,
3356 mj_lno_t coordinate_begin_index,
3357 mj_lno_t coordinate_end_index,
3358 mj_scalar_t *mj_current_dim_coords,
3359 mj_scalar_t *temp_current_cut_coords,
3361 double *my_current_part_weights,
3362 mj_scalar_t *my_current_left_closest,
3363 mj_scalar_t *my_current_right_closest){
3366 for (
size_t i = 0; i < total_part_count; ++i){
3367 my_current_part_weights[i] = 0;
3372 for(mj_part_t i = 0; i < num_cuts; ++i){
3373 my_current_left_closest[i] = min_coord - 1;
3374 my_current_right_closest[i] = max_coord + 1;
3377 mj_scalar_t minus_EPSILON = -this->sEpsilon;
3378 #ifdef HAVE_ZOLTAN2_OMP
3384 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3385 int i = this->coordinate_permutations[ii];
3389 mj_part_t j = this->assigned_part_ids[i] / 2;
3395 mj_part_t lower_cut_index = 0;
3396 mj_part_t upper_cut_index = num_cuts - 1;
3398 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3399 bool is_inserted =
false;
3400 bool is_on_left_of_cut =
false;
3401 bool is_on_right_of_cut =
false;
3402 mj_part_t last_compared_part = -1;
3404 mj_scalar_t coord = mj_current_dim_coords[i];
3406 while(upper_cut_index >= lower_cut_index)
3409 last_compared_part = -1;
3410 is_on_left_of_cut =
false;
3411 is_on_right_of_cut =
false;
3412 mj_scalar_t cut = temp_current_cut_coords[j];
3413 mj_scalar_t distance_to_cut = coord - cut;
3414 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3417 if(abs_distance_to_cut < this->sEpsilon){
3419 my_current_part_weights[j * 2 + 1] += w;
3420 this->assigned_part_ids[i] = j * 2 + 1;
3423 my_current_left_closest[j] = coord;
3424 my_current_right_closest[j] = coord;
3427 mj_part_t kk = j + 1;
3428 while(kk < num_cuts){
3430 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3431 if(distance_to_cut < this->sEpsilon){
3432 my_current_part_weights[2 * kk + 1] += w;
3433 my_current_left_closest[kk] = coord;
3434 my_current_right_closest[kk] = coord;
3440 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3441 my_current_left_closest[kk] = coord;
3451 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3452 if(distance_to_cut < this->sEpsilon){
3453 my_current_part_weights[2 * kk + 1] += w;
3455 this->assigned_part_ids[i] = kk * 2 + 1;
3456 my_current_left_closest[kk] = coord;
3457 my_current_right_closest[kk] = coord;
3463 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3464 my_current_right_closest[kk] = coord;
3475 if (distance_to_cut < 0) {
3476 bool _break =
false;
3480 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3481 if(distance_to_next_cut > this->sEpsilon){
3487 upper_cut_index = j - 1;
3489 is_on_left_of_cut =
true;
3490 last_compared_part = j;
3495 bool _break =
false;
3496 if(j < num_cuts - 1){
3499 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3500 if(distance_to_next_cut < minus_EPSILON){
3507 lower_cut_index = j + 1;
3509 is_on_right_of_cut =
true;
3510 last_compared_part = j;
3515 j = (upper_cut_index + lower_cut_index) / 2;
3518 if(is_on_right_of_cut){
3521 my_current_part_weights[2 * last_compared_part + 2] += w;
3522 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3525 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3526 my_current_right_closest[last_compared_part] = coord;
3529 if(last_compared_part+1 < num_cuts){
3531 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3532 my_current_left_closest[last_compared_part + 1] = coord;
3537 else if(is_on_left_of_cut){
3540 my_current_part_weights[2 * last_compared_part] += w;
3541 this->assigned_part_ids[i] = 2 * last_compared_part;
3545 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3546 my_current_left_closest[last_compared_part] = coord;
3550 if(last_compared_part-1 >= 0){
3551 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3552 my_current_right_closest[last_compared_part -1] = coord;
3561 for (
size_t i = 1; i < total_part_count; ++i){
3565 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3566 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3571 my_current_part_weights[i] = my_current_part_weights[i-2];
3575 my_current_part_weights[i] += my_current_part_weights[i-1];
3587 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3589 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
3590 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3591 mj_part_t current_work_part,
3592 mj_part_t current_concurrent_num_parts){
3594 #ifdef HAVE_ZOLTAN2_OMP
3601 size_t tlr_array_shift = 0;
3602 mj_part_t cut_shift = 0;
3605 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3607 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3608 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3609 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3612 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3613 mj_part_t next = tlr_array_shift + ii;
3614 mj_part_t cut_index = cut_shift + ii;
3615 if(this->is_cut_line_determined[cut_index])
continue;
3616 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3617 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3620 for (
int j = 1; j < this->num_threads; ++j){
3621 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3622 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3624 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3625 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3629 this->total_part_weight_left_right_closests[num_total_part_in_part +
3630 next] = left_closest_in_process;
3631 this->total_part_weight_left_right_closests[num_total_part_in_part +
3632 num_cuts_in_part + next] = right_closest_in_process;
3635 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3636 cut_shift += num_cuts_in_part;
3639 tlr_array_shift = 0;
3641 size_t total_part_array_shift = 0;
3644 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3646 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3647 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3648 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3650 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3652 mj_part_t cut_ind = j / 2 + cut_shift;
3657 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3659 for (
int k = 0; k < this->num_threads; ++k){
3660 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3663 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3665 cut_shift += num_cuts_in_part;
3666 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3667 total_part_array_shift += num_total_part_in_part;
3685 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3687 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
3688 mj_scalar_t cut_upper_bound,
3689 mj_scalar_t cut_lower_bound,
3690 mj_scalar_t cut_upper_weight,
3691 mj_scalar_t cut_lower_weight,
3692 mj_scalar_t expected_weight,
3693 mj_scalar_t &new_cut_position){
3695 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3696 new_cut_position = cut_upper_bound;
3700 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3701 new_cut_position = cut_lower_bound;
3704 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3705 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3706 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3708 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3709 int scale_constant = 20;
3710 int shiftint= int (required_shift * scale_constant);
3711 if (shiftint == 0) shiftint = 1;
3712 required_shift = mj_scalar_t (shiftint) / scale_constant;
3713 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3727 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3729 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
3730 mj_part_t num_parts,
3732 mj_scalar_t *current_concurrent_cut_coordinate,
3733 mj_lno_t coordinate_begin,
3734 mj_lno_t coordinate_end,
3735 mj_scalar_t *used_local_cut_line_weight_to_left,
3736 double **used_thread_part_weight_work,
3737 mj_lno_t *out_part_xadj){
3739 mj_part_t num_cuts = num_parts - 1;
3741 #ifdef HAVE_ZOLTAN2_OMP
3742 #pragma omp parallel
3746 #ifdef HAVE_ZOLTAN2_OMP
3747 me = omp_get_thread_num();
3750 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3751 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3755 if (this->distribute_points_on_cut_lines){
3756 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3758 #ifdef HAVE_ZOLTAN2_OMP
3761 for (mj_part_t i = 0; i < num_cuts; ++i){
3763 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3764 for(
int ii = 0; ii < this->num_threads; ++ii){
3765 if(left_weight > this->sEpsilon){
3767 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3768 if(thread_ii_weight_on_cut < left_weight){
3770 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3774 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3776 left_weight -= thread_ii_weight_on_cut;
3779 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3787 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3788 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3789 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3797 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3798 thread_num_points_in_parts[ii] = 0;
3802 #ifdef HAVE_ZOLTAN2_OMP
3806 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3808 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3809 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3810 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3811 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3812 if(coordinate_assigned_place % 2 == 1){
3814 if(this->distribute_points_on_cut_lines
3815 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3819 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3824 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3825 && coordinate_assigned_part < num_cuts - 1
3826 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3827 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3828 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3830 ++thread_num_points_in_parts[coordinate_assigned_part];
3831 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3835 ++coordinate_assigned_part;
3837 while(this->distribute_points_on_cut_lines &&
3838 coordinate_assigned_part < num_cuts){
3840 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3841 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3844 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3846 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3847 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3848 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3851 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3852 coordinate_assigned_part < num_cuts - 1 &&
3853 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3854 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3855 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3863 ++coordinate_assigned_part;
3865 ++thread_num_points_in_parts[coordinate_assigned_part];
3866 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3871 ++thread_num_points_in_parts[coordinate_assigned_part];
3872 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3881 #ifdef HAVE_ZOLTAN2_OMP
3884 for(mj_part_t j = 0; j < num_parts; ++j){
3885 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3886 for (
int i = 0; i < this->num_threads; ++i){
3887 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3889 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3890 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3893 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3897 #ifdef HAVE_ZOLTAN2_OMP
3902 for(mj_part_t j = 1; j < num_parts; ++j){
3903 out_part_xadj[j] += out_part_xadj[j - 1];
3909 for(mj_part_t j = 1; j < num_parts; ++j){
3910 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3916 #ifdef HAVE_ZOLTAN2_OMP
3919 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3920 mj_lno_t i = this->coordinate_permutations[ii];
3921 mj_part_t p = this->assigned_part_ids[i];
3922 this->new_coordinate_permutations[coordinate_begin +
3923 thread_num_points_in_parts[p]++] = i;
3958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3960 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
3962 const mj_part_t &num_cuts,
3963 const mj_scalar_t &max_coordinate,
3964 const mj_scalar_t &min_coordinate,
3965 const mj_scalar_t &global_total_weight,
3966 const double &used_imbalance_tolerance,
3967 mj_scalar_t * current_global_part_weights,
3968 const mj_scalar_t * current_local_part_weights,
3969 const mj_scalar_t *current_part_target_weights,
3970 bool *current_cut_line_determined,
3971 mj_scalar_t *current_cut_coordinates,
3972 mj_scalar_t *current_cut_upper_bounds,
3973 mj_scalar_t *current_cut_lower_bounds,
3974 mj_scalar_t *current_global_left_closest_points,
3975 mj_scalar_t *current_global_right_closest_points,
3976 mj_scalar_t * current_cut_lower_bound_weights,
3977 mj_scalar_t * current_cut_upper_weights,
3978 mj_scalar_t *new_current_cut_coordinates,
3979 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3980 mj_part_t *rectilinear_cut_count,
3981 mj_part_t &my_num_incomplete_cut){
3984 mj_scalar_t seen_weight_in_part = 0;
3986 mj_scalar_t expected_weight_in_part = 0;
3988 double imbalance_on_left = 0, imbalance_on_right = 0;
3991 #ifdef HAVE_ZOLTAN2_OMP
3994 for (mj_part_t i = 0; i < num_cuts; i++){
3997 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3998 current_global_left_closest_points[i] = current_cut_coordinates[i];
3999 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
4000 current_global_right_closest_points[i] = current_cut_coordinates[i];
4003 #ifdef HAVE_ZOLTAN2_OMP
4006 for (mj_part_t i = 0; i < num_cuts; i++){
4008 if(this->distribute_points_on_cut_lines){
4010 this->global_rectilinear_cut_weight[i] = 0;
4011 this->process_rectilinear_cut_weight[i] = 0;
4015 if(current_cut_line_determined[i]) {
4016 new_current_cut_coordinates[i] = current_cut_coordinates[i];
4021 seen_weight_in_part = current_global_part_weights[i * 2];
4030 expected_weight_in_part = current_part_target_weights[i];
4032 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
4034 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
4036 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
4037 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
4040 if(is_left_imbalance_valid && is_right_imbalance_valid){
4041 current_cut_line_determined[i] =
true;
4042 #ifdef HAVE_ZOLTAN2_OMP
4045 my_num_incomplete_cut -= 1;
4046 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4049 else if(imbalance_on_left < 0){
4052 if(this->distribute_points_on_cut_lines){
4057 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
4059 current_cut_line_determined[i] =
true;
4060 #ifdef HAVE_ZOLTAN2_OMP
4063 my_num_incomplete_cut -= 1;
4066 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4070 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
4073 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
4077 current_cut_line_determined[i] =
true;
4078 #ifdef HAVE_ZOLTAN2_OMP
4081 *rectilinear_cut_count += 1;
4084 #ifdef HAVE_ZOLTAN2_OMP
4087 my_num_incomplete_cut -= 1;
4088 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4089 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
4090 current_local_part_weights[i * 2];
4095 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
4097 current_cut_lower_bound_weights[i] = seen_weight_in_part;
4101 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
4102 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4103 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4105 if(p_weight >= expected_weight_in_part){
4110 if(p_weight == expected_weight_in_part){
4111 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4112 current_cut_upper_weights[i] = p_weight;
4113 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4114 current_cut_lower_bound_weights[i] = p_weight;
4115 }
else if (p_weight < current_cut_upper_weights[i]){
4118 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
4119 current_cut_upper_weights[i] = p_weight;
4125 if(line_weight >= expected_weight_in_part){
4128 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4129 current_cut_upper_weights[i] = line_weight;
4130 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4131 current_cut_lower_bound_weights[i] = p_weight;
4136 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
4137 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
4138 current_cut_lower_bound_weights[i] = p_weight;
4143 mj_scalar_t new_cut_position = 0;
4144 this->mj_calculate_new_cut_position(
4145 current_cut_upper_bounds[i],
4146 current_cut_lower_bounds[i],
4147 current_cut_upper_weights[i],
4148 current_cut_lower_bound_weights[i],
4149 expected_weight_in_part, new_cut_position);
4153 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4156 current_cut_line_determined[i] =
true;
4157 #ifdef HAVE_ZOLTAN2_OMP
4160 my_num_incomplete_cut -= 1;
4163 new_current_cut_coordinates [i] = current_cut_coordinates[i];
4165 new_current_cut_coordinates [i] = new_cut_position;
4171 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
4172 current_cut_upper_weights[i] = seen_weight_in_part;
4175 for (
int ii = i - 1; ii >= 0; --ii){
4176 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
4177 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
4178 if(p_weight <= expected_weight_in_part){
4179 if(p_weight == expected_weight_in_part){
4182 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
4183 current_cut_upper_weights[i] = p_weight;
4184 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
4185 current_cut_lower_bound_weights[i] = p_weight;
4187 else if (p_weight > current_cut_lower_bound_weights[i]){
4190 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
4191 current_cut_lower_bound_weights[i] = p_weight;
4197 if(line_weight > expected_weight_in_part){
4198 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
4199 current_cut_upper_weights[i] = line_weight;
4208 if (p_weight >= expected_weight_in_part &&
4209 (p_weight < current_cut_upper_weights[i] ||
4210 (p_weight == current_cut_upper_weights[i] &&
4211 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
4215 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
4216 current_cut_upper_weights[i] = p_weight;
4219 mj_scalar_t new_cut_position = 0;
4220 this->mj_calculate_new_cut_position(
4221 current_cut_upper_bounds[i],
4222 current_cut_lower_bounds[i],
4223 current_cut_upper_weights[i],
4224 current_cut_lower_bound_weights[i],
4225 expected_weight_in_part,
4229 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
4231 current_cut_line_determined[i] =
true;
4232 #ifdef HAVE_ZOLTAN2_OMP
4235 my_num_incomplete_cut -= 1;
4237 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
4239 new_current_cut_coordinates [ i] = new_cut_position;
4248 #ifdef HAVE_ZOLTAN2_OMP
4253 if(*rectilinear_cut_count > 0){
4256 Teuchos::scan<int,mj_scalar_t>(
4257 *comm, Teuchos::REDUCE_SUM,
4259 this->process_rectilinear_cut_weight,
4260 this->global_rectilinear_cut_weight
4265 for (mj_part_t i = 0; i < num_cuts; ++i){
4267 if(this->global_rectilinear_cut_weight[i] > 0) {
4269 mj_scalar_t expected_part_weight = current_part_target_weights[i];
4271 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4273 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4275 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4278 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4280 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4290 if(space_left_to_me < 0){
4292 current_part_cut_line_weight_to_put_left[i] = 0;
4294 else if(space_left_to_me >= my_weight_on_line){
4297 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4302 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4309 *rectilinear_cut_count = 0;
4324 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4326 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
4327 mj_part_t num_procs,
4328 mj_part_t num_parts,
4329 mj_gno_t *&num_points_in_all_processor_parts){
4332 size_t allocation_size = num_parts * (num_procs + 1);
4339 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4344 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4347 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4350 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
4353 for (mj_part_t i = 0; i < num_parts; ++i){
4354 mj_lno_t part_begin_index = 0;
4356 part_begin_index = this->new_part_xadj[i - 1];
4358 mj_lno_t part_end_index = this->new_part_xadj[i];
4359 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4364 memcpy (my_local_point_counts_in_each_art,
4365 my_local_points_to_reduce_sum,
4366 sizeof(mj_gno_t) * (num_parts) );
4374 reduceAll<int, mj_gno_t>(
4376 Teuchos::REDUCE_SUM,
4378 num_local_points_in_each_part_to_reduce_sum,
4379 num_points_in_all_processor_parts);
4382 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4399 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4401 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
4402 size_t migration_reduce_all_population,
4403 mj_lno_t num_coords_for_last_dim_part,
4404 mj_part_t num_procs,
4405 mj_part_t num_parts,
4406 mj_gno_t *num_points_in_all_processor_parts){
4414 if (this->check_migrate_avoid_migration_option == 0){
4415 double global_imbalance = 0;
4417 size_t global_shift = num_procs * num_parts;
4419 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4420 for (mj_part_t i = 0; i < num_parts; ++i){
4421 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4422 / double(num_procs);
4425 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4428 global_imbalance /= num_parts;
4429 global_imbalance /= num_procs;
4437 if(global_imbalance <= this->minimum_migration_imbalance){
4460 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4462 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
4463 mj_part_t num_parts,
4464 mj_part_t *part_assignment_proc_begin_indices,
4465 mj_part_t *processor_chains_in_parts,
4466 mj_lno_t *send_count_to_each_proc,
4467 int *coordinate_destinations){
4469 for (mj_part_t p = 0; p < num_parts; ++p){
4470 mj_lno_t part_begin = 0;
4471 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4472 mj_lno_t part_end = this->new_part_xadj[p];
4475 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4477 mj_lno_t num_total_send = 0;
4478 for (mj_lno_t j=part_begin; j < part_end; j++){
4479 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4480 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4484 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4486 processor_chains_in_parts[proc_to_sent] = -1;
4488 proc_to_sent = part_assignment_proc_begin_indices[p];
4491 coordinate_destinations[local_ind] = proc_to_sent;
4511 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4513 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
4514 mj_gno_t * num_points_in_all_processor_parts,
4515 mj_part_t num_parts,
4516 mj_part_t num_procs,
4517 mj_lno_t *send_count_to_each_proc,
4518 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4519 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4520 mj_part_t &out_part_index,
4521 mj_part_t &output_part_numbering_begin_index,
4522 int *coordinate_destinations){
4525 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4526 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4529 bool did_i_find_my_group =
false;
4531 mj_part_t num_free_procs = num_procs;
4532 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4534 double max_imbalance_difference = 0;
4535 mj_part_t max_differing_part = 0;
4538 for (mj_part_t i=0; i < num_parts; i++){
4541 double scalar_required_proc = num_procs *
4542 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4545 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4546 if (required_proc == 0) required_proc = 1;
4550 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4551 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4555 num_free_procs -= required_proc;
4557 --minimum_num_procs_required_for_rest_of_parts;
4560 num_procs_assigned_to_each_part[i] = required_proc;
4565 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4566 if (imbalance_wrt_ideal > max_imbalance_difference){
4567 max_imbalance_difference = imbalance_wrt_ideal;
4568 max_differing_part = i;
4573 if (num_free_procs > 0){
4574 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4581 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4583 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4584 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4593 for (
int i = 0; i < num_procs; ++i ){
4594 processor_part_assignments[i] = -1;
4595 processor_chains_in_parts[i] = -1;
4597 for (
int i = 0; i < num_parts; ++i ){
4598 part_assignment_proc_begin_indices[i] = -1;
4604 uSignedSortItem<mj_part_t, mj_gno_t, char> * sort_item_num_part_points_in_procs = allocMemory <uSignedSortItem<mj_part_t, mj_gno_t, char> > (num_procs);
4605 for(mj_part_t i = 0; i < num_parts; ++i){
4609 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4610 sort_item_num_part_points_in_procs[ii].id = ii;
4613 if (processor_part_assignments[ii] == -1){
4614 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4615 sort_item_num_part_points_in_procs[ii].signbit = 1;
4625 sort_item_num_part_points_in_procs[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4626 sort_item_num_part_points_in_procs[ii].signbit = 0;
4630 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4640 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4641 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4642 mj_gno_t ideal_num_points_in_a_proc =
4643 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4646 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4647 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4648 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4652 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4653 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
4655 processor_part_assignments[proc_id] = i;
4658 bool did_change_sign =
false;
4660 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4663 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4664 did_change_sign =
true;
4665 sort_item_num_part_points_in_procs[ii].signbit = 1;
4671 if(did_change_sign){
4673 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4685 if (!did_i_find_my_group){
4686 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4688 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
4690 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4692 if(proc_id_to_assign == this->myRank){
4694 did_i_find_my_group =
true;
4696 part_assignment_proc_begin_indices[i] = this->myRank;
4697 processor_chains_in_parts[this->myRank] = -1;
4700 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
4703 for (mj_part_t in = 0; in < i; ++in){
4704 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4711 if (!did_i_find_my_group){
4712 processor_ranks_for_subcomm.clear();
4720 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4721 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
4722 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
4727 if (num_points_to_sent < 0) {
4728 std::cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4733 switch (migration_type){
4737 while (num_points_to_sent > 0){
4739 if (num_points_to_sent <= space_left_in_sent_proc){
4741 space_left_in_sent_proc -= num_points_to_sent;
4743 if (this->myRank == nonassigned_proc_id){
4745 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4748 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4749 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4750 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4752 num_points_to_sent = 0;
4756 if(space_left_in_sent_proc > 0){
4757 num_points_to_sent -= space_left_in_sent_proc;
4760 if (this->myRank == nonassigned_proc_id){
4762 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4763 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4764 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4765 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4770 ++next_proc_to_send_index;
4773 if(next_part_to_send_index < nprocs - required_proc_count ){
4774 std::cout <<
"Migration - processor assignments - for part:"
4776 <<
" next_part_to_send :" << next_part_to_send_index
4777 <<
" nprocs:" << nprocs
4778 <<
" required_proc_count:" << required_proc_count
4779 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4785 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4787 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4796 if (this->myRank == nonassigned_proc_id){
4798 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4801 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4802 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4803 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4805 num_points_to_sent = 0;
4806 ++next_proc_to_send_index;
4809 if (next_proc_to_send_index == num_procs){
4810 next_proc_to_send_index = num_procs - required_proc_count;
4813 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
4815 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
4828 this->assign_send_destinations(
4830 part_assignment_proc_begin_indices,
4831 processor_chains_in_parts,
4832 send_count_to_each_proc,
4833 coordinate_destinations);
4835 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4836 freeArray<mj_part_t>(processor_chains_in_parts);
4837 freeArray<mj_part_t>(processor_part_assignments);
4838 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4839 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4856 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
4859 mj_part_t num_parts,
4860 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
4861 int *coordinate_destinations,
4862 mj_part_t &output_part_numbering_begin_index,
4863 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4865 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4866 mj_part_t previous_processor = -1;
4867 for(mj_part_t i = 0; i < num_parts; ++i){
4868 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
4870 mj_lno_t part_begin_index = 0;
4871 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4872 mj_lno_t part_end_index = this->new_part_xadj[p];
4874 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
4875 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4876 output_part_numbering_begin_index = part_shift_amount;
4878 previous_processor = assigned_proc;
4879 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4881 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4882 mj_lno_t localInd = this->new_coordinate_permutations[j];
4883 coordinate_destinations[localInd] = assigned_proc;
4905 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4907 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
4908 mj_gno_t * num_points_in_all_processor_parts,
4909 mj_part_t num_parts,
4910 mj_part_t num_procs,
4911 mj_lno_t *send_count_to_each_proc,
4912 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4913 mj_part_t &out_num_part,
4914 std::vector<mj_part_t> &out_part_indices,
4915 mj_part_t &output_part_numbering_begin_index,
4916 int *coordinate_destinations){
4919 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4920 out_part_indices.clear();
4924 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
4925 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);
4929 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4931 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4933 for (mj_part_t i = 0; i < num_procs; ++i){
4934 space_in_each_processor[i] = work_each;
4941 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4942 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4943 int empty_proc_count = num_procs;
4947 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);
4951 for (mj_part_t i = 0; i < num_parts; ++i){
4952 sort_item_point_counts_in_parts[i].id = i;
4953 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
4956 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4962 for (mj_part_t j = 0; j < num_parts; ++j){
4964 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
4966 mj_gno_t load = global_num_points_in_parts[i];
4969 mj_part_t assigned_proc = -1;
4971 mj_part_t best_proc_to_assign = 0;
4975 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4976 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4981 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4983 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4986 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4989 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4992 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4993 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4994 mj_lno_t left_space = space_in_each_processor[ii] - load;
4996 if(left_space >= 0 ){
5001 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
5002 best_proc_to_assign = ii;
5007 if (assigned_proc == -1){
5008 assigned_proc = best_proc_to_assign;
5011 if (num_parts_proc_assigned[assigned_proc]++ == 0){
5014 space_in_each_processor[assigned_proc] -= load;
5016 sort_item_part_to_proc_assignment[j].id = i;
5017 sort_item_part_to_proc_assignment[j].val = assigned_proc;
5021 if (assigned_proc == this->myRank){
5023 out_part_indices.push_back(i);
5027 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
5029 freeArray<mj_part_t>(num_parts_proc_assigned);
5030 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
5031 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
5032 freeArray<mj_lno_t >(space_in_each_processor);
5036 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
5040 this->assign_send_destinations2(
5042 sort_item_part_to_proc_assignment,
5043 coordinate_destinations,
5044 output_part_numbering_begin_index,
5045 next_future_num_parts_in_parts);
5047 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
5068 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5070 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
5071 mj_gno_t * num_points_in_all_processor_parts,
5072 mj_part_t num_parts,
5073 mj_part_t num_procs,
5074 mj_lno_t *send_count_to_each_proc,
5075 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5076 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5077 mj_part_t &out_num_part,
5078 std::vector<mj_part_t> &out_part_indices,
5079 mj_part_t &output_part_numbering_begin_index,
5080 int *coordinate_destinations){
5084 processor_ranks_for_subcomm.clear();
5086 if (num_procs > num_parts){
5091 mj_part_t out_part_index = 0;
5092 this->mj_assign_proc_to_parts(
5093 num_points_in_all_processor_parts,
5096 send_count_to_each_proc,
5097 processor_ranks_for_subcomm,
5098 next_future_num_parts_in_parts,
5100 output_part_numbering_begin_index,
5101 coordinate_destinations
5105 out_part_indices.clear();
5106 out_part_indices.push_back(out_part_index);
5113 processor_ranks_for_subcomm.push_back(this->myRank);
5117 this->mj_assign_parts_to_procs(
5118 num_points_in_all_processor_parts,
5121 send_count_to_each_proc,
5122 next_future_num_parts_in_parts,
5125 output_part_numbering_begin_index,
5126 coordinate_destinations);
5142 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5144 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
5145 mj_part_t num_procs,
5146 mj_lno_t &num_new_local_points,
5147 std::string iteration,
5148 int *coordinate_destinations,
5149 mj_part_t num_parts)
5151 #ifdef ENABLE_ZOLTAN_MIGRATION
5152 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5158 ZOLTAN_COMM_OBJ *plan = NULL;
5159 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
5160 int num_incoming_gnos = 0;
5161 int message_tag = 7859;
5163 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
5164 int ierr = Zoltan_Comm_Create(
5166 int(this->num_local_coords),
5167 coordinate_destinations,
5170 &num_incoming_gnos);
5172 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
5174 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5175 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
5179 ierr = Zoltan_Comm_Do(
5182 (
char *) this->current_mj_gnos,
5184 (
char *) incoming_gnos);
5187 freeArray<mj_gno_t>(this->current_mj_gnos);
5188 this->current_mj_gnos = incoming_gnos;
5192 for (
int i = 0; i < this->coord_dim; ++i){
5194 mj_scalar_t *coord = this->mj_coordinates[i];
5196 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5197 ierr = Zoltan_Comm_Do(
5201 sizeof(mj_scalar_t),
5202 (
char *) this->mj_coordinates[i]);
5204 freeArray<mj_scalar_t>(coord);
5208 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5210 mj_scalar_t *weight = this->mj_weights[i];
5212 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5213 ierr = Zoltan_Comm_Do(
5217 sizeof(mj_scalar_t),
5218 (
char *) this->mj_weights[i]);
5220 freeArray<mj_scalar_t>(weight);
5225 int *coord_own = allocMemory<int>(num_incoming_gnos);
5227 ierr = Zoltan_Comm_Do(
5230 (
char *) this->owner_of_coordinate,
5231 sizeof(
int), (
char *) coord_own);
5233 freeArray<int>(this->owner_of_coordinate);
5234 this->owner_of_coordinate = coord_own;
5240 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
5241 if(num_procs < num_parts){
5243 ierr = Zoltan_Comm_Do(
5246 (
char *) this->assigned_part_ids,
5248 (
char *) new_parts);
5251 freeArray<mj_part_t>(this->assigned_part_ids);
5252 this->assigned_part_ids = new_parts;
5254 ierr = Zoltan_Comm_Destroy(&plan);
5256 num_new_local_points = num_incoming_gnos;
5257 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5264 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5265 Tpetra::Distributor distributor(this->comm);
5266 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5267 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5268 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5270 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5273 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5274 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5275 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5276 freeArray<mj_gno_t>(this->current_mj_gnos);
5277 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5279 this->current_mj_gnos,
5280 received_gnos.getRawPtr(),
5281 num_incoming_gnos *
sizeof(mj_gno_t));
5284 for (
int i = 0; i < this->coord_dim; ++i){
5286 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5287 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5288 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5289 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5290 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5292 this->mj_coordinates[i],
5293 received_coord.getRawPtr(),
5294 num_incoming_gnos *
sizeof(mj_scalar_t));
5298 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5300 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5301 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5302 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5303 freeArray<mj_scalar_t>(this->mj_weights[i]);
5304 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5306 this->mj_weights[i],
5307 received_weight.getRawPtr(),
5308 num_incoming_gnos *
sizeof(mj_scalar_t));
5313 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5314 ArrayRCP<int> received_owners(num_incoming_gnos);
5315 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
5316 freeArray<int>(this->owner_of_coordinate);
5317 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5319 this->owner_of_coordinate,
5320 received_owners.getRawPtr(),
5321 num_incoming_gnos *
sizeof(
int));
5327 if(num_procs < num_parts){
5328 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5329 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5330 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5331 freeArray<mj_part_t>(this->assigned_part_ids);
5332 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5334 this->assigned_part_ids,
5335 received_partids.getRawPtr(),
5336 num_incoming_gnos *
sizeof(mj_part_t));
5339 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5340 freeArray<mj_part_t>(this->assigned_part_ids);
5341 this->assigned_part_ids = new_parts;
5343 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5344 num_new_local_points = num_incoming_gnos;
5355 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5357 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
5358 mj_part_t group_size = processor_ranks_for_subcomm.size();
5359 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5360 for(mj_part_t i = 0; i < group_size; ++i) {
5361 ids[i] = processor_ranks_for_subcomm[i];
5363 ArrayView<const mj_part_t> idView(ids, group_size);
5364 this->comm = this->comm->createSubcommunicator(idView);
5365 freeArray<mj_part_t>(ids);
5374 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5376 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
5377 mj_part_t output_num_parts,
5378 mj_part_t num_parts){
5380 if (output_num_parts == 1){
5381 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5382 this->new_coordinate_permutations[i] = i;
5384 this->new_part_xadj[0] = this->num_local_coords;
5391 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5393 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5395 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
5397 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5398 mj_part_t ii = this->assigned_part_ids[i];
5399 ++num_points_in_parts[ii];
5404 mj_lno_t prev_index = 0;
5405 for(mj_part_t i = 0; i < num_parts; ++i){
5406 if(num_points_in_parts[i] > 0) {
5407 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5408 prev_index += num_points_in_parts[i];
5409 part_shifts[i] = p++;
5414 mj_part_t assigned_num_parts = p - 1;
5415 for (;p < num_parts; ++p){
5416 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5418 for(mj_part_t i = 0; i < output_num_parts; ++i){
5419 num_points_in_parts[i] = this->new_part_xadj[i];
5425 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5426 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5427 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5430 freeArray<mj_lno_t>(num_points_in_parts);
5431 freeArray<mj_part_t>(part_shifts);
5458 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5460 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
5461 mj_part_t input_num_parts,
5462 mj_part_t &output_num_parts,
5463 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5464 mj_part_t &output_part_begin_index,
5465 size_t migration_reduce_all_population,
5466 mj_lno_t num_coords_for_last_dim_part,
5467 std::string iteration,
5468 RCP<mj_partBoxVector_t> &input_part_boxes,
5469 RCP<mj_partBoxVector_t> &output_part_boxes
5472 mj_part_t num_procs = this->comm->getSize();
5473 this->myRank = this->comm->getRank();
5479 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5482 this->get_processor_num_points_in_parts(
5485 num_points_in_all_processor_parts);
5489 if (!this->mj_check_to_migrate(
5490 migration_reduce_all_population,
5491 num_coords_for_last_dim_part,
5494 num_points_in_all_processor_parts)){
5495 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5500 mj_lno_t *send_count_to_each_proc = NULL;
5501 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5502 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5503 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5505 std::vector<mj_part_t> processor_ranks_for_subcomm;
5506 std::vector<mj_part_t> out_part_indices;
5509 this->mj_migration_part_proc_assignment(
5510 num_points_in_all_processor_parts,
5513 send_count_to_each_proc,
5514 processor_ranks_for_subcomm,
5515 next_future_num_parts_in_parts,
5518 output_part_begin_index,
5519 coordinate_destinations);
5524 freeArray<mj_lno_t>(send_count_to_each_proc);
5525 std::vector <mj_part_t> tmpv;
5527 std::sort (out_part_indices.begin(), out_part_indices.end());
5528 mj_part_t outP = out_part_indices.size();
5530 mj_gno_t new_global_num_points = 0;
5531 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5533 if (this->mj_keep_part_boxes){
5534 input_part_boxes->clear();
5539 for (mj_part_t i = 0; i < outP; ++i){
5540 mj_part_t ind = out_part_indices[i];
5541 new_global_num_points += global_num_points_in_parts[ind];
5542 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5543 if (this->mj_keep_part_boxes){
5544 input_part_boxes->push_back((*output_part_boxes)[ind]);
5548 if (this->mj_keep_part_boxes){
5549 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5550 input_part_boxes = output_part_boxes;
5551 output_part_boxes = tmpPartBoxes;
5553 next_future_num_parts_in_parts->clear();
5554 for (mj_part_t i = 0; i < outP; ++i){
5555 mj_part_t p = tmpv[i];
5556 next_future_num_parts_in_parts->push_back(p);
5559 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5561 mj_lno_t num_new_local_points = 0;
5565 this->mj_migrate_coords(
5567 num_new_local_points,
5569 coordinate_destinations,
5573 freeArray<int>(coordinate_destinations);
5575 if(this->num_local_coords != num_new_local_points){
5576 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5577 freeArray<mj_lno_t>(this->coordinate_permutations);
5579 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5580 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5582 this->num_local_coords = num_new_local_points;
5583 this->num_global_coords = new_global_num_points;
5588 this->create_sub_communicator(processor_ranks_for_subcomm);
5589 processor_ranks_for_subcomm.clear();
5592 this->fill_permutation_array(
5612 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5614 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
5615 mj_part_t num_parts,
5616 mj_scalar_t *mj_current_dim_coords,
5617 mj_scalar_t *current_concurrent_cut_coordinate,
5618 mj_lno_t coordinate_begin,
5619 mj_lno_t coordinate_end,
5620 mj_scalar_t *used_local_cut_line_weight_to_left,
5621 mj_lno_t *out_part_xadj,
5622 int coordInd,
bool longest_dim_part, uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted){
5625 mj_part_t no_cuts = num_parts - 1;
5630 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5631 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5636 if (this->distribute_points_on_cut_lines){
5638 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5639 for (mj_part_t i = 0; i < no_cuts; ++i){
5641 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5643 for(
int ii = 0; ii < this->num_threads; ++ii){
5644 if(left_weight > this->sEpsilon){
5646 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5647 if(thread_ii_weight_on_cut < left_weight){
5648 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5651 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5653 left_weight -= thread_ii_weight_on_cut;
5656 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5664 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5665 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5666 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5674 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5675 thread_num_points_in_parts[ii] = 0;
5687 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5690 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
5691 typedef std::vector< multiSItem > multiSVector;
5692 typedef std::vector<multiSVector> multiS2Vector;
5695 std::vector<mj_scalar_t *>allocated_memory;
5698 multiS2Vector sort_vector_points_on_cut;
5701 mj_part_t different_cut_count = 1;
5706 multiSVector tmpMultiSVector;
5707 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5709 for (mj_part_t i = 1; i < no_cuts ; ++i){
5712 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5713 cut_map[i] = cut_map[i-1];
5716 cut_map[i] = different_cut_count++;
5717 multiSVector tmp2MultiSVector;
5718 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5724 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5726 mj_lno_t i = this->coordinate_permutations[ii];
5728 mj_part_t pp = this->assigned_part_ids[i];
5729 mj_part_t p = pp / 2;
5732 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5733 allocated_memory.push_back(
vals);
5738 if (longest_dim_part){
5740 for(
int dim = this->coord_dim - 2; dim >= 0; --dim){
5742 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
5744 vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5748 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5749 vals[val_ind++] = this->mj_coordinates[dim][i];
5751 for(
int dim = 0; dim < coordInd; ++dim){
5752 vals[val_ind++] = this->mj_coordinates[dim][i];
5755 multiSItem tempSortItem(i, this->coord_dim -1,
vals);
5757 mj_part_t cmap = cut_map[p];
5758 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5762 ++thread_num_points_in_parts[p];
5763 this->assigned_part_ids[i] = p;
5768 for (mj_part_t i = 0; i < different_cut_count; ++i){
5769 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5773 mj_part_t previous_cut_map = cut_map[0];
5782 mj_scalar_t weight_stolen_from_previous_part = 0;
5783 for (mj_part_t p = 0; p < no_cuts; ++p){
5785 mj_part_t mapped_cut = cut_map[p];
5789 if (previous_cut_map != mapped_cut){
5790 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5791 for (; sort_vector_end >= 0; --sort_vector_end){
5792 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5793 mj_lno_t i = t.index;
5794 ++thread_num_points_in_parts[p];
5795 this->assigned_part_ids[i] = p;
5797 sort_vector_points_on_cut[previous_cut_map].clear();
5801 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5806 for (; sort_vector_end >= 0; --sort_vector_end){
5809 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5811 mj_lno_t i = t.index;
5812 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5816 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5817 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5820 my_local_thread_cut_weights_to_put_left[p] -= w;
5821 sort_vector_points_on_cut[mapped_cut].pop_back();
5822 ++thread_num_points_in_parts[p];
5823 this->assigned_part_ids[i] = p;
5826 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5827 if(mapped_cut == cut_map[p + 1] ){
5830 if (previous_cut_map != mapped_cut){
5831 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5836 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5840 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5849 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5850 if (previous_cut_map != mapped_cut){
5851 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5854 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5858 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5864 previous_cut_map = mapped_cut;
5869 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5874 for (; sort_vector_end >= 0; --sort_vector_end){
5877 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5879 mj_lno_t i = t.index;
5880 ++thread_num_points_in_parts[no_cuts];
5881 this->assigned_part_ids[i] = no_cuts;
5883 sort_vector_points_on_cut[previous_cut_map].clear();
5884 freeArray<mj_part_t> (cut_map);
5887 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5888 for(mj_lno_t i = 0; i < vSize; ++i){
5889 freeArray<mj_scalar_t> (allocated_memory[i]);
5893 for(mj_part_t j = 0; j < num_parts; ++j){
5894 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5895 for (
int i = 0; i < this->num_threads; ++i){
5896 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5897 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5898 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5901 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5905 for(mj_part_t j = 1; j < num_parts; ++j){
5906 out_part_xadj[j] += out_part_xadj[j - 1];
5912 for(mj_part_t j = 1; j < num_parts; ++j){
5913 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5918 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5919 mj_lno_t i = this->coordinate_permutations[ii];
5920 mj_part_t p = this->assigned_part_ids[i];
5921 this->new_coordinate_permutations[coordinate_begin +
5922 thread_num_points_in_parts[p]++] = i;
5937 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5939 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
5940 mj_part_t current_num_parts,
5941 mj_part_t output_part_begin_index,
5942 RCP<mj_partBoxVector_t> &output_part_boxes,
5943 bool is_data_ever_migrated)
5945 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5947 #ifdef HAVE_ZOLTAN2_OMP
5948 #pragma omp parallel for
5950 for(mj_part_t i = 0; i < current_num_parts;++i){
5953 mj_lno_t end = this->part_xadj[i];
5955 if(i > 0) begin = this->part_xadj[i -1];
5956 mj_part_t part_to_set_index = i + output_part_begin_index;
5957 if (this->mj_keep_part_boxes){
5958 (*output_part_boxes)[i].setpId(part_to_set_index);
5960 for (mj_lno_t ii = begin; ii < end; ++ii){
5961 mj_lno_t k = this->coordinate_permutations[ii];
5962 this->assigned_part_ids[k] = part_to_set_index;
5967 if(!is_data_ever_migrated){
5974 #ifdef ENABLE_ZOLTAN_MIGRATION
5975 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5982 ZOLTAN_COMM_OBJ *plan = NULL;
5983 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5986 int message_tag = 7856;
5988 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5989 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5990 this->owner_of_coordinate, mpi_comm, message_tag,
5993 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5995 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5998 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5999 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
6000 sizeof(mj_gno_t), (
char *) incoming_gnos);
6003 freeArray<mj_gno_t>(this->current_mj_gnos);
6004 this->current_mj_gnos = incoming_gnos;
6006 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
6009 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
6010 sizeof(mj_part_t), (
char *) incoming_partIds);
6012 freeArray<mj_part_t>(this->assigned_part_ids);
6013 this->assigned_part_ids = incoming_partIds;
6015 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
6016 ierr = Zoltan_Comm_Destroy(&plan);
6019 this->num_local_coords = incoming;
6027 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
6028 Tpetra::Distributor distributor(this->mj_problemComm);
6029 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
6030 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
6031 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
6033 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
6035 ArrayRCP<mj_gno_t> received_gnos(incoming);
6036 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
6037 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6038 freeArray<mj_gno_t>(this->current_mj_gnos);
6039 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
6040 memcpy( this->current_mj_gnos,
6041 received_gnos.getRawPtr(),
6042 incoming *
sizeof(mj_gno_t));
6045 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
6046 ArrayRCP<mj_part_t> received_partids(incoming);
6047 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
6048 freeArray<mj_part_t>(this->assigned_part_ids);
6049 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
6050 memcpy( this->assigned_part_ids,
6051 received_partids.getRawPtr(),
6052 incoming *
sizeof(mj_part_t));
6054 this->num_local_coords = incoming;
6055 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
6060 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
6062 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
6067 if (this->mj_keep_part_boxes){
6068 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
6072 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
6077 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6079 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
6080 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
6082 for (
int i=0; i < this->coord_dim; i++){
6083 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
6085 freeArray<mj_scalar_t *>(this->mj_coordinates);
6087 for (
int i=0; i < this->num_weights_per_coord; i++){
6088 freeArray<mj_scalar_t>(this->mj_weights[i]);
6090 freeArray<mj_scalar_t *>(this->mj_weights);
6092 freeArray<int>(this->owner_of_coordinate);
6094 for(
int i = 0; i < this->num_threads; ++i){
6095 freeArray<mj_lno_t>(this->thread_point_counts[i]);
6098 freeArray<mj_lno_t *>(this->thread_point_counts);
6099 freeArray<double *> (this->thread_part_weight_work);
6101 if(this->distribute_points_on_cut_lines){
6102 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
6103 for(
int i = 0; i < this->num_threads; ++i){
6104 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
6106 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
6107 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
6108 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
6111 freeArray<mj_part_t>(this->my_incomplete_cut_count);
6113 freeArray<mj_scalar_t>(this->max_min_coords);
6115 freeArray<mj_lno_t>(this->part_xadj);
6117 freeArray<mj_lno_t>(this->coordinate_permutations);
6119 freeArray<mj_lno_t>(this->new_coordinate_permutations);
6121 freeArray<mj_scalar_t>(this->all_cut_coordinates);
6123 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
6125 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
6127 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
6129 freeArray<mj_scalar_t>(this->target_part_weights);
6131 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
6133 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
6135 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
6136 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
6137 freeArray<bool>(this->is_cut_line_determined);
6138 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
6139 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
6141 for(
int i = 0; i < this->num_threads; ++i){
6142 freeArray<double>(this->thread_part_weights[i]);
6143 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
6144 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
6147 freeArray<double *>(this->thread_part_weights);
6148 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
6149 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
6151 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
6162 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6165 bool distribute_points_on_cut_lines_,
6166 int max_concurrent_part_calculation_,
6167 int check_migrate_avoid_migration_option_,
6168 double minimum_migration_imbalance_,
6169 int migration_type_ ){
6170 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
6171 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
6172 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
6173 this->minimum_migration_imbalance = minimum_migration_imbalance_;
6174 this->migration_type = migration_type_;
6208 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6212 const RCP<const Environment> &env,
6213 RCP<
const Comm<int> > &problemComm,
6215 double imbalance_tolerance_,
6216 size_t num_global_parts_,
6217 mj_part_t *part_no_array_,
6218 int recursion_depth_,
6221 mj_lno_t num_local_coords_,
6222 mj_gno_t num_global_coords_,
6223 const mj_gno_t *initial_mj_gnos_,
6224 mj_scalar_t **mj_coordinates_,
6226 int num_weights_per_coord_,
6227 bool *mj_uniform_weights_,
6228 mj_scalar_t **mj_weights_,
6229 bool *mj_uniform_parts_,
6230 mj_scalar_t **mj_part_sizes_,
6232 mj_part_t *&result_assigned_part_ids_,
6233 mj_gno_t *&result_mj_gnos_
6240 if(comm->getRank() == 0){
6241 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
6242 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
6243 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
6247 this->mj_problemComm = problemComm;
6248 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
6270 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
6271 this->mj_env->debug(3,
"In MultiJagged Jagged");
6274 this->imbalance_tolerance = imbalance_tolerance_;
6275 this->num_global_parts = num_global_parts_;
6276 this->part_no_array = part_no_array_;
6277 this->recursion_depth = recursion_depth_;
6279 this->coord_dim = coord_dim_;
6280 this->num_local_coords = num_local_coords_;
6281 this->num_global_coords = num_global_coords_;
6282 this->mj_coordinates = mj_coordinates_;
6283 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
6285 this->num_weights_per_coord = num_weights_per_coord_;
6286 this->mj_uniform_weights = mj_uniform_weights_;
6287 this->mj_weights = mj_weights_;
6288 this->mj_uniform_parts = mj_uniform_parts_;
6289 this->mj_part_sizes = mj_part_sizes_;
6291 this->num_threads = 1;
6292 #ifdef HAVE_ZOLTAN2_OMP
6293 #pragma omp parallel
6296 this->num_threads = omp_get_num_threads();
6301 this->set_part_specifications();
6303 this->allocate_set_work_memory();
6307 this->comm = this->mj_problemComm->duplicate();
6310 mj_part_t current_num_parts = 1;
6311 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6313 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6315 mj_part_t output_part_begin_index = 0;
6316 mj_part_t future_num_parts = this->total_num_part;
6317 bool is_data_ever_migrated =
false;
6319 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
6320 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
6321 next_future_num_parts_in_parts->push_back(this->num_global_parts);
6323 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
6324 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
6326 compute_global_box();
6327 if(this->mj_keep_part_boxes){
6328 this->init_part_boxes(output_part_boxes);
6331 for (
int i = 0; i < this->recursion_depth; ++i){
6334 std::vector <mj_part_t> num_partitioning_in_current_dim;
6348 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6349 future_num_part_in_parts = next_future_num_parts_in_parts;
6350 next_future_num_parts_in_parts = tmpPartVect;
6355 next_future_num_parts_in_parts->clear();
6357 if(this->mj_keep_part_boxes){
6358 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6359 input_part_boxes = output_part_boxes;
6360 output_part_boxes = tmpPartBoxes;
6361 output_part_boxes->clear();
6365 mj_part_t output_part_count_in_dimension =
6366 this->update_part_num_arrays(
6367 num_partitioning_in_current_dim,
6368 future_num_part_in_parts,
6369 next_future_num_parts_in_parts,
6374 output_part_boxes, 1);
6379 if(output_part_count_in_dimension == current_num_parts) {
6381 tmpPartVect= future_num_part_in_parts;
6382 future_num_part_in_parts = next_future_num_parts_in_parts;
6383 next_future_num_parts_in_parts = tmpPartVect;
6385 if(this->mj_keep_part_boxes){
6386 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6387 input_part_boxes = output_part_boxes;
6388 output_part_boxes = tmpPartBoxes;
6395 int coordInd = i % this->coord_dim;
6396 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6399 std::string istring = Teuchos::toString<int>(i);
6400 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6404 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6407 mj_part_t output_part_index = 0;
6410 mj_part_t output_coordinate_end_index = 0;
6412 mj_part_t current_work_part = 0;
6413 mj_part_t current_concurrent_num_parts =
6414 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6416 mj_part_t obtained_part_index = 0;
6419 for (; current_work_part < current_num_parts;
6420 current_work_part += current_concurrent_num_parts){
6422 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6423 this->max_concurrent_part_calculation);
6425 mj_part_t actual_work_part_count = 0;
6429 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6430 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6434 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6437 ++actual_work_part_count;
6438 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6439 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts == 0 ?
6440 0 : this->part_xadj[current_work_part_in_concurrent_parts - 1];
6448 this->mj_get_local_min_max_coord_totW(
6449 coordinate_begin_index,
6450 coordinate_end_index,
6451 this->coordinate_permutations,
6452 mj_current_dim_coords,
6453 this->process_local_min_max_coord_total_weight[kk],
6454 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
6455 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
6460 if (actual_work_part_count > 0){
6462 this->mj_get_global_min_max_coord_totW(
6463 current_concurrent_num_parts,
6464 this->process_local_min_max_coord_total_weight,
6465 this->global_min_max_coord_total_weight);
6469 mj_part_t total_incomplete_cut_count = 0;
6474 mj_part_t concurrent_part_cut_shift = 0;
6475 mj_part_t concurrent_part_part_shift = 0;
6476 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6477 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6478 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6479 current_concurrent_num_parts];
6481 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6482 2 * current_concurrent_num_parts];
6484 mj_part_t concurrent_current_part_index = current_work_part + kk;
6486 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6488 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6489 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6490 concurrent_part_part_shift;
6492 concurrent_part_cut_shift += partition_count - 1;
6494 concurrent_part_part_shift += partition_count;
6499 if(partition_count > 1 && min_coordinate <= max_coordinate){
6503 total_incomplete_cut_count += partition_count - 1;
6506 this->my_incomplete_cut_count[kk] = partition_count - 1;
6509 this->mj_get_initial_cut_coords_target_weights(
6512 partition_count - 1,
6513 global_total_weight,
6515 current_target_part_weights,
6516 future_num_part_in_parts,
6517 next_future_num_parts_in_parts,
6518 concurrent_current_part_index,
6519 obtained_part_index);
6521 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6522 mj_lno_t coordinate_begin_index = concurrent_current_part_index == 0 ?
6523 0 : this->part_xadj[concurrent_current_part_index - 1];
6527 this->set_initial_coordinate_parts(
6530 concurrent_current_part_index,
6531 coordinate_begin_index, coordinate_end_index,
6532 this->coordinate_permutations,
6533 mj_current_dim_coords,
6534 this->assigned_part_ids,
6539 this->my_incomplete_cut_count[kk] = 0;
6541 obtained_part_index += partition_count;
6548 double used_imbalance = 0;
6553 mj_current_dim_coords,
6556 current_concurrent_num_parts,
6557 current_cut_coordinates,
6558 total_incomplete_cut_count,
6559 num_partitioning_in_current_dim);
6564 mj_part_t output_array_shift = 0;
6565 mj_part_t cut_shift = 0;
6566 size_t tlr_shift = 0;
6567 size_t partweight_array_shift = 0;
6569 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6570 mj_part_t current_concurrent_work_part = current_work_part + kk;
6571 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6574 if((num_parts != 1 )
6576 this->global_min_max_coord_total_weight[kk] >
6577 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6581 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6582 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6584 cut_shift += num_parts - 1;
6585 tlr_shift += (4 *(num_parts - 1) + 1);
6586 output_array_shift += num_parts;
6587 partweight_array_shift += (2 * (num_parts - 1) + 1);
6591 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6592 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6593 current_concurrent_work_part -1];
6594 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6595 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6600 for(
int ii = 0; ii < this->num_threads; ++ii){
6601 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6605 if(this->mj_keep_part_boxes){
6607 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6608 (*output_part_boxes)[output_array_shift + output_part_index +
6609 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6612 (*output_part_boxes)[output_array_shift + output_part_index + j +
6613 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6619 this->mj_create_new_partitions(
6621 mj_current_dim_coords,
6622 current_concurrent_cut_coordinate,
6625 used_local_cut_line_weight_to_left,
6626 this->thread_part_weight_work,
6627 this->new_part_xadj + output_part_index + output_array_shift
6634 mj_lno_t part_size = coordinate_end - coordinate_begin;
6635 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6637 this->new_coordinate_permutations + coordinate_begin,
6638 this->coordinate_permutations + coordinate_begin,
6639 part_size *
sizeof(mj_lno_t));
6641 cut_shift += num_parts - 1;
6642 tlr_shift += (4 *(num_parts - 1) + 1);
6643 output_array_shift += num_parts;
6644 partweight_array_shift += (2 * (num_parts - 1) + 1);
6654 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6655 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6656 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6658 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6661 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6663 output_part_index += num_parts ;
6670 int current_world_size = this->comm->getSize();
6671 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6674 bool is_migrated_in_current_dimension =
false;
6679 if (future_num_parts > 1 &&
6680 this->check_migrate_avoid_migration_option >= 0 &&
6681 current_world_size > 1){
6683 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6684 mj_part_t num_parts = output_part_count_in_dimension;
6685 if ( this->mj_perform_migration(
6688 next_future_num_parts_in_parts,
6689 output_part_begin_index,
6690 migration_reduce_all_population,
6691 this->num_global_coords / (future_num_parts * current_num_parts),
6693 input_part_boxes, output_part_boxes) ) {
6694 is_migrated_in_current_dimension =
true;
6695 is_data_ever_migrated =
true;
6696 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6699 this->total_dim_num_reduce_all /= num_parts;
6702 is_migrated_in_current_dimension =
false;
6703 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6708 mj_lno_t * tmp = this->coordinate_permutations;
6709 this->coordinate_permutations = this->new_coordinate_permutations;
6710 this->new_coordinate_permutations = tmp;
6712 if(!is_migrated_in_current_dimension){
6713 this->total_dim_num_reduce_all -= current_num_parts;
6714 current_num_parts = output_part_count_in_dimension;
6716 freeArray<mj_lno_t>(this->part_xadj);
6717 this->part_xadj = this->new_part_xadj;
6718 this->new_part_xadj = NULL;
6719 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6723 delete future_num_part_in_parts;
6724 delete next_future_num_parts_in_parts;
6726 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6733 this->set_final_parts(
6735 output_part_begin_index,
6737 is_data_ever_migrated);
6739 result_assigned_part_ids_ = this->assigned_part_ids;
6740 result_mj_gnos_ = this->current_mj_gnos;
6742 this->free_work_memory();
6743 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6744 this->mj_env->debug(3,
"Out of MultiJagged");
6752 template <
typename Adapter>
6757 #ifndef DOXYGEN_SHOULD_SKIP_THIS
6764 typedef typename Adapter::scalar_t adapter_scalar_t;
6767 typedef float default_mj_scalar_t;
6773 (std::is_same<adapter_scalar_t, float>::value ||
6774 std::is_same<adapter_scalar_t, double>::value),
6775 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
6777 typedef typename Adapter::gno_t mj_gno_t;
6778 typedef typename Adapter::lno_t mj_lno_t;
6779 typedef typename Adapter::node_t mj_node_t;
6782 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6786 RCP<const Environment> mj_env;
6787 RCP<const Comm<int> > mj_problemComm;
6788 RCP<const coordinateModel_t> mj_coords;
6791 double imbalance_tolerance;
6792 size_t num_global_parts;
6793 mj_part_t *part_no_array;
6794 int recursion_depth;
6797 mj_lno_t num_local_coords;
6798 mj_gno_t num_global_coords;
6799 const mj_gno_t *initial_mj_gnos;
6800 mj_scalar_t **mj_coordinates;
6802 int num_weights_per_coord;
6803 bool *mj_uniform_weights;
6804 mj_scalar_t **mj_weights;
6805 bool *mj_uniform_parts;
6806 mj_scalar_t **mj_part_sizes;
6814 mj_part_t num_first_level_parts;
6815 const mj_part_t *first_level_distribution;
6817 bool distribute_points_on_cut_lines;
6818 mj_part_t max_concurrent_part_calculation;
6819 int check_migrate_avoid_migration_option;
6822 double minimum_migration_imbalance;
6823 bool mj_keep_part_boxes;
6828 int mj_premigration_option;
6829 int min_coord_per_rank_for_premigration;
6831 ArrayRCP<mj_part_t> comXAdj_;
6832 ArrayRCP<mj_part_t> comAdj_;
6838 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6840 void set_up_partitioning_data(
6843 void set_input_parameters(
const Teuchos::ParameterList &p);
6845 void free_work_memory();
6847 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6849 bool mj_premigrate_to_subset(
int used_num_ranks,
int migration_selection_option,
6850 RCP<const Environment> mj_env_,
6851 RCP<
const Comm<int> > mj_problemComm_,
6853 mj_lno_t num_local_coords_,
6854 mj_gno_t num_global_coords_,
size_t num_global_parts_,
6855 const mj_gno_t *initial_mj_gnos_,
6856 mj_scalar_t **mj_coordinates_,
6857 int num_weights_per_coord_,
6858 mj_scalar_t **mj_weights_,
6860 RCP<
const Comm<int> > &result_problemComm_,
6861 mj_lno_t & result_num_local_coords_,
6862 mj_gno_t * &result_initial_mj_gnos_,
6863 mj_scalar_t ** &result_mj_coordinates_,
6864 mj_scalar_t ** &result_mj_weights_,
6865 int * &result_actual_owner_rank_);
6870 RCP<
const Comm<int> > &problemComm,
6871 const RCP<const coordinateModel_t> &coords) :
6872 mj_partitioner(), mj_env(env),
6873 mj_problemComm(problemComm),
6875 imbalance_tolerance(0),
6876 num_global_parts(1),
6877 part_no_array(NULL),
6880 num_local_coords(0),
6881 num_global_coords(0),
6882 initial_mj_gnos(NULL),
6883 mj_coordinates(NULL),
6884 num_weights_per_coord(0),
6885 mj_uniform_weights(NULL),
6887 mj_uniform_parts(NULL),
6888 mj_part_sizes(NULL),
6889 num_first_level_parts(1),
6890 first_level_distribution(NULL),
6891 distribute_points_on_cut_lines(true),
6892 max_concurrent_part_calculation(1),
6893 check_migrate_avoid_migration_option(0),
6895 minimum_migration_imbalance(0.30),
6896 mj_keep_part_boxes(false),
6898 mj_run_as_rcb(false),
6899 mj_premigration_option(0),
6900 min_coord_per_rank_for_premigration(32000),
6901 comXAdj_(), comAdj_(),
6902 coordinate_ArrayRCP_holder(NULL)
6906 if (coordinate_ArrayRCP_holder != NULL){
6907 delete [] this->coordinate_ArrayRCP_holder;
6908 this->coordinate_ArrayRCP_holder = NULL;
6916 const bool bUnsorted =
true;
6917 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6919 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
6920 "algorithm. As many as the dimension count.", mj_parts_Validator);
6922 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
6925 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6926 "mj_minimum_migration_imbalance, the minimum imbalance of the "
6927 "processors to avoid migration",
6930 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6931 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6932 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
6933 "depending on the imbalance, 1 for forcing migration, 2 for "
6934 "avoiding migration", mj_migration_option_validator);
6939 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6940 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6941 pl.set(
"mj_migration_type", 0,
"Migration type, 0 for migration to minimize the imbalance "
6942 "1 for migration to minimize messages exchanged the migration." ,
6943 mj_migration_option_validator);
6946 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
6950 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6953 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
6956 RCP<Teuchos::EnhancedNumberValidator<int>> mj_premigration_option_validator =
6957 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
6959 pl.set(
"mj_premigration_option", 0,
"Whether to do premigration or not. 0 for no migration "
6960 "x > 0 for migration to consecutive processors, the subset will be 0,x,2x,3x,...subset ranks."
6961 , mj_premigration_option_validator);
6963 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to assign each rank in multijagged after premigration"
6978 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6982 mj_part_t
pointAssign(
int dim, adapter_scalar_t *point)
const;
6984 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
6985 size_t &nPartsFound, mj_part_t **partsFound)
const;
6992 ArrayRCP<mj_part_t> &comXAdj,
6993 ArrayRCP<mj_part_t> &comAdj);
6999 template <
typename Adapter>
7000 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
int used_num_ranks,
7002 RCP<const Environment> mj_env_,
7003 RCP<
const Comm<int> > mj_problemComm_,
7005 mj_lno_t num_local_coords_,
7007 const mj_gno_t *initial_mj_gnos_,
7008 mj_scalar_t **mj_coordinates_,
7009 int num_weights_per_coord_,
7010 mj_scalar_t **mj_weights_,
7012 RCP<
const Comm<int> > &result_problemComm_,
7013 mj_lno_t &result_num_local_coords_,
7014 mj_gno_t * &result_initial_mj_gnos_,
7015 mj_scalar_t ** &result_mj_coordinates_,
7016 mj_scalar_t ** &result_mj_weights_,
7017 int * &result_actual_owner_rank_){
7018 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
7021 int myRank = mj_problemComm_->getRank();
7022 int worldSize = mj_problemComm_->getSize();
7024 mj_part_t groupsize = worldSize / used_num_ranks;
7028 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
7030 mj_part_t i_am_sending_to = 0;
7031 bool am_i_a_receiver =
false;
7033 for(
int i = 0; i < used_num_ranks; ++i){
7034 group_begins[i+ 1] = group_begins[i] + groupsize;
7035 if (worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
7036 if (i == used_num_ranks) group_begins[i+ 1] = worldSize;
7037 if (myRank >= group_begins[i] && myRank < group_begins[i + 1]) i_am_sending_to = group_begins[i];
7038 if (myRank == group_begins[i]) am_i_a_receiver=
true;
7041 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
7042 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
7045 Tpetra::Distributor distributor(mj_problemComm_);
7047 std::vector<mj_part_t> coordinate_destinations(num_local_coords_, i_am_sending_to);
7048 ArrayView<const mj_part_t> destinations( &(coordinate_destinations[0]), num_local_coords_);
7049 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
7050 result_num_local_coords_ = num_incoming_gnos;
7051 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorPlanCreating");
7053 mj_env_->timerStart(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
7057 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7059 ArrayView<const mj_gno_t> sent_gnos(initial_mj_gnos_, num_local_coords_);
7060 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7062 result_initial_mj_gnos_ = allocMemory<mj_gno_t>(num_incoming_gnos);
7064 result_initial_mj_gnos_,
7065 received_gnos.getRawPtr(),
7066 num_incoming_gnos *
sizeof(mj_gno_t));
7070 result_mj_coordinates_ = allocMemory<mj_scalar_t *>(coord_dim_);
7071 for (
int i = 0; i < coord_dim_; ++i){
7072 ArrayView<const mj_scalar_t> sent_coord(mj_coordinates_[i], num_local_coords_);
7073 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
7074 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
7075 result_mj_coordinates_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7077 result_mj_coordinates_[i],
7078 received_coord.getRawPtr(),
7079 num_incoming_gnos *
sizeof(mj_scalar_t));
7082 result_mj_weights_ = allocMemory<mj_scalar_t *>(num_weights_per_coord_);
7084 for (
int i = 0; i < num_weights_per_coord_; ++i){
7085 ArrayView<const mj_scalar_t> sent_weight(mj_weights_[i], num_local_coords_);
7086 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
7087 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
7088 result_mj_weights_[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
7090 result_mj_weights_[i],
7091 received_weight.getRawPtr(),
7092 num_incoming_gnos *
sizeof(mj_scalar_t));
7097 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
7098 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
7099 ArrayRCP<int> received_owners(num_incoming_gnos);
7100 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
7101 result_actual_owner_rank_ = allocMemory<int>(num_incoming_gnos);
7103 result_actual_owner_rank_,
7104 received_owners.getRawPtr(),
7105 num_incoming_gnos *
sizeof(
int));
7107 mj_env_->timerStop(
MACRO_TIMERS,
"MultiJagged - PreMigration DistributorMigration");
7108 return am_i_a_receiver;
7126 template <
typename Adapter>
7131 this->set_up_partitioning_data(solution);
7132 this->set_input_parameters(this->mj_env->getParameters());
7133 if (this->mj_keep_part_boxes){
7134 this->mj_partitioner.set_to_keep_part_boxes();
7136 this->mj_partitioner.set_partitioning_parameters(
7137 this->distribute_points_on_cut_lines,
7138 this->max_concurrent_part_calculation,
7139 this->check_migrate_avoid_migration_option,
7140 this->minimum_migration_imbalance, this->migration_type);
7143 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
7144 mj_lno_t result_num_local_coords = this->num_local_coords;
7145 mj_gno_t * result_initial_mj_gnos = NULL;
7146 mj_scalar_t **result_mj_coordinates = this->mj_coordinates;
7147 mj_scalar_t **result_mj_weights = this->mj_weights;
7148 int *result_actual_owner_rank = NULL;
7149 const mj_gno_t * result_initial_mj_gnos_ = this->initial_mj_gnos;
7166 int current_world_size = this->mj_problemComm->getSize();
7167 mj_lno_t threshold_num_local_coords = this->min_coord_per_rank_for_premigration;
7168 bool is_pre_migrated =
false;
7169 bool am_i_in_subset =
true;
7170 if ( mj_premigration_option > 0 &&
7171 size_t (current_world_size) > this->num_global_parts &&
7172 this->num_global_coords < mj_gno_t (current_world_size * threshold_num_local_coords)){
7173 if (this->mj_keep_part_boxes){
7174 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and mj_premigration_option are not supported together yet.");
7176 is_pre_migrated =
true;
7177 int migration_selection_option = mj_premigration_option;
7178 if(migration_selection_option * this->num_global_parts > (
size_t) (current_world_size)){
7179 migration_selection_option = current_world_size / this->num_global_parts;
7181 int used_num_ranks = int (this->num_global_coords /
float (threshold_num_local_coords) + 0.5);
7182 if (used_num_ranks == 0) used_num_ranks = 1;
7184 am_i_in_subset = this->mj_premigrate_to_subset(
7186 migration_selection_option,
7188 this->mj_problemComm,
7190 this->num_local_coords,
7191 this->num_global_coords,
7192 this->num_global_parts,
7193 this->initial_mj_gnos,
7194 this->mj_coordinates,
7195 this->num_weights_per_coord,
7199 result_num_local_coords,
7200 result_initial_mj_gnos,
7201 result_mj_coordinates,
7203 result_actual_owner_rank);
7204 result_initial_mj_gnos_ = result_initial_mj_gnos;
7209 mj_part_t *result_assigned_part_ids = NULL;
7210 mj_gno_t *result_mj_gnos = NULL;
7212 if (am_i_in_subset){
7213 this->mj_partitioner.multi_jagged_part(
7217 this->imbalance_tolerance,
7218 this->num_global_parts,
7219 this->part_no_array,
7220 this->recursion_depth,
7223 result_num_local_coords,
7224 this->num_global_coords,
7225 result_initial_mj_gnos_,
7226 result_mj_coordinates,
7228 this->num_weights_per_coord,
7229 this->mj_uniform_weights,
7231 this->mj_uniform_parts,
7232 this->mj_part_sizes,
7234 result_assigned_part_ids,
7242 #if defined(__cplusplus) && __cplusplus >= 201103L
7243 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
7244 localGidToLid.reserve(result_num_local_coords);
7245 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7246 localGidToLid[result_initial_mj_gnos_[i]] = i;
7247 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
7248 0, result_num_local_coords,
true);
7250 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7251 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
7252 partId[origLID] = result_assigned_part_ids[i];
7256 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7257 localGidToLid(result_num_local_coords);
7258 for (mj_lno_t i = 0; i < result_num_local_coords; i++)
7259 localGidToLid.put(result_initial_mj_gnos_[i], i);
7261 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
7262 0, result_num_local_coords,
true);
7264 for (mj_lno_t i = 0; i < result_num_local_coords; i++) {
7265 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
7266 partId[origLID] = result_assigned_part_ids[i];
7271 delete [] result_mj_gnos;
7272 delete [] result_assigned_part_ids;
7277 if (is_pre_migrated){
7278 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7279 Tpetra::Distributor distributor(this->mj_problemComm);
7281 ArrayView<const mj_part_t> actual_owner_destinations( result_actual_owner_rank , result_num_local_coords);
7282 mj_lno_t num_incoming_gnos = distributor.createFromSends(actual_owner_destinations);
7283 if (num_incoming_gnos != this->num_local_coords){
7284 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - num incoming is not equal to num local coords");
7286 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorPlanCreating");
7287 mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7288 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
7289 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
7291 ArrayView<const mj_gno_t> sent_gnos(result_initial_mj_gnos_, result_num_local_coords);
7292 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7295 ArrayView<mj_part_t> sent_partnos(partId());
7296 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1, received_partids());
7298 partId = arcp(
new mj_part_t[this->num_local_coords],
7299 0, this->num_local_coords,
true);
7302 #if defined(__cplusplus) && __cplusplus >= 201103L
7303 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
7304 localGidToLid2.reserve(this->num_local_coords);
7305 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7306 localGidToLid2[this->initial_mj_gnos[i]] = i;
7309 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7310 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
7311 partId[origLID] = received_partids[i];
7315 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
7316 localGidToLid2(this->num_local_coords);
7317 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
7318 localGidToLid2.put(this->initial_mj_gnos[i], i);
7321 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
7322 mj_lno_t origLID = localGidToLid2.get(received_gnos[i]);
7323 partId[origLID] = received_partids[i];
7331 freeArray<mj_gno_t> (result_initial_mj_gnos);
7332 for (
int i = 0; i < this->coord_dim; ++i){
7333 freeArray<mj_scalar_t> (result_mj_coordinates[i]);
7335 freeArray<mj_scalar_t *> (result_mj_coordinates);
7337 for (
int i = 0; i < this->num_weights_per_coord; ++i){
7338 freeArray<mj_scalar_t> (result_mj_weights[i]);
7340 freeArray<mj_scalar_t *> (result_mj_weights);
7341 freeArray<int> (result_actual_owner_rank);
7343 mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - PostMigration DistributorMigration");
7347 solution->setParts(partId);
7348 this->free_work_memory();
7353 template <
typename Adapter>
7355 freeArray<mj_scalar_t *>(this->mj_coordinates);
7356 freeArray<mj_scalar_t *>(this->mj_weights);
7357 freeArray<bool>(this->mj_uniform_parts);
7358 freeArray<mj_scalar_t *>(this->mj_part_sizes);
7359 freeArray<bool>(this->mj_uniform_weights);
7365 template <
typename Adapter>
7366 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
7367 const RCP<PartitioningSolution<Adapter> > &solution
7370 this->coord_dim = this->mj_coords->getCoordinateDim();
7371 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
7372 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
7373 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
7374 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
7379 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
7382 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
7383 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
7386 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
7389 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
7391 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
7393 typedef StridedData<mj_lno_t, adapter_scalar_t> input_t;
7394 ArrayView<const mj_gno_t> gnos;
7395 ArrayView<input_t> xyz;
7396 ArrayView<input_t> wgts;
7399 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
7401 this->mj_coords->getCoordinates(gnos, xyz, wgts);
7403 ArrayView<const mj_gno_t> mj_gnos = gnos;
7404 this->initial_mj_gnos = mj_gnos.getRawPtr();
7407 for (
int dim=0; dim < this->coord_dim; dim++){
7408 ArrayRCP<const mj_scalar_t> ar;
7409 xyz[dim].getInputArray(ar);
7411 this->coordinate_ArrayRCP_holder[dim] = ar;
7414 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
7418 if (this->num_weights_per_coord == 0){
7419 this->mj_uniform_weights[0] =
true;
7420 this->mj_weights[0] = NULL;
7424 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
7425 ArrayRCP<const mj_scalar_t> ar;
7426 wgts[wdim].getInputArray(ar);
7429 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
7430 this->mj_uniform_weights[wdim] =
false;
7431 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
7435 for (
int wdim = 0; wdim < criteria_dim; wdim++){
7436 if (solution->criteriaHasUniformPartSizes(wdim)){
7437 this->mj_uniform_parts[wdim] =
true;
7438 this->mj_part_sizes[wdim] = NULL;
7441 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
7450 template <
typename Adapter>
7451 void Zoltan2_AlgMJ<Adapter>::set_input_parameters(
const Teuchos::ParameterList &pl){
7453 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
7456 tol = pe->getValue(&tol);
7457 this->imbalance_tolerance = tol - 1.0;
7461 if (this->imbalance_tolerance <= 0)
7462 this->imbalance_tolerance= 10e-4;
7465 this->part_no_array = NULL;
7467 this->recursion_depth = 0;
7469 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
7470 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
7471 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
7472 this->mj_env->debug(2,
"mj_parts provided by user");
7476 this->distribute_points_on_cut_lines =
true;
7477 this->max_concurrent_part_calculation = 1;
7479 this->mj_run_as_rcb =
false;
7480 this->mj_premigration_option = 0;
7481 this->min_coord_per_rank_for_premigration = 32000;
7483 int mj_user_recursion_depth = -1;
7484 this->mj_keep_part_boxes =
false;
7485 this->check_migrate_avoid_migration_option = 0;
7486 this->migration_type = 0;
7487 this->minimum_migration_imbalance = 0.35;
7489 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
7492 imb = pe->getValue(&imb);
7493 this->minimum_migration_imbalance = imb - 1.0;
7496 pe = pl.getEntryPtr(
"mj_migration_option");
7498 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
7500 this->check_migrate_avoid_migration_option = 0;
7502 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
7505 pe = pl.getEntryPtr(
"mj_migration_type");
7507 this->migration_type = pe->getValue(&this->migration_type);
7509 this->migration_type = 0;
7514 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
7516 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
7518 this->max_concurrent_part_calculation = 1;
7521 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
7523 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
7525 this->mj_keep_part_boxes =
false;
7537 if (this->mj_keep_part_boxes ==
false){
7538 pe = pl.getEntryPtr(
"mapping_type");
7540 int mapping_type = -1;
7541 mapping_type = pe->getValue(&mapping_type);
7542 if (mapping_type == 0){
7543 mj_keep_part_boxes =
true;
7549 pe = pl.getEntryPtr(
"mj_enable_rcb");
7551 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
7553 this->mj_run_as_rcb =
false;
7556 pe = pl.getEntryPtr(
"mj_premigration_option");
7558 mj_premigration_option = pe->getValue(&mj_premigration_option);
7560 mj_premigration_option = 0;
7563 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
7565 min_coord_per_rank_for_premigration = pe->getValue(&min_coord_per_rank_for_premigration);
7567 min_coord_per_rank_for_premigration = 32000;
7570 pe = pl.getEntryPtr(
"mj_recursion_depth");
7572 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
7574 mj_user_recursion_depth = -1;
7578 pe = pl.getEntryPtr(
"rectilinear");
7579 if (pe) val = pe->getValue(&val);
7581 this->distribute_points_on_cut_lines =
false;
7583 this->distribute_points_on_cut_lines =
true;
7586 if (this->mj_run_as_rcb){
7587 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
7589 if (this->recursion_depth < 1){
7590 if (mj_user_recursion_depth > 0){
7591 this->recursion_depth = mj_user_recursion_depth;
7594 this->recursion_depth = this->coord_dim;
7598 this->num_threads = 1;
7599 #ifdef HAVE_ZOLTAN2_OMP
7600 #pragma omp parallel
7602 this->num_threads = omp_get_num_threads();
7609 template <
typename Adapter>
7612 adapter_scalar_t *lower,
7613 adapter_scalar_t *upper,
7614 size_t &nPartsFound,
7624 if (this->mj_keep_part_boxes) {
7627 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7629 size_t nBoxes = (*partBoxes).size();
7631 throw std::logic_error(
"no part boxes exist");
7635 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7637 if (globalBox->boxesOverlap(dim, lower, upper)) {
7639 std::vector<typename Adapter::part_t> partlist;
7642 for (
size_t i = 0; i < nBoxes; i++) {
7644 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7646 partlist.push_back((*partBoxes)[i].getpId());
7667 *partsFound =
new mj_part_t[nPartsFound];
7668 for (
size_t i = 0; i < nPartsFound; i++)
7669 (*partsFound)[i] = partlist[i];
7682 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
7687 template <
typename Adapter>
7690 adapter_scalar_t *point)
const
7697 if (this->mj_keep_part_boxes) {
7701 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7703 size_t nBoxes = (*partBoxes).size();
7705 throw std::logic_error(
"no part boxes exist");
7709 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7711 if (globalBox->pointInBox(dim, point)) {
7715 for (i = 0; i < nBoxes; i++) {
7717 if ((*partBoxes)[i].pointInBox(dim, point)) {
7718 foundPart = (*partBoxes)[i].getpId();
7732 std::ostringstream oss;
7734 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
7735 oss <<
") not found in domain";
7736 throw std::logic_error(oss.str());
7746 size_t closestBox = 0;
7747 coord_t minDistance = std::numeric_limits<coord_t>::max();
7748 coord_t *centroid =
new coord_t[dim];
7749 for (
size_t i = 0; i < nBoxes; i++) {
7750 (*partBoxes)[i].computeCentroid(centroid);
7753 for (
int j = 0; j < dim; j++) {
7754 diff = centroid[j] - point[j];
7757 if (sum < minDistance) {
7762 foundPart = (*partBoxes)[closestBox].getpId();
7769 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
7773 template <
typename Adapter>
7779 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7780 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7781 mj_part_t ntasks = (*pBoxes).size();
7782 int dim = (*pBoxes)[0].getDim();
7783 GridHash grid(pBoxes, ntasks, dim);
7791 template <
typename Adapter>
7792 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7795 return this->mj_partitioner.get_kept_boxes();
7799 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7801 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7804 if (this->mj_keep_part_boxes)
7805 return this->kept_boxes;
7807 throw std::logic_error(
"Error: part boxes are not stored.");
7810 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7812 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7814 RCP<mj_partBoxVector_t> &localPartBoxes
7818 mj_part_t ntasks = this->num_global_parts;
7819 int dim = (*localPartBoxes)[0].getDim();
7820 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
7822 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7824 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
7825 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
7827 coord_t *localPartMins = localPartBoundaries;
7828 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7830 coord_t *globalPartMins = globalPartBoundaries;
7831 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7833 mj_part_t boxCount = localPartBoxes->size();
7834 for (mj_part_t i = 0; i < boxCount; ++i){
7835 mj_part_t pId = (*localPartBoxes)[i].getpId();
7838 coord_t *lmins = (*localPartBoxes)[i].getlmins();
7839 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7841 for (
int j = 0; j < dim; ++j){
7842 localPartMins[dim * pId + j] = lmins[j];
7843 localPartMaxs[dim * pId + j] = lmaxs[j];
7855 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
7856 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7857 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7858 for (mj_part_t i = 0; i < ntasks; ++i){
7860 globalPartMaxs + dim * i);
7872 delete []localPartBoundaries;
7873 delete []globalPartBoundaries;
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
#define MIN_WORK_LAST_DIM
#define LEAST_SIGNIFICANCE
#define FUTURE_REDUCEALL_CUTOFF
#define imbalanceOf2(Wachieved, wExpected)
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
RCP< mj_partBoxVector_t > get_kept_boxes() const
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const mj_part_t *first_level_distribution_=NULL)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Tpetra::global_size_t global_size_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void freeArray(T *&array)
Frees the given array.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
T * allocMemory(size_t size)
Allocates memory for the given size.
SparseMatrixAdapter_t::part_t part_t
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.