49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 50 #define _ZOLTAN2_ALGMultiJagged_HPP_ 57 #include <Tpetra_Distributor.hpp> 58 #include <Teuchos_ParameterList.hpp> 65 #if defined(__cplusplus) && __cplusplus >= 201103L 66 #include <unordered_map> 68 #include <Teuchos_Hashtable.hpp> 69 #endif // C++11 is enabled 71 #ifdef ZOLTAN2_USEZOLTANCOMM 72 #ifdef HAVE_ZOLTAN2_MPI 73 #define ENABLE_ZOLTAN_MIGRATION 74 #include "zoltan_comm_cpp.h" 75 #include "zoltan_types.h" 79 #ifdef HAVE_ZOLTAN2_OMP 83 #define LEAST_SIGNIFICANCE 0.0001 84 #define SIGNIFICANCE_MUL 1000 89 #define FUTURE_REDUCEALL_CUTOFF 1500000 92 #define MIN_WORK_LAST_DIM 1000 97 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x)) 99 #define imbalanceOf(Wachieved, totalW, expectedRatio) \ 100 (Wachieved) / ((totalW) * (expectedRatio)) - 1 101 #define imbalanceOf2(Wachieved, wExpected) \ 102 (Wachieved) / (wExpected) - 1 105 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp; 114 template <
typename Ordinal,
typename T>
133 size(s_), _EPSILON (
std::numeric_limits<T>::
epsilon()){}
137 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const 139 for (Ordinal i=0; i < count; i++){
140 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
141 inoutBuffer[i] = inBuffer[i];
153 template <
typename T>
158 throw "cannot allocate memory";
170 template <
typename T>
181 template <
typename tt>
183 std::stringstream ss (std::stringstream::in |std::stringstream::out);
185 std::string tmp =
"";
197 template <
typename IT,
typename CT,
typename WT>
216 this->index = index_;
217 this->count = count_;
223 this->index = other.
index;
224 this->count = other.
count;
225 this->val = other.
val;
233 void set(IT index_ ,CT count_, WT *vals_){
234 this->index = index_;
235 this->count = count_;
241 this->index = other.
index;
242 this->count = other.
count;
243 this->val = other.
val;
247 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
248 assert (this->count == other.count);
249 for(CT i = 0; i < this->count; ++i){
251 if (
ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
255 if(this->val[i] < other.val[i]){
264 return this->index < other.index;
267 assert (this->count == other.
count);
268 for(CT i = 0; i < this->count; ++i){
274 if(this->val[i] > other.
val[i]){
284 return this->index > other.
index;
291 template <
class IT,
class WT>
302 template <
class IT,
class WT>
307 IT i, ir=n, j, k, l=1;
308 IT jstack=0, istack[50];
317 for (j=l+1;j<=ir;j++)
323 if (arr[i].val <= aval)
338 if (arr[l+1].val > arr[ir].val)
342 if (arr[l].val > arr[ir].val)
346 if (arr[l+1].val > arr[l].val)
356 do i++;
while (arr[i].val < aval);
357 do j--;
while (arr[j].val > aval);
364 if (jstack > NSTACK){
365 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
389 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
395 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
397 RCP<const Environment> mj_env;
398 RCP<Comm<int> > mj_problemComm;
400 double imbalance_tolerance;
401 mj_part_t *part_no_array;
403 int coord_dim, num_weights_per_coord;
405 size_t initial_num_loc_coords;
408 mj_lno_t num_local_coords;
409 mj_gno_t num_global_coords;
411 mj_scalar_t **mj_coordinates;
412 mj_scalar_t **mj_weights;
413 bool *mj_uniform_parts;
414 mj_scalar_t **mj_part_sizes;
415 bool *mj_uniform_weights;
417 ArrayView<const mj_gno_t> mj_gnos;
418 size_t num_global_parts;
420 mj_gno_t *initial_mj_gnos;
421 mj_gno_t *current_mj_gnos;
422 int *owner_of_coordinate;
424 mj_lno_t *coordinate_permutations;
425 mj_lno_t *new_coordinate_permutations;
426 mj_part_t *assigned_part_ids;
429 mj_lno_t *new_part_xadj;
432 bool distribute_points_on_cut_lines;
433 mj_part_t max_concurrent_part_calculation;
436 int mj_user_recursion_depth;
437 int mj_keep_part_boxes;
439 int check_migrate_avoid_migration_option;
440 mj_scalar_t minimum_migration_imbalance;
443 mj_part_t total_num_cut ;
444 mj_part_t total_num_part;
446 mj_part_t max_num_part_along_dim ;
447 mj_part_t max_num_cut_along_dim;
448 size_t max_num_total_part_along_dim;
450 mj_part_t total_dim_num_reduce_all;
451 mj_part_t last_dim_num_part;
455 RCP<Comm<int> > comm;
457 mj_scalar_t sEpsilon;
459 mj_scalar_t maxScalar_t;
460 mj_scalar_t minScalar_t;
462 mj_scalar_t *all_cut_coordinates;
463 mj_scalar_t *max_min_coords;
464 mj_scalar_t *process_cut_line_weight_to_put_left;
465 mj_scalar_t **thread_cut_line_weight_to_put_left;
471 mj_scalar_t *cut_coordinates_work_array;
474 mj_scalar_t *target_part_weights;
476 mj_scalar_t *cut_upper_bound_coordinates ;
477 mj_scalar_t *cut_lower_bound_coordinates ;
478 mj_scalar_t *cut_lower_bound_weights ;
479 mj_scalar_t *cut_upper_bound_weights ;
481 mj_scalar_t *process_local_min_max_coord_total_weight ;
482 mj_scalar_t *global_min_max_coord_total_weight ;
486 bool *is_cut_line_determined;
489 mj_part_t *my_incomplete_cut_count;
491 double **thread_part_weights;
493 double **thread_part_weight_work;
496 mj_scalar_t **thread_cut_left_closest_point;
498 mj_scalar_t **thread_cut_right_closest_point;
501 mj_lno_t **thread_point_counts;
503 mj_scalar_t *process_rectilinear_cut_weight;
504 mj_scalar_t *global_rectilinear_cut_weight;
510 mj_scalar_t *total_part_weight_left_right_closests ;
511 mj_scalar_t *global_total_part_weight_left_right_closests;
513 RCP<mj_partBoxVector_t> kept_boxes;
516 RCP<mj_partBox_t> global_box;
517 int myRank, myActualRank;
526 void set_part_specifications();
533 inline mj_part_t get_part_count(
534 mj_part_t num_total_future,
540 void allocate_set_work_memory();
547 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
550 void compute_global_box();
566 mj_part_t update_part_num_arrays(
567 std::vector<mj_part_t> &num_partitioning_in_current_dim,
568 std::vector<mj_part_t> *future_num_part_in_parts,
569 std::vector<mj_part_t> *next_future_num_parts_in_parts,
570 mj_part_t &future_num_parts,
571 mj_part_t current_num_parts,
572 int current_iteration,
573 RCP<mj_partBoxVector_t> input_part_boxes,
574 RCP<mj_partBoxVector_t> output_part_boxes);
587 void mj_get_local_min_max_coord_totW(
588 mj_lno_t coordinate_begin_index,
589 mj_lno_t coordinate_end_index,
590 mj_lno_t *mj_current_coordinate_permutations,
591 mj_scalar_t *mj_current_dim_coords,
592 mj_scalar_t &min_coordinate,
593 mj_scalar_t &max_coordinate,
594 mj_scalar_t &total_weight);
603 void mj_get_global_min_max_coord_totW(
604 mj_part_t current_concurrent_num_parts,
605 mj_scalar_t *local_min_max_total,
606 mj_scalar_t *global_min_max_total);
626 void mj_get_initial_cut_coords_target_weights(
627 mj_scalar_t min_coord,
628 mj_scalar_t max_coord,
630 mj_scalar_t global_weight,
631 mj_scalar_t *initial_cut_coords ,
632 mj_scalar_t *target_part_weights ,
634 std::vector <mj_part_t> *future_num_part_in_parts,
635 std::vector <mj_part_t> *next_future_num_parts_in_parts,
636 mj_part_t concurrent_current_part,
637 mj_part_t obtained_part_index);
651 void set_initial_coordinate_parts(
652 mj_scalar_t &max_coordinate,
653 mj_scalar_t &min_coordinate,
654 mj_part_t &concurrent_current_part_index,
655 mj_lno_t coordinate_begin_index,
656 mj_lno_t coordinate_end_index,
657 mj_lno_t *mj_current_coordinate_permutations,
658 mj_scalar_t *mj_current_dim_coords,
659 mj_part_t *mj_part_ids,
660 mj_part_t &partition_count);
673 mj_scalar_t *mj_current_dim_coords,
674 mj_scalar_t imbalanceTolerance,
675 mj_part_t current_work_part,
676 mj_part_t current_concurrent_num_parts,
677 mj_scalar_t *current_cut_coordinates,
678 mj_part_t total_incomplete_cut_count,
679 std::vector <mj_part_t> &num_partitioning_in_current_dim);
700 void mj_1D_part_get_thread_part_weights(
701 size_t total_part_count,
703 mj_scalar_t max_coord,
704 mj_scalar_t min_coord,
705 mj_lno_t coordinate_begin_index,
706 mj_lno_t coordinate_end_index,
707 mj_scalar_t *mj_current_dim_coords,
708 mj_scalar_t *temp_current_cut_coords,
709 bool *current_cut_status,
710 double *my_current_part_weights,
711 mj_scalar_t *my_current_left_closest,
712 mj_scalar_t *my_current_right_closest);
721 void mj_accumulate_thread_results(
722 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
723 mj_part_t current_work_part,
724 mj_part_t current_concurrent_num_parts);
756 void mj_get_new_cut_coordinates(
757 const size_t &num_total_part,
758 const mj_part_t &num_cuts,
759 const mj_scalar_t &max_coordinate,
760 const mj_scalar_t &min_coordinate,
761 const mj_scalar_t &global_total_weight,
762 const mj_scalar_t &used_imbalance_tolerance,
763 mj_scalar_t * current_global_part_weights,
764 const mj_scalar_t * current_local_part_weights,
765 const mj_scalar_t *current_part_target_weights,
766 bool *current_cut_line_determined,
767 mj_scalar_t *current_cut_coordinates,
768 mj_scalar_t *current_cut_upper_bounds,
769 mj_scalar_t *current_cut_lower_bounds,
770 mj_scalar_t *current_global_left_closest_points,
771 mj_scalar_t *current_global_right_closest_points,
772 mj_scalar_t * current_cut_lower_bound_weights,
773 mj_scalar_t * current_cut_upper_weights,
774 mj_scalar_t *new_current_cut_coordinates,
775 mj_scalar_t *current_part_cut_line_weight_to_put_left,
776 mj_part_t *rectilinear_cut_count,
777 mj_part_t &my_num_incomplete_cut);
788 void mj_calculate_new_cut_position (
789 mj_scalar_t cut_upper_bound,
790 mj_scalar_t cut_lower_bound,
791 mj_scalar_t cut_upper_weight,
792 mj_scalar_t cut_lower_weight,
793 mj_scalar_t expected_weight,
794 mj_scalar_t &new_cut_position);
806 void mj_create_new_partitions(
808 mj_scalar_t *mj_current_dim_coords,
809 mj_scalar_t *current_concurrent_cut_coordinate,
810 mj_lno_t coordinate_begin,
811 mj_lno_t coordinate_end,
812 mj_scalar_t *used_local_cut_line_weight_to_left,
813 double **used_thread_part_weight_work,
814 mj_lno_t *out_part_xadj);
838 bool mj_perform_migration(
839 mj_part_t in_num_parts,
840 mj_part_t &out_num_parts,
841 std::vector<mj_part_t> *next_future_num_parts_in_parts,
842 mj_part_t &output_part_begin_index,
843 size_t migration_reduce_all_population,
844 mj_lno_t num_coords_for_last_dim_part,
845 std::string iteration,
846 RCP<mj_partBoxVector_t> &input_part_boxes,
847 RCP<mj_partBoxVector_t> &output_part_boxes);
858 void get_processor_num_points_in_parts(
861 mj_gno_t *&num_points_in_all_processor_parts);
875 bool mj_check_to_migrate(
876 size_t migration_reduce_all_population,
877 mj_lno_t num_coords_for_last_dim_part,
880 mj_gno_t *num_points_in_all_processor_parts);
900 void mj_migration_part_proc_assignment(
901 mj_gno_t * num_points_in_all_processor_parts,
904 mj_lno_t *send_count_to_each_proc,
905 std::vector<mj_part_t> &processor_ranks_for_subcomm,
906 std::vector<mj_part_t> *next_future_num_parts_in_parts,
907 mj_part_t &out_num_part,
908 std::vector<mj_part_t> &out_part_indices,
909 mj_part_t &output_part_numbering_begin_index,
910 int *coordinate_destinations);
928 void mj_assign_proc_to_parts(
929 mj_gno_t * num_points_in_all_processor_parts,
932 mj_lno_t *send_count_to_each_proc,
933 std::vector<mj_part_t> &processor_ranks_for_subcomm,
934 std::vector<mj_part_t> *next_future_num_parts_in_parts,
935 mj_part_t &out_part_index,
936 mj_part_t &output_part_numbering_begin_index,
937 int *coordinate_destinations);
949 void assign_send_destinations(
951 mj_part_t *part_assignment_proc_begin_indices,
952 mj_part_t *processor_chains_in_parts,
953 mj_lno_t *send_count_to_each_proc,
954 int *coordinate_destinations);
968 void assign_send_destinations2(
971 int *coordinate_destinations,
972 mj_part_t &output_part_numbering_begin_index,
973 std::vector<mj_part_t> *next_future_num_parts_in_parts);
991 void mj_assign_parts_to_procs(
992 mj_gno_t * num_points_in_all_processor_parts,
995 mj_lno_t *send_count_to_each_proc,
996 std::vector<mj_part_t> *next_future_num_parts_in_parts,
997 mj_part_t &out_num_part,
998 std::vector<mj_part_t> &out_part_indices,
999 mj_part_t &output_part_numbering_begin_index,
1000 int *coordinate_destinations);
1014 void mj_migrate_coords(
1015 mj_part_t num_procs,
1016 mj_lno_t &num_new_local_points,
1017 std::string iteration,
1018 int *coordinate_destinations,
1019 mj_part_t num_parts);
1027 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1035 void fill_permutation_array(
1036 mj_part_t output_num_parts,
1037 mj_part_t num_parts);
1047 void set_final_parts(
1048 mj_part_t current_num_parts,
1049 mj_part_t output_part_begin_index,
1050 RCP<mj_partBoxVector_t> &output_part_boxes,
1051 bool is_data_ever_migrated);
1054 void free_work_memory();
1068 void create_consistent_chunks(
1069 mj_part_t num_parts,
1070 mj_scalar_t *mj_current_dim_coords,
1071 mj_scalar_t *current_concurrent_cut_coordinate,
1072 mj_lno_t coordinate_begin,
1073 mj_lno_t coordinate_end,
1074 mj_scalar_t *used_local_cut_line_weight_to_left,
1075 mj_lno_t *out_part_xadj,
1108 void multi_jagged_part(
1109 const RCP<const Environment> &env,
1110 RCP<Comm<int> > &problemComm,
1112 double imbalance_tolerance,
1113 size_t num_global_parts,
1114 mj_part_t *part_no_array,
1115 int recursion_depth,
1118 mj_lno_t num_local_coords,
1119 mj_gno_t num_global_coords,
1120 const mj_gno_t *initial_mj_gnos,
1121 mj_scalar_t **mj_coordinates,
1123 int num_weights_per_coord,
1124 bool *mj_uniform_weights,
1125 mj_scalar_t **mj_weights,
1126 bool *mj_uniform_parts,
1127 mj_scalar_t **mj_part_sizes,
1129 mj_part_t *&result_assigned_part_ids,
1130 mj_gno_t *&result_mj_gnos
1140 void set_partitioning_parameters(
1141 bool distribute_points_on_cut_lines_,
1142 int max_concurrent_part_calculation_,
1143 int check_migrate_avoid_migration_option_,
1144 mj_scalar_t minimum_migration_imbalance_);
1148 void set_to_keep_part_boxes();
1152 RCP<mj_partBox_t> get_global_box()
const;
1154 RCP<mj_partBoxVector_t> get_kept_boxes()
const;
1156 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1157 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1183 void sequential_task_partitioning(
1184 const RCP<const Environment> &env,
1185 mj_lno_t num_total_coords,
1186 mj_lno_t num_selected_coords,
1187 size_t num_target_part,
1189 mj_scalar_t **mj_coordinates,
1190 mj_lno_t *initial_selected_coords_output_permutation,
1191 mj_lno_t *output_xadj,
1192 int recursion_depth,
1193 const mj_part_t *part_no_array);
1221 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1224 const RCP<const Environment> &env,
1225 mj_lno_t num_total_coords,
1226 mj_lno_t num_selected_coords,
1227 size_t num_target_part,
1229 mj_scalar_t **mj_coordinates_,
1230 mj_lno_t *inital_adjList_output_adjlist,
1231 mj_lno_t *output_xadj,
1233 const mj_part_t *part_no_array_
1237 const RCP<Comm<int> > commN;
1238 this->comm = this->mj_problemComm = Teuchos::rcp_const_cast<Comm<int> >
1239 (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
1240 this->myActualRank = this->myRank = 1;
1242 #ifdef HAVE_ZOLTAN2_OMP 1243 int actual_num_threads = omp_get_num_threads();
1244 omp_set_num_threads(1);
1252 this->imbalance_tolerance = 0;
1253 this->num_global_parts = num_target_part;
1254 this->part_no_array = (mj_part_t *)part_no_array_;
1255 this->recursion_depth = rd;
1257 this->coord_dim = coord_dim_;
1258 this->num_local_coords = num_total_coords;
1259 this->num_global_coords = num_total_coords;
1260 this->mj_coordinates = mj_coordinates_;
1264 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1266 this->num_weights_per_coord = 0;
1267 bool *tmp_mj_uniform_weights =
new bool[1];
1268 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1269 this->mj_uniform_weights[0] =
true;
1271 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1272 this->mj_weights = tmp_mj_weights;
1274 bool *tmp_mj_uniform_parts =
new bool[1];
1275 this->mj_uniform_parts = tmp_mj_uniform_parts;
1276 this->mj_uniform_parts[0] =
true;
1278 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1279 this->mj_part_sizes = tmp_mj_part_sizes;
1280 this->mj_part_sizes[0] = NULL;
1282 this->num_threads = 1;
1283 this->set_part_specifications();
1285 this->allocate_set_work_memory();
1287 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1288 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1289 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1292 mj_part_t current_num_parts = 1;
1294 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1296 mj_part_t future_num_parts = this->total_num_part;
1298 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1299 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1300 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1301 RCP<mj_partBoxVector_t> t1;
1302 RCP<mj_partBoxVector_t> t2;
1304 for (
int i = 0; i < this->recursion_depth; ++i){
1308 std::vector <mj_part_t> num_partitioning_in_current_dim;
1322 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1323 future_num_part_in_parts = next_future_num_parts_in_parts;
1324 next_future_num_parts_in_parts = tmpPartVect;
1329 next_future_num_parts_in_parts->clear();
1333 mj_part_t output_part_count_in_dimension =
1334 this->update_part_num_arrays(
1335 num_partitioning_in_current_dim,
1336 future_num_part_in_parts,
1337 next_future_num_parts_in_parts,
1347 if(output_part_count_in_dimension == current_num_parts) {
1348 tmpPartVect= future_num_part_in_parts;
1349 future_num_part_in_parts = next_future_num_parts_in_parts;
1350 next_future_num_parts_in_parts = tmpPartVect;
1355 int coordInd = i % this->coord_dim;
1356 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1358 std::string istring = toString<int>(i);
1362 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1365 mj_part_t output_part_index = 0;
1368 mj_part_t output_coordinate_end_index = 0;
1370 mj_part_t current_work_part = 0;
1371 mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1372 this->max_concurrent_part_calculation);
1374 mj_part_t obtained_part_index = 0;
1377 for (; current_work_part < current_num_parts;
1378 current_work_part += current_concurrent_num_parts){
1380 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
1381 this->max_concurrent_part_calculation);
1383 mj_part_t actual_work_part_count = 0;
1387 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1388 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1392 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1395 ++actual_work_part_count;
1396 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1397 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1405 this->mj_get_local_min_max_coord_totW(
1406 coordinate_begin_index,
1407 coordinate_end_index,
1408 this->coordinate_permutations,
1409 mj_current_dim_coords,
1410 this->process_local_min_max_coord_total_weight[kk],
1411 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1412 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1417 if (actual_work_part_count > 0){
1419 this->mj_get_global_min_max_coord_totW(
1420 current_concurrent_num_parts,
1421 this->process_local_min_max_coord_total_weight,
1422 this->global_min_max_coord_total_weight);
1426 mj_part_t total_incomplete_cut_count = 0;
1431 mj_part_t concurrent_part_cut_shift = 0;
1432 mj_part_t concurrent_part_part_shift = 0;
1433 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1434 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1435 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1436 current_concurrent_num_parts];
1437 mj_scalar_t global_total_weight =
1438 this->global_min_max_coord_total_weight[kk +
1439 2 * current_concurrent_num_parts];
1441 mj_part_t concurrent_current_part_index = current_work_part + kk;
1443 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1445 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1446 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1447 concurrent_part_part_shift;
1449 concurrent_part_cut_shift += partition_count - 1;
1451 concurrent_part_part_shift += partition_count;
1455 if(partition_count > 1 && min_coordinate <= max_coordinate){
1459 total_incomplete_cut_count += partition_count - 1;
1462 this->my_incomplete_cut_count[kk] = partition_count - 1;
1465 this->mj_get_initial_cut_coords_target_weights(
1468 partition_count - 1,
1469 global_total_weight,
1471 current_target_part_weights,
1472 future_num_part_in_parts,
1473 next_future_num_parts_in_parts,
1474 concurrent_current_part_index,
1475 obtained_part_index);
1477 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1478 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1481 this->set_initial_coordinate_parts(
1484 concurrent_current_part_index,
1485 coordinate_begin_index, coordinate_end_index,
1486 this->coordinate_permutations,
1487 mj_current_dim_coords,
1488 this->assigned_part_ids,
1493 this->my_incomplete_cut_count[kk] = 0;
1495 obtained_part_index += partition_count;
1499 mj_scalar_t used_imbalance = 0;
1503 mj_current_dim_coords,
1506 current_concurrent_num_parts,
1507 current_cut_coordinates,
1508 total_incomplete_cut_count,
1509 num_partitioning_in_current_dim);
1515 mj_part_t output_array_shift = 0;
1516 mj_part_t cut_shift = 0;
1517 size_t tlr_shift = 0;
1518 size_t partweight_array_shift = 0;
1520 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1521 mj_part_t current_concurrent_work_part = current_work_part + kk;
1522 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1525 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1526 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1528 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1529 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1531 cut_shift += num_parts - 1;
1532 tlr_shift += (4 *(num_parts - 1) + 1);
1533 output_array_shift += num_parts;
1534 partweight_array_shift += (2 * (num_parts - 1) + 1);
1538 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1539 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1541 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1542 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1545 for(
int ii = 0; ii < this->num_threads; ++ii){
1546 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1551 this->create_consistent_chunks(
1553 mj_current_dim_coords,
1554 current_concurrent_cut_coordinate,
1557 used_local_cut_line_weight_to_left,
1558 this->new_part_xadj + output_part_index + output_array_shift,
1564 mj_lno_t part_size = coordinate_end - coordinate_begin;
1565 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1566 memcpy(this->new_coordinate_permutations + coordinate_begin,
1567 this->coordinate_permutations + coordinate_begin,
1568 part_size *
sizeof(mj_lno_t));
1570 cut_shift += num_parts - 1;
1571 tlr_shift += (4 *(num_parts - 1) + 1);
1572 output_array_shift += num_parts;
1573 partweight_array_shift += (2 * (num_parts - 1) + 1);
1582 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1583 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1584 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1586 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1589 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1591 output_part_index += num_parts ;
1598 current_num_parts = output_part_count_in_dimension;
1601 mj_lno_t * tmp = this->coordinate_permutations;
1602 this->coordinate_permutations = this->new_coordinate_permutations;
1603 this->new_coordinate_permutations = tmp;
1605 freeArray<mj_lno_t>(this->part_xadj);
1606 this->part_xadj = this->new_part_xadj;
1609 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1610 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1615 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1616 output_xadj[i+1] = this->part_xadj[i];
1619 delete future_num_part_in_parts;
1620 delete next_future_num_parts_in_parts;
1623 freeArray<mj_part_t>(this->assigned_part_ids);
1624 freeArray<mj_gno_t>(this->initial_mj_gnos);
1625 freeArray<mj_gno_t>(this->current_mj_gnos);
1626 freeArray<bool>(tmp_mj_uniform_weights);
1627 freeArray<bool>(tmp_mj_uniform_parts);
1628 freeArray<mj_scalar_t *>(tmp_mj_weights);
1629 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1631 this->free_work_memory();
1633 #ifdef HAVE_ZOLTAN2_OMP 1634 omp_set_num_threads(actual_num_threads);
1641 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1644 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1645 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1646 num_weights_per_coord(0), initial_num_loc_coords(0),
1647 initial_num_glob_coords(0),
1648 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1649 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1650 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1651 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1652 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1653 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1654 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1655 mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
1656 check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
1657 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1658 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1659 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1660 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1661 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1662 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1663 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1664 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1665 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1666 thread_part_weights(NULL), thread_part_weight_work(NULL),
1667 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1668 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1669 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1670 global_total_part_weight_left_right_closests(NULL),
1671 kept_boxes(),global_box(),
1672 myRank(0), myActualRank(0)
1677 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1678 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1686 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1688 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1691 return this->global_box;
1697 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1700 this->mj_keep_part_boxes = 1;
1711 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1715 this->total_num_cut = 0;
1716 this->total_num_part = 1;
1717 this->max_num_part_along_dim = 0;
1718 this->total_dim_num_reduce_all = 0;
1719 this->last_dim_num_part = 1;
1722 this->max_num_cut_along_dim = 0;
1723 this->max_num_total_part_along_dim = 0;
1725 if (this->part_no_array){
1727 for (
int i = 0; i < this->recursion_depth; ++i){
1728 this->total_dim_num_reduce_all += this->total_num_part;
1729 this->total_num_part *= this->part_no_array[i];
1730 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1731 this->max_num_part_along_dim = this->part_no_array[i];
1734 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1735 this->num_global_parts = this->total_num_part;
1737 mj_part_t future_num_parts = this->num_global_parts;
1740 for (
int i = 0; i < this->recursion_depth; ++i){
1742 mj_part_t maxNoPartAlongI = this->get_part_count(
1743 future_num_parts, 1.0f / (this->recursion_depth - i));
1745 if (maxNoPartAlongI > this->max_num_part_along_dim){
1746 this->max_num_part_along_dim = maxNoPartAlongI;
1749 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
1750 if (future_num_parts % maxNoPartAlongI){
1753 future_num_parts = nfutureNumParts;
1755 this->total_num_part = this->num_global_parts;
1759 for (
int i = 0; i < this->recursion_depth; ++i){
1760 this->total_dim_num_reduce_all += p;
1761 p *= this->max_num_part_along_dim;
1764 this->last_dim_num_part = p / this->max_num_part_along_dim;
1767 this->total_num_cut = this->total_num_part - 1;
1768 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
1769 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
1773 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
1774 if(this->mj_problemComm->getRank() == 0){
1775 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
1776 ") has been set bigger than maximum amount that can be used." <<
1777 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
1779 this->max_concurrent_part_calculation = this->last_dim_num_part;
1788 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1791 mj_part_t num_total_future,
1794 double fp = pow(num_total_future, root);
1795 mj_part_t ip = mj_part_t (fp);
1796 if (fp - ip < this->fEpsilon * 100){
1818 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1821 std::vector <mj_part_t> &num_partitioning_in_current_dim,
1822 std::vector<mj_part_t> *future_num_part_in_parts,
1823 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1824 mj_part_t &future_num_parts,
1825 mj_part_t current_num_parts,
1826 int current_iteration,
1827 RCP<mj_partBoxVector_t> input_part_boxes,
1828 RCP<mj_partBoxVector_t> output_part_boxes
1831 mj_part_t output_num_parts = 0;
1832 if(this->part_no_array){
1837 mj_part_t p = this->part_no_array[current_iteration];
1839 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
1843 return current_num_parts;
1846 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1847 num_partitioning_in_current_dim.push_back(p);
1861 future_num_parts /= num_partitioning_in_current_dim[0];
1862 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
1864 if (this->mj_keep_part_boxes){
1865 for (mj_part_t k = 0; k < current_num_parts; ++k){
1867 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
1868 output_part_boxes->push_back((*input_part_boxes)[k]);
1876 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
1877 next_future_num_parts_in_parts->push_back(future_num_parts);
1887 future_num_parts = 1;
1891 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
1893 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
1897 mj_part_t num_partitions_in_current_dim =
1898 this->get_part_count(
1899 future_num_parts_of_part_ii,
1900 1.0 / (this->recursion_depth - current_iteration)
1903 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
1904 std::cerr <<
"ERROR: maxPartNo calculation is wrong." << std::endl;
1908 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
1912 output_num_parts += num_partitions_in_current_dim;
1915 mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
1916 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
1917 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
1920 if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
1922 ++num_future_parts_for_part_iii;
1924 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);
1927 if (this->mj_keep_part_boxes){
1928 output_part_boxes->push_back((*input_part_boxes)[ii]);
1932 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
1936 return output_num_parts;
1943 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1948 this->owner_of_coordinate = NULL;
1953 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
1955 #ifdef HAVE_ZOLTAN2_OMP 1956 #pragma omp parallel for 1958 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
1959 this->coordinate_permutations[i] = i;
1963 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
1965 this->assigned_part_ids = NULL;
1966 if(this->num_local_coords > 0){
1967 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
1973 this->part_xadj = allocMemory<mj_lno_t>(1);
1974 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
1976 this->new_part_xadj = NULL;
1982 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
1984 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
1986 this->process_cut_line_weight_to_put_left = NULL;
1987 this->thread_cut_line_weight_to_put_left = NULL;
1989 if(this->distribute_points_on_cut_lines){
1990 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
1991 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
1992 for(
int i = 0; i < this->num_threads; ++i){
1993 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
1995 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
1996 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2004 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2005 this->max_concurrent_part_calculation);
2009 this->target_part_weights = allocMemory<mj_scalar_t>(
2010 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2013 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2014 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2015 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2016 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2018 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2019 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2023 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2026 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2028 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2030 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2033 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2035 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2038 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2040 for(
int i = 0; i < this->num_threads; ++i){
2042 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2043 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2044 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2045 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2051 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2052 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2055 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2056 for (
int i=0; i < this->coord_dim; i++){
2057 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2058 #ifdef HAVE_ZOLTAN2_OMP 2059 #pragma omp parallel for 2061 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2062 coord[i][j] = this->mj_coordinates[i][j];
2064 this->mj_coordinates = coord;
2067 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2068 mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);
2070 for (
int i=0; i < criteria_dim; i++){
2073 for (
int i=0; i < this->num_weights_per_coord; i++){
2074 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2075 #ifdef HAVE_ZOLTAN2_OMP 2076 #pragma omp parallel for 2078 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2079 weights[i][j] = this->mj_weights[i][j];
2082 this->mj_weights = weights;
2083 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2084 #ifdef HAVE_ZOLTAN2_OMP 2085 #pragma omp parallel for 2087 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2088 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2090 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2092 #ifdef HAVE_ZOLTAN2_OMP 2093 #pragma omp parallel for 2095 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2096 this->owner_of_coordinate[j] = this->myActualRank;
2101 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2106 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2108 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2110 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2112 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2114 for (
int i = 0; i < this->coord_dim; ++i){
2115 mj_scalar_t localMin = this->mj_coordinates[i][0];
2116 mj_scalar_t localMax = this->mj_coordinates[i][0];
2117 for (mj_lno_t j = 1; j < this->num_local_coords; ++j){
2118 if (this->mj_coordinates[i][j] < localMin){
2119 localMin = this->mj_coordinates[i][j];
2121 if (this->mj_coordinates[i][j] > localMax){
2122 localMax = this->mj_coordinates[i][j];
2130 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2131 this->coord_dim, mins, gmins
2134 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2135 this->coord_dim, maxs, gmaxs
2139 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2141 freeArray<mj_scalar_t>(mins);
2142 freeArray<mj_scalar_t>(gmins);
2143 freeArray<mj_scalar_t>(maxs);
2144 freeArray<mj_scalar_t>(gmaxs);
2152 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2155 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2159 initial_partitioning_boxes->push_back(tmp_box);
2172 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2175 mj_lno_t coordinate_begin_index,
2176 mj_lno_t coordinate_end_index,
2177 mj_lno_t *mj_current_coordinate_permutations,
2178 mj_scalar_t *mj_current_dim_coords,
2179 mj_scalar_t &min_coordinate,
2180 mj_scalar_t &max_coordinate,
2181 mj_scalar_t &total_weight){
2185 if(coordinate_begin_index >= coordinate_end_index)
2187 min_coordinate = this->maxScalar_t;
2188 max_coordinate = this->minScalar_t;
2192 mj_scalar_t my_total_weight = 0;
2193 #ifdef HAVE_ZOLTAN2_OMP 2194 #pragma omp parallel 2198 if (this->mj_uniform_weights[0]) {
2199 #ifdef HAVE_ZOLTAN2_OMP 2203 my_total_weight = coordinate_end_index - coordinate_begin_index;
2209 #ifdef HAVE_ZOLTAN2_OMP 2210 #pragma omp for reduction(+:my_total_weight) 2212 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2213 int i = mj_current_coordinate_permutations[ii];
2214 my_total_weight += this->mj_weights[0][i];
2218 int my_thread_id = 0;
2219 #ifdef HAVE_ZOLTAN2_OMP 2220 my_thread_id = omp_get_thread_num();
2222 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2223 my_thread_min_coord=my_thread_max_coord
2224 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2227 #ifdef HAVE_ZOLTAN2_OMP 2230 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2231 int i = mj_current_coordinate_permutations[j];
2232 if(mj_current_dim_coords[i] > my_thread_max_coord)
2233 my_thread_max_coord = mj_current_dim_coords[i];
2234 if(mj_current_dim_coords[i] < my_thread_min_coord)
2235 my_thread_min_coord = mj_current_dim_coords[i];
2237 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2238 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2240 #ifdef HAVE_ZOLTAN2_OMP 2243 #pragma omp single nowait 2246 min_coordinate = this->max_min_coords[0];
2247 for(
int i = 1; i < this->num_threads; ++i){
2248 if(this->max_min_coords[i] < min_coordinate)
2249 min_coordinate = this->max_min_coords[i];
2253 #ifdef HAVE_ZOLTAN2_OMP 2254 #pragma omp single nowait 2257 max_coordinate = this->max_min_coords[this->num_threads];
2258 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2259 if(this->max_min_coords[i] > max_coordinate)
2260 max_coordinate = this->max_min_coords[i];
2264 total_weight = my_total_weight;
2276 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2279 mj_part_t current_concurrent_num_parts,
2280 mj_scalar_t *local_min_max_total,
2281 mj_scalar_t *global_min_max_total){
2286 if(this->comm->getSize() > 1){
2289 current_concurrent_num_parts,
2290 current_concurrent_num_parts,
2291 current_concurrent_num_parts);
2293 reduceAll<int, mj_scalar_t>(
2296 3 * current_concurrent_num_parts,
2297 local_min_max_total,
2298 global_min_max_total);
2303 mj_part_t s = 3 * current_concurrent_num_parts;
2304 for (mj_part_t i = 0; i < s; ++i){
2305 global_min_max_total[i] = local_min_max_total[i];
2330 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2333 mj_scalar_t min_coord,
2334 mj_scalar_t max_coord,
2335 mj_part_t num_cuts ,
2336 mj_scalar_t global_weight,
2337 mj_scalar_t *initial_cut_coords ,
2338 mj_scalar_t *current_target_part_weights ,
2340 std::vector <mj_part_t> *future_num_part_in_parts,
2341 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2342 mj_part_t concurrent_current_part,
2343 mj_part_t obtained_part_index
2346 mj_scalar_t coord_range = max_coord - min_coord;
2347 if(this->mj_uniform_parts[0]){
2349 mj_part_t cumulative = 0;
2351 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2355 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2361 for(mj_part_t i = 0; i < num_cuts; ++i){
2362 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2370 current_target_part_weights[i] = cumulative * unit_part_weight;
2373 initial_cut_coords[i] = min_coord + (coord_range *
2374 cumulative) / total_future_part_count_in_part;
2376 current_target_part_weights[num_cuts] = 1;
2380 if (this->mj_uniform_weights[0]){
2381 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2382 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2387 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2405 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2408 mj_scalar_t &max_coordinate,
2409 mj_scalar_t &min_coordinate,
2410 mj_part_t &concurrent_current_part_index,
2411 mj_lno_t coordinate_begin_index,
2412 mj_lno_t coordinate_end_index,
2413 mj_lno_t *mj_current_coordinate_permutations,
2414 mj_scalar_t *mj_current_dim_coords,
2415 mj_part_t *mj_part_ids,
2416 mj_part_t &partition_count
2418 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2422 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2423 #ifdef HAVE_ZOLTAN2_OMP 2424 #pragma omp parallel for 2426 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2427 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2434 mj_scalar_t slice = coordinate_range / partition_count;
2436 #ifdef HAVE_ZOLTAN2_OMP 2437 #pragma omp parallel for 2439 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2441 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2442 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2443 mj_part_ids[iii] = 2 * pp;
2459 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2462 mj_scalar_t *mj_current_dim_coords,
2463 mj_scalar_t used_imbalance_tolerance,
2464 mj_part_t current_work_part,
2465 mj_part_t current_concurrent_num_parts,
2466 mj_scalar_t *current_cut_coordinates,
2467 mj_part_t total_incomplete_cut_count,
2468 std::vector <mj_part_t> &num_partitioning_in_current_dim
2472 mj_part_t rectilinear_cut_count = 0;
2473 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2476 *reductionOp = NULL;
2478 <mj_part_t, mj_scalar_t>(
2479 &num_partitioning_in_current_dim ,
2481 current_concurrent_num_parts);
2483 size_t total_reduction_size = 0;
2484 #ifdef HAVE_ZOLTAN2_OMP 2485 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) 2489 #ifdef HAVE_ZOLTAN2_OMP 2490 me = omp_get_thread_num();
2492 double *my_thread_part_weights = this->thread_part_weights[me];
2493 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2494 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2496 #ifdef HAVE_ZOLTAN2_OMP 2502 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2504 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2505 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2506 total_reduction_size += (4 * num_cut_in_dim + 1);
2508 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2509 this->is_cut_line_determined[next] =
false;
2510 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2511 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2513 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2514 this->cut_lower_bound_weights[next] = 0;
2516 if(this->distribute_points_on_cut_lines){
2517 this->process_cut_line_weight_to_put_left[next] = 0;
2528 while (total_incomplete_cut_count != 0){
2531 mj_part_t concurrent_cut_shifts = 0;
2532 size_t total_part_shift = 0;
2534 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2535 mj_part_t num_parts = -1;
2536 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2538 mj_part_t num_cuts = num_parts - 1;
2539 size_t total_part_count = num_parts + size_t (num_cuts) ;
2540 if (this->my_incomplete_cut_count[kk] > 0){
2543 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2544 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2545 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2546 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2548 mj_part_t conccurent_current_part = current_work_part + kk;
2549 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2550 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2551 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2553 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2554 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2557 this->mj_1D_part_get_thread_part_weights(
2562 coordinate_begin_index,
2563 coordinate_end_index,
2564 mj_current_dim_coords,
2565 temp_current_cut_coords,
2567 my_current_part_weights,
2568 my_current_left_closest,
2569 my_current_right_closest);
2573 concurrent_cut_shifts += num_cuts;
2574 total_part_shift += total_part_count;
2578 this->mj_accumulate_thread_results(
2579 num_partitioning_in_current_dim,
2581 current_concurrent_num_parts);
2584 #ifdef HAVE_ZOLTAN2_OMP 2588 if(this->comm->getSize() > 1){
2589 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2590 total_reduction_size,
2591 this->total_part_weight_left_right_closests,
2592 this->global_total_part_weight_left_right_closests);
2597 this->global_total_part_weight_left_right_closests,
2598 this->total_part_weight_left_right_closests,
2599 total_reduction_size *
sizeof(mj_scalar_t));
2604 mj_part_t cut_shift = 0;
2607 size_t tlr_shift = 0;
2608 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2609 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2610 mj_part_t num_cuts = num_parts - 1;
2611 size_t num_total_part = num_parts + size_t (num_cuts) ;
2616 if (this->my_incomplete_cut_count[kk] == 0) {
2617 cut_shift += num_cuts;
2618 tlr_shift += (num_total_part + 2 * num_cuts);
2622 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2623 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2624 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2625 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2626 mj_scalar_t *current_global_part_weights = current_global_tlr;
2627 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2629 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2630 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2632 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2633 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2634 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2635 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2636 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2637 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2638 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2640 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2643 this->mj_get_new_cut_coordinates(
2648 global_total_weight,
2649 used_imbalance_tolerance,
2650 current_global_part_weights,
2651 current_local_part_weights,
2652 current_part_target_weights,
2653 current_cut_line_determined,
2654 temp_cut_coords + cut_shift,
2655 current_cut_upper_bounds,
2656 current_cut_lower_bounds,
2657 current_global_left_closest_points,
2658 current_global_right_closest_points,
2659 current_cut_lower_bound_weights,
2660 current_cut_upper_weights,
2661 this->cut_coordinates_work_array +cut_shift,
2662 current_part_cut_line_weight_to_put_left,
2663 &rectilinear_cut_count,
2664 this->my_incomplete_cut_count[kk]);
2666 cut_shift += num_cuts;
2667 tlr_shift += (num_total_part + 2 * num_cuts);
2668 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
2669 #ifdef HAVE_ZOLTAN2_OMP 2673 total_incomplete_cut_count -= iteration_complete_cut_count;
2677 #ifdef HAVE_ZOLTAN2_OMP 2683 mj_scalar_t *t = temp_cut_coords;
2684 temp_cut_coords = this->cut_coordinates_work_array;
2685 this->cut_coordinates_work_array = t;
2693 if (current_cut_coordinates != temp_cut_coords){
2694 #ifdef HAVE_ZOLTAN2_OMP 2699 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2700 mj_part_t num_parts = -1;
2701 num_parts = num_partitioning_in_current_dim[current_work_part + i];
2702 mj_part_t num_cuts = num_parts - 1;
2704 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
2705 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
2711 #ifdef HAVE_ZOLTAN2_OMP 2715 this->cut_coordinates_work_array = temp_cut_coords;
2742 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2745 size_t total_part_count,
2747 mj_scalar_t max_coord,
2748 mj_scalar_t min_coord,
2749 mj_lno_t coordinate_begin_index,
2750 mj_lno_t coordinate_end_index,
2751 mj_scalar_t *mj_current_dim_coords,
2752 mj_scalar_t *temp_current_cut_coords,
2753 bool *current_cut_status,
2754 double *my_current_part_weights,
2755 mj_scalar_t *my_current_left_closest,
2756 mj_scalar_t *my_current_right_closest){
2759 for (
size_t i = 0; i < total_part_count; ++i){
2760 my_current_part_weights[i] = 0;
2765 for(mj_part_t i = 0; i < num_cuts; ++i){
2766 my_current_left_closest[i] = min_coord - 1;
2767 my_current_right_closest[i] = max_coord + 1;
2770 mj_scalar_t minus_EPSILON = -this->sEpsilon;
2771 #ifdef HAVE_ZOLTAN2_OMP 2777 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2778 int i = this->coordinate_permutations[ii];
2782 mj_part_t j = this->assigned_part_ids[i] / 2;
2788 mj_part_t lower_cut_index = 0;
2789 mj_part_t upper_cut_index = num_cuts - 1;
2791 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
2792 bool is_inserted =
false;
2793 bool is_on_left_of_cut =
false;
2794 bool is_on_right_of_cut =
false;
2795 mj_part_t last_compared_part = -1;
2797 mj_scalar_t coord = mj_current_dim_coords[i];
2799 while(upper_cut_index >= lower_cut_index)
2802 last_compared_part = -1;
2803 is_on_left_of_cut =
false;
2804 is_on_right_of_cut =
false;
2805 mj_scalar_t cut = temp_current_cut_coords[j];
2806 mj_scalar_t distance_to_cut = coord - cut;
2807 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
2810 if(abs_distance_to_cut < this->sEpsilon){
2812 my_current_part_weights[j * 2 + 1] += w;
2813 this->assigned_part_ids[i] = j * 2 + 1;
2816 my_current_left_closest[j] = coord;
2817 my_current_right_closest[j] = coord;
2820 mj_part_t kk = j + 1;
2821 while(kk < num_cuts){
2823 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2824 if(distance_to_cut < this->sEpsilon){
2825 my_current_part_weights[2 * kk + 1] += w;
2826 my_current_left_closest[kk] = coord;
2827 my_current_right_closest[kk] = coord;
2833 if(coord - my_current_left_closest[kk] > this->sEpsilon){
2834 my_current_left_closest[kk] = coord;
2844 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
2845 if(distance_to_cut < this->sEpsilon){
2846 my_current_part_weights[2 * kk + 1] += w;
2848 this->assigned_part_ids[i] = kk * 2 + 1;
2849 my_current_left_closest[kk] = coord;
2850 my_current_right_closest[kk] = coord;
2856 if(my_current_right_closest[kk] - coord > this->sEpsilon){
2857 my_current_right_closest[kk] = coord;
2868 if (distance_to_cut < 0) {
2869 bool _break =
false;
2873 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
2874 if(distance_to_next_cut > this->sEpsilon){
2880 upper_cut_index = j - 1;
2882 is_on_left_of_cut =
true;
2883 last_compared_part = j;
2888 bool _break =
false;
2889 if(j < num_cuts - 1){
2892 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
2893 if(distance_to_next_cut < minus_EPSILON){
2900 lower_cut_index = j + 1;
2902 is_on_right_of_cut =
true;
2903 last_compared_part = j;
2908 j = (upper_cut_index + lower_cut_index) / 2;
2911 if(is_on_right_of_cut){
2914 my_current_part_weights[2 * last_compared_part + 2] += w;
2915 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
2918 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
2919 my_current_right_closest[last_compared_part] = coord;
2922 if(last_compared_part+1 < num_cuts){
2924 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
2925 my_current_left_closest[last_compared_part + 1] = coord;
2930 else if(is_on_left_of_cut){
2933 my_current_part_weights[2 * last_compared_part] += w;
2934 this->assigned_part_ids[i] = 2 * last_compared_part;
2938 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
2939 my_current_left_closest[last_compared_part] = coord;
2943 if(last_compared_part-1 >= 0){
2944 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
2945 my_current_right_closest[last_compared_part -1] = coord;
2954 for (
size_t i = 1; i < total_part_count; ++i){
2958 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
2959 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
2964 my_current_part_weights[i] = my_current_part_weights[i-2];
2968 my_current_part_weights[i] += my_current_part_weights[i-1];
2980 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2983 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
2984 mj_part_t current_work_part,
2985 mj_part_t current_concurrent_num_parts){
2987 #ifdef HAVE_ZOLTAN2_OMP 2994 size_t tlr_array_shift = 0;
2995 mj_part_t cut_shift = 0;
2998 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3000 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3001 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3002 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3005 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3006 mj_part_t next = tlr_array_shift + ii;
3007 mj_part_t cut_index = cut_shift + ii;
3008 if(this->is_cut_line_determined[cut_index])
continue;
3009 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3010 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3013 for (
int j = 1; j < this->num_threads; ++j){
3014 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3015 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3017 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3018 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3022 this->total_part_weight_left_right_closests[num_total_part_in_part +
3023 next] = left_closest_in_process;
3024 this->total_part_weight_left_right_closests[num_total_part_in_part +
3025 num_cuts_in_part + next] = right_closest_in_process;
3028 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3029 cut_shift += num_cuts_in_part;
3032 tlr_array_shift = 0;
3034 size_t total_part_array_shift = 0;
3037 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3039 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3040 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3041 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3043 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3045 mj_part_t cut_ind = j / 2 + cut_shift;
3050 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3052 for (
int k = 0; k < this->num_threads; ++k){
3053 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3056 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3058 cut_shift += num_cuts_in_part;
3059 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3060 total_part_array_shift += num_total_part_in_part;
3078 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3081 mj_scalar_t cut_upper_bound,
3082 mj_scalar_t cut_lower_bound,
3083 mj_scalar_t cut_upper_weight,
3084 mj_scalar_t cut_lower_weight,
3085 mj_scalar_t expected_weight,
3086 mj_scalar_t &new_cut_position){
3088 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3089 new_cut_position = cut_upper_bound;
3093 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3094 new_cut_position = cut_lower_bound;
3097 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3098 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3099 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3101 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3102 int scale_constant = 20;
3103 int shiftint= int (required_shift * scale_constant);
3104 if (shiftint == 0) shiftint = 1;
3105 required_shift = mj_scalar_t (shiftint) / scale_constant;
3106 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3120 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3123 mj_part_t num_parts,
3124 mj_scalar_t *mj_current_dim_coords,
3125 mj_scalar_t *current_concurrent_cut_coordinate,
3126 mj_lno_t coordinate_begin,
3127 mj_lno_t coordinate_end,
3128 mj_scalar_t *used_local_cut_line_weight_to_left,
3129 double **used_thread_part_weight_work,
3130 mj_lno_t *out_part_xadj){
3132 mj_part_t num_cuts = num_parts - 1;
3134 #ifdef HAVE_ZOLTAN2_OMP 3135 #pragma omp parallel 3139 #ifdef HAVE_ZOLTAN2_OMP 3140 me = omp_get_thread_num();
3143 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3144 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3148 if (this->distribute_points_on_cut_lines){
3149 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3151 #ifdef HAVE_ZOLTAN2_OMP 3154 for (mj_part_t i = 0; i < num_cuts; ++i){
3156 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3157 for(
int ii = 0; ii < this->num_threads; ++ii){
3158 if(left_weight > this->sEpsilon){
3160 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3161 if(thread_ii_weight_on_cut < left_weight){
3163 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3167 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3169 left_weight -= thread_ii_weight_on_cut;
3172 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3180 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3181 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3182 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3185 / mj_scalar_t(SIGNIFICANCE_MUL);
3190 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3191 thread_num_points_in_parts[ii] = 0;
3195 #ifdef HAVE_ZOLTAN2_OMP 3199 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3201 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3202 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3203 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3204 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3205 if(coordinate_assigned_place % 2 == 1){
3207 if(this->distribute_points_on_cut_lines
3208 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3212 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3217 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3218 && coordinate_assigned_part < num_cuts - 1
3219 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3220 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3221 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3223 ++thread_num_points_in_parts[coordinate_assigned_part];
3224 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3228 ++coordinate_assigned_part;
3230 while(this->distribute_points_on_cut_lines &&
3231 coordinate_assigned_part < num_cuts){
3233 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3234 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3237 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3239 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3240 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3241 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3244 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3245 coordinate_assigned_part < num_cuts - 1 &&
3246 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3247 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3248 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3256 ++coordinate_assigned_part;
3258 ++thread_num_points_in_parts[coordinate_assigned_part];
3259 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3264 ++thread_num_points_in_parts[coordinate_assigned_part];
3265 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3274 #ifdef HAVE_ZOLTAN2_OMP 3277 for(mj_part_t j = 0; j < num_parts; ++j){
3278 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3279 for (
int i = 0; i < this->num_threads; ++i){
3280 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3282 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3283 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3286 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3290 #ifdef HAVE_ZOLTAN2_OMP 3295 for(mj_part_t j = 1; j < num_parts; ++j){
3296 out_part_xadj[j] += out_part_xadj[j - 1];
3302 for(mj_part_t j = 1; j < num_parts; ++j){
3303 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3309 #ifdef HAVE_ZOLTAN2_OMP 3312 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3313 mj_lno_t i = this->coordinate_permutations[ii];
3314 mj_part_t p = this->assigned_part_ids[i];
3315 this->new_coordinate_permutations[coordinate_begin +
3316 thread_num_points_in_parts[p]++] = i;
3351 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3354 const size_t &num_total_part,
3355 const mj_part_t &num_cuts,
3356 const mj_scalar_t &max_coordinate,
3357 const mj_scalar_t &min_coordinate,
3358 const mj_scalar_t &global_total_weight,
3359 const mj_scalar_t &used_imbalance_tolerance,
3360 mj_scalar_t * current_global_part_weights,
3361 const mj_scalar_t * current_local_part_weights,
3362 const mj_scalar_t *current_part_target_weights,
3363 bool *current_cut_line_determined,
3364 mj_scalar_t *current_cut_coordinates,
3365 mj_scalar_t *current_cut_upper_bounds,
3366 mj_scalar_t *current_cut_lower_bounds,
3367 mj_scalar_t *current_global_left_closest_points,
3368 mj_scalar_t *current_global_right_closest_points,
3369 mj_scalar_t * current_cut_lower_bound_weights,
3370 mj_scalar_t * current_cut_upper_weights,
3371 mj_scalar_t *new_current_cut_coordinates,
3372 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3373 mj_part_t *rectilinear_cut_count,
3374 mj_part_t &my_num_incomplete_cut){
3377 mj_scalar_t seen_weight_in_part = 0;
3379 mj_scalar_t expected_weight_in_part = 0;
3381 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3384 #ifdef HAVE_ZOLTAN2_OMP 3387 for (mj_part_t i = 0; i < num_cuts; i++){
3390 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3391 current_global_left_closest_points[i] = current_cut_coordinates[i];
3392 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3393 current_global_right_closest_points[i] = current_cut_coordinates[i];
3396 #ifdef HAVE_ZOLTAN2_OMP 3399 for (mj_part_t i = 0; i < num_cuts; i++){
3401 if(this->distribute_points_on_cut_lines){
3403 this->global_rectilinear_cut_weight[i] = 0;
3404 this->process_rectilinear_cut_weight[i] = 0;
3408 if(current_cut_line_determined[i]) {
3409 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3414 seen_weight_in_part = current_global_part_weights[i * 2];
3423 expected_weight_in_part = current_part_target_weights[i];
3425 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3427 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3429 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3430 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3433 if(is_left_imbalance_valid && is_right_imbalance_valid){
3434 current_cut_line_determined[i] =
true;
3435 #ifdef HAVE_ZOLTAN2_OMP 3438 my_num_incomplete_cut -= 1;
3439 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3442 else if(imbalance_on_left < 0){
3445 if(this->distribute_points_on_cut_lines){
3450 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3452 current_cut_line_determined[i] =
true;
3453 #ifdef HAVE_ZOLTAN2_OMP 3456 my_num_incomplete_cut -= 1;
3459 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3463 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3466 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3470 current_cut_line_determined[i] =
true;
3471 #ifdef HAVE_ZOLTAN2_OMP 3474 *rectilinear_cut_count += 1;
3477 #ifdef HAVE_ZOLTAN2_OMP 3480 my_num_incomplete_cut -= 1;
3481 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3482 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3483 current_local_part_weights[i * 2];
3488 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3490 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3494 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3495 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3496 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3498 if(p_weight >= expected_weight_in_part){
3503 if(p_weight == expected_weight_in_part){
3504 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3505 current_cut_upper_weights[i] = p_weight;
3506 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3507 current_cut_lower_bound_weights[i] = p_weight;
3508 }
else if (p_weight < current_cut_upper_weights[i]){
3511 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3512 current_cut_upper_weights[i] = p_weight;
3518 if(line_weight >= expected_weight_in_part){
3521 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3522 current_cut_upper_weights[i] = line_weight;
3523 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3524 current_cut_lower_bound_weights[i] = p_weight;
3529 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3530 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3531 current_cut_lower_bound_weights[i] = p_weight;
3536 mj_scalar_t new_cut_position = 0;
3537 this->mj_calculate_new_cut_position(
3538 current_cut_upper_bounds[i],
3539 current_cut_lower_bounds[i],
3540 current_cut_upper_weights[i],
3541 current_cut_lower_bound_weights[i],
3542 expected_weight_in_part, new_cut_position);
3546 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3549 current_cut_line_determined[i] =
true;
3550 #ifdef HAVE_ZOLTAN2_OMP 3553 my_num_incomplete_cut -= 1;
3556 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3558 new_current_cut_coordinates [i] = new_cut_position;
3564 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3565 current_cut_upper_weights[i] = seen_weight_in_part;
3568 for (
int ii = i - 1; ii >= 0; --ii){
3569 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3570 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3571 if(p_weight <= expected_weight_in_part){
3572 if(p_weight == expected_weight_in_part){
3575 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3576 current_cut_upper_weights[i] = p_weight;
3577 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3578 current_cut_lower_bound_weights[i] = p_weight;
3580 else if (p_weight > current_cut_lower_bound_weights[i]){
3583 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3584 current_cut_lower_bound_weights[i] = p_weight;
3590 if(line_weight > expected_weight_in_part){
3591 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3592 current_cut_upper_weights[i] = line_weight;
3601 if (p_weight >= expected_weight_in_part &&
3602 (p_weight < current_cut_upper_weights[i] ||
3603 (p_weight == current_cut_upper_weights[i] &&
3604 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3608 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3609 current_cut_upper_weights[i] = p_weight;
3612 mj_scalar_t new_cut_position = 0;
3613 this->mj_calculate_new_cut_position(
3614 current_cut_upper_bounds[i],
3615 current_cut_lower_bounds[i],
3616 current_cut_upper_weights[i],
3617 current_cut_lower_bound_weights[i],
3618 expected_weight_in_part,
3622 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3624 current_cut_line_determined[i] =
true;
3625 #ifdef HAVE_ZOLTAN2_OMP 3628 my_num_incomplete_cut -= 1;
3630 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3632 new_current_cut_coordinates [ i] = new_cut_position;
3639 #ifdef HAVE_ZOLTAN2_OMP 3644 if(*rectilinear_cut_count > 0){
3647 Teuchos::scan<int,mj_scalar_t>(
3648 *comm, Teuchos::REDUCE_SUM,
3650 this->process_rectilinear_cut_weight,
3651 this->global_rectilinear_cut_weight
3656 for (mj_part_t i = 0; i < num_cuts; ++i){
3658 if(this->global_rectilinear_cut_weight[i] > 0) {
3660 mj_scalar_t expected_part_weight = current_part_target_weights[i];
3662 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
3664 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
3666 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
3669 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
3671 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
3681 if(space_left_to_me < 0){
3683 current_part_cut_line_weight_to_put_left[i] = 0;
3685 else if(space_left_to_me >= my_weight_on_line){
3688 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
3693 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
3700 *rectilinear_cut_count = 0;
3714 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3717 mj_part_t num_procs,
3718 mj_part_t num_parts,
3719 mj_gno_t *&num_points_in_all_processor_parts){
3722 size_t allocation_size = num_parts * (num_procs + 1);
3729 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
3734 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
3737 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
3740 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
3743 for (mj_part_t i = 0; i < num_parts; ++i){
3744 mj_lno_t part_begin_index = 0;
3746 part_begin_index = this->new_part_xadj[i - 1];
3748 mj_lno_t part_end_index = this->new_part_xadj[i];
3749 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
3754 memcpy (my_local_point_counts_in_each_art,
3755 my_local_points_to_reduce_sum,
3756 sizeof(mj_gno_t) * (num_parts) );
3764 reduceAll<int, mj_gno_t>(
3766 Teuchos::REDUCE_SUM,
3768 num_local_points_in_each_part_to_reduce_sum,
3769 num_points_in_all_processor_parts);
3772 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
3789 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3792 size_t migration_reduce_all_population,
3793 mj_lno_t num_coords_for_last_dim_part,
3794 mj_part_t num_procs,
3795 mj_part_t num_parts,
3796 mj_gno_t *num_points_in_all_processor_parts){
3804 if (this->check_migrate_avoid_migration_option == 0){
3805 double global_imbalance = 0;
3807 size_t global_shift = num_procs * num_parts;
3809 for (mj_part_t ii = 0; ii < num_procs; ++ii){
3810 for (mj_part_t i = 0; i < num_parts; ++i){
3811 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
3812 / double(num_procs);
3815 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
3818 global_imbalance /= num_parts;
3819 global_imbalance /= num_procs;
3827 if(global_imbalance <= this->minimum_migration_imbalance){
3850 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3853 mj_part_t num_parts,
3854 mj_part_t *part_assignment_proc_begin_indices,
3855 mj_part_t *processor_chains_in_parts,
3856 mj_lno_t *send_count_to_each_proc,
3857 int *coordinate_destinations){
3859 for (mj_part_t p = 0; p < num_parts; ++p){
3860 mj_lno_t part_begin = 0;
3861 if (p > 0) part_begin = this->new_part_xadj[p - 1];
3862 mj_lno_t part_end = this->new_part_xadj[p];
3865 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
3867 mj_lno_t num_total_send = 0;
3868 for (mj_lno_t j=part_begin; j < part_end; j++){
3869 mj_lno_t local_ind = this->new_coordinate_permutations[j];
3870 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
3874 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
3876 processor_chains_in_parts[proc_to_sent] = -1;
3878 proc_to_sent = part_assignment_proc_begin_indices[p];
3881 coordinate_destinations[local_ind] = proc_to_sent;
3901 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3904 mj_gno_t * num_points_in_all_processor_parts,
3905 mj_part_t num_parts,
3906 mj_part_t num_procs,
3907 mj_lno_t *send_count_to_each_proc,
3908 std::vector<mj_part_t> &processor_ranks_for_subcomm,
3909 std::vector<mj_part_t> *next_future_num_parts_in_parts,
3910 mj_part_t &out_part_index,
3911 mj_part_t &output_part_numbering_begin_index,
3912 int *coordinate_destinations){
3915 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
3916 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
3919 bool did_i_find_my_group =
false;
3921 mj_part_t num_free_procs = num_procs;
3922 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
3924 double max_imbalance_difference = 0;
3925 mj_part_t max_differing_part = 0;
3928 for (mj_part_t i=0; i < num_parts; i++){
3931 double scalar_required_proc = num_procs *
3932 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
3935 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
3939 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
3940 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
3944 num_free_procs -= required_proc;
3946 --minimum_num_procs_required_for_rest_of_parts;
3949 num_procs_assigned_to_each_part[i] = required_proc;
3954 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
3955 if (imbalance_wrt_ideal > max_imbalance_difference){
3956 max_imbalance_difference = imbalance_wrt_ideal;
3957 max_differing_part = i;
3962 if (num_free_procs > 0){
3963 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
3970 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
3972 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
3973 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
3982 for (
int i = 0; i < num_procs; ++i ){
3983 processor_part_assignments[i] = -1;
3984 processor_chains_in_parts[i] = -1;
3986 for (
int i = 0; i < num_parts; ++i ){
3987 part_assignment_proc_begin_indices[i] = -1;
3993 for(mj_part_t i = 0; i < num_parts; ++i){
3997 for(mj_part_t ii = 0; ii < num_procs; ++ii){
3998 sort_item_num_part_points_in_procs[ii].
id = ii;
4001 if (processor_part_assignments[ii] == -1){
4002 sort_item_num_part_points_in_procs[ii].
val =
4003 num_points_in_all_processor_parts[ii * num_parts + i];
4009 sort_item_num_part_points_in_procs[ii].
val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
4013 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_part_points_in_procs);
4015 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4016 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4017 mj_gno_t ideal_num_points_in_a_proc =
4018 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4021 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4022 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4023 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4027 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4028 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].
id;
4030 processor_part_assignments[proc_id] = i;
4033 bool did_change_sign =
false;
4035 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4038 if (sort_item_num_part_points_in_procs[ii].val < 0){
4039 did_change_sign =
true;
4040 sort_item_num_part_points_in_procs[ii].
val = -sort_item_num_part_points_in_procs[ii].
val - 1;
4046 if(did_change_sign){
4048 uqsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4053 if (!did_i_find_my_group){
4054 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4056 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].
id;
4058 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4060 if(proc_id_to_assign == this->myRank){
4062 did_i_find_my_group =
true;
4064 part_assignment_proc_begin_indices[i] = this->myRank;
4065 processor_chains_in_parts[this->myRank] = -1;
4068 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].
val;
4071 for (mj_part_t in = 0; in < i; ++in){
4072 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4079 if (!did_i_find_my_group){
4080 processor_ranks_for_subcomm.clear();
4088 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4089 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].
id;
4090 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].
val;
4095 if (num_points_to_sent < 0) {
4096 cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4102 while (num_points_to_sent > 0){
4104 if (num_points_to_sent <= space_left_in_sent_proc){
4106 space_left_in_sent_proc -= num_points_to_sent;
4108 if (this->myRank == nonassigned_proc_id){
4110 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4113 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4114 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4115 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4117 num_points_to_sent = 0;
4121 if(space_left_in_sent_proc > 0){
4122 num_points_to_sent -= space_left_in_sent_proc;
4125 if (this->myRank == nonassigned_proc_id){
4127 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4128 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4129 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4130 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4135 ++next_proc_to_send_index;
4138 if(next_part_to_send_index < nprocs - required_proc_count ){
4139 cout <<
"Migration - processor assignments - for part:" 4141 <<
" next_part_to_send :" << next_part_to_send_index
4142 <<
" nprocs:" << nprocs
4143 <<
" required_proc_count:" << required_proc_count
4144 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4150 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4152 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4160 this->assign_send_destinations(
4162 part_assignment_proc_begin_indices,
4163 processor_chains_in_parts,
4164 send_count_to_each_proc,
4165 coordinate_destinations);
4167 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4168 freeArray<mj_part_t>(processor_chains_in_parts);
4169 freeArray<mj_part_t>(processor_part_assignments);
4170 freeArray<uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_part_points_in_procs);
4171 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4188 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4191 mj_part_t num_parts,
4193 int *coordinate_destinations,
4194 mj_part_t &output_part_numbering_begin_index,
4195 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4197 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4198 mj_part_t previous_processor = -1;
4199 for(mj_part_t i = 0; i < num_parts; ++i){
4200 mj_part_t p = sort_item_part_to_proc_assignment[i].
id;
4202 mj_lno_t part_begin_index = 0;
4203 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4204 mj_lno_t part_end_index = this->new_part_xadj[p];
4206 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].
val;
4207 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4208 output_part_numbering_begin_index = part_shift_amount;
4210 previous_processor = assigned_proc;
4211 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4213 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4214 mj_lno_t localInd = this->new_coordinate_permutations[j];
4215 coordinate_destinations[localInd] = assigned_proc;
4237 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4240 mj_gno_t * num_points_in_all_processor_parts,
4241 mj_part_t num_parts,
4242 mj_part_t num_procs,
4243 mj_lno_t *send_count_to_each_proc,
4244 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4245 mj_part_t &out_num_part,
4246 std::vector<mj_part_t> &out_part_indices,
4247 mj_part_t &output_part_numbering_begin_index,
4248 int *coordinate_destinations){
4251 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4252 out_part_indices.clear();
4261 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4263 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4265 for (mj_part_t i = 0; i < num_procs; ++i){
4266 space_in_each_processor[i] = work_each;
4273 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4274 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4275 int empty_proc_count = num_procs;
4283 for (mj_part_t i = 0; i < num_parts; ++i){
4284 sort_item_point_counts_in_parts[i].
id = i;
4285 sort_item_point_counts_in_parts[i].
val = global_num_points_in_parts[i];
4288 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4294 for (mj_part_t j = 0; j < num_parts; ++j){
4296 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].
id;
4298 mj_gno_t load = global_num_points_in_parts[i];
4301 mj_part_t assigned_proc = -1;
4303 mj_part_t best_proc_to_assign = 0;
4307 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4308 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4313 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4315 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4318 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4321 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4324 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4325 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4326 mj_lno_t left_space = space_in_each_processor[ii] - load;
4328 if(left_space >= 0 ){
4333 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4334 best_proc_to_assign = ii;
4339 if (assigned_proc == -1){
4340 assigned_proc = best_proc_to_assign;
4343 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4346 space_in_each_processor[assigned_proc] -= load;
4348 sort_item_part_to_proc_assignment[j].
id = i;
4349 sort_item_part_to_proc_assignment[j].
val = assigned_proc;
4353 if (assigned_proc == this->myRank){
4355 out_part_indices.push_back(i);
4359 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4361 freeArray<mj_part_t>(num_parts_proc_assigned);
4362 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4363 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4364 freeArray<mj_lno_t >(space_in_each_processor);
4368 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4372 this->assign_send_destinations2(
4374 sort_item_part_to_proc_assignment,
4375 coordinate_destinations,
4376 output_part_numbering_begin_index,
4377 next_future_num_parts_in_parts);
4379 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4400 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4403 mj_gno_t * num_points_in_all_processor_parts,
4404 mj_part_t num_parts,
4405 mj_part_t num_procs,
4406 mj_lno_t *send_count_to_each_proc,
4407 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4408 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4409 mj_part_t &out_num_part,
4410 std::vector<mj_part_t> &out_part_indices,
4411 mj_part_t &output_part_numbering_begin_index,
4412 int *coordinate_destinations){
4416 processor_ranks_for_subcomm.clear();
4418 if (num_procs > num_parts){
4423 mj_part_t out_part_index = 0;
4424 this->mj_assign_proc_to_parts(
4425 num_points_in_all_processor_parts,
4428 send_count_to_each_proc,
4429 processor_ranks_for_subcomm,
4430 next_future_num_parts_in_parts,
4432 output_part_numbering_begin_index,
4433 coordinate_destinations
4437 out_part_indices.clear();
4438 out_part_indices.push_back(out_part_index);
4445 processor_ranks_for_subcomm.push_back(this->myRank);
4449 this->mj_assign_parts_to_procs(
4450 num_points_in_all_processor_parts,
4453 send_count_to_each_proc,
4454 next_future_num_parts_in_parts,
4457 output_part_numbering_begin_index,
4458 coordinate_destinations);
4474 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4477 mj_part_t num_procs,
4478 mj_lno_t &num_new_local_points,
4479 std::string iteration,
4480 int *coordinate_destinations,
4481 mj_part_t num_parts)
4483 #ifdef ENABLE_ZOLTAN_MIGRATION 4484 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4490 ZOLTAN_COMM_OBJ *plan = NULL;
4491 MPI_Comm mpi_comm = Teuchos2MPI (this->comm);
4492 int num_incoming_gnos = 0;
4493 int message_tag = 7859;
4495 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4496 int ierr = Zoltan_Comm_Create(
4498 int(this->num_local_coords),
4499 coordinate_destinations,
4502 &num_incoming_gnos);
4504 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4506 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4507 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4511 ierr = Zoltan_Comm_Do(
4514 (
char *) this->current_mj_gnos,
4516 (
char *) incoming_gnos);
4519 freeArray<mj_gno_t>(this->current_mj_gnos);
4520 this->current_mj_gnos = incoming_gnos;
4524 for (
int i = 0; i < this->coord_dim; ++i){
4526 mj_scalar_t *coord = this->mj_coordinates[i];
4528 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4529 ierr = Zoltan_Comm_Do(
4533 sizeof(mj_scalar_t),
4534 (
char *) this->mj_coordinates[i]);
4536 freeArray<mj_scalar_t>(coord);
4540 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4542 mj_scalar_t *weight = this->mj_weights[i];
4544 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4545 ierr = Zoltan_Comm_Do(
4549 sizeof(mj_scalar_t),
4550 (
char *) this->mj_weights[i]);
4552 freeArray<mj_scalar_t>(weight);
4557 int *coord_own = allocMemory<int>(num_incoming_gnos);
4559 ierr = Zoltan_Comm_Do(
4562 (
char *) this->owner_of_coordinate,
4563 sizeof(
int), (
char *) coord_own);
4565 freeArray<int>(this->owner_of_coordinate);
4566 this->owner_of_coordinate = coord_own;
4572 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4573 if(num_procs < num_parts){
4575 ierr = Zoltan_Comm_Do(
4578 (
char *) this->assigned_part_ids,
4580 (
char *) new_parts);
4583 freeArray<mj_part_t>(this->assigned_part_ids);
4584 this->assigned_part_ids = new_parts;
4586 ierr = Zoltan_Comm_Destroy(&plan);
4588 num_new_local_points = num_incoming_gnos;
4589 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4594 #endif // ENABLE_ZOLTAN_MIGRATION 4596 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4597 Tpetra::Distributor distributor(this->comm);
4598 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
4599 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
4600 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
4602 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4605 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
4606 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
4607 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
4608 freeArray<mj_gno_t>(this->current_mj_gnos);
4609 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
4611 this->current_mj_gnos,
4612 received_gnos.getRawPtr(),
4613 num_incoming_gnos *
sizeof(mj_gno_t));
4616 for (
int i = 0; i < this->coord_dim; ++i){
4618 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
4619 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
4620 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
4621 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
4622 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4624 this->mj_coordinates[i],
4625 received_coord.getRawPtr(),
4626 num_incoming_gnos *
sizeof(mj_scalar_t));
4630 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4632 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
4633 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
4634 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
4635 freeArray<mj_scalar_t>(this->mj_weights[i]);
4636 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4638 this->mj_weights[i],
4639 received_weight.getRawPtr(),
4640 num_incoming_gnos *
sizeof(mj_scalar_t));
4645 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
4646 ArrayRCP<int> received_owners(num_incoming_gnos);
4647 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
4648 freeArray<int>(this->owner_of_coordinate);
4649 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
4651 this->owner_of_coordinate,
4652 received_owners.getRawPtr(),
4653 num_incoming_gnos *
sizeof(int));
4659 if(num_procs < num_parts){
4660 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
4661 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
4662 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
4663 freeArray<mj_part_t>(this->assigned_part_ids);
4664 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
4666 this->assigned_part_ids,
4667 received_partids.getRawPtr(),
4668 num_incoming_gnos *
sizeof(mj_part_t));
4671 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
4672 freeArray<mj_part_t>(this->assigned_part_ids);
4673 this->assigned_part_ids = new_parts;
4675 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
4676 num_new_local_points = num_incoming_gnos;
4687 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4690 mj_part_t group_size = processor_ranks_for_subcomm.size();
4691 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
4692 for(mj_part_t i = 0; i < group_size; ++i) {
4693 ids[i] = processor_ranks_for_subcomm[i];
4695 ArrayView<const mj_part_t> idView(ids, group_size);
4696 this->comm = this->comm->createSubcommunicator(idView);
4697 freeArray<mj_part_t>(ids);
4706 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4709 mj_part_t output_num_parts,
4710 mj_part_t num_parts){
4712 if (output_num_parts == 1){
4713 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4714 this->new_coordinate_permutations[i] = i;
4716 this->new_part_xadj[0] = this->num_local_coords;
4723 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
4725 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
4727 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
4729 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
4730 mj_part_t ii = this->assigned_part_ids[i];
4731 ++num_points_in_parts[ii];
4736 mj_lno_t prev_index = 0;
4737 for(mj_part_t i = 0; i < num_parts; ++i){
4738 if(num_points_in_parts[i] > 0) {
4739 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
4740 prev_index += num_points_in_parts[i];
4741 part_shifts[i] = p++;
4746 mj_part_t assigned_num_parts = p - 1;
4747 for (;p < num_parts; ++p){
4748 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
4750 for(mj_part_t i = 0; i < output_num_parts; ++i){
4751 num_points_in_parts[i] = this->new_part_xadj[i];
4757 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
4758 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
4759 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
4762 freeArray<mj_lno_t>(num_points_in_parts);
4763 freeArray<mj_part_t>(part_shifts);
4790 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4793 mj_part_t input_num_parts,
4794 mj_part_t &output_num_parts,
4795 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4796 mj_part_t &output_part_begin_index,
4797 size_t migration_reduce_all_population,
4798 mj_lno_t num_coords_for_last_dim_part,
4799 std::string iteration,
4800 RCP<mj_partBoxVector_t> &input_part_boxes,
4801 RCP<mj_partBoxVector_t> &output_part_boxes
4804 mj_part_t num_procs = this->comm->getSize();
4805 this->myRank = this->comm->getRank();
4811 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
4814 this->get_processor_num_points_in_parts(
4817 num_points_in_all_processor_parts);
4821 if (!this->mj_check_to_migrate(
4822 migration_reduce_all_population,
4823 num_coords_for_last_dim_part,
4826 num_points_in_all_processor_parts)){
4827 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
4832 mj_lno_t *send_count_to_each_proc = NULL;
4833 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
4834 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
4835 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
4837 std::vector<mj_part_t> processor_ranks_for_subcomm;
4838 std::vector<mj_part_t> out_part_indices;
4841 this->mj_migration_part_proc_assignment(
4842 num_points_in_all_processor_parts,
4845 send_count_to_each_proc,
4846 processor_ranks_for_subcomm,
4847 next_future_num_parts_in_parts,
4850 output_part_begin_index,
4851 coordinate_destinations);
4856 freeArray<mj_lno_t>(send_count_to_each_proc);
4857 std::vector <mj_part_t> tmpv;
4859 std::sort (out_part_indices.begin(), out_part_indices.end());
4860 mj_part_t outP = out_part_indices.size();
4862 mj_gno_t new_global_num_points = 0;
4863 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
4865 if (this->mj_keep_part_boxes){
4866 input_part_boxes->clear();
4871 for (mj_part_t i = 0; i < outP; ++i){
4872 mj_part_t ind = out_part_indices[i];
4873 new_global_num_points += global_num_points_in_parts[ind];
4874 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
4875 if (this->mj_keep_part_boxes){
4876 input_part_boxes->push_back((*output_part_boxes)[ind]);
4880 if (this->mj_keep_part_boxes){
4881 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
4882 input_part_boxes = output_part_boxes;
4883 output_part_boxes = tmpPartBoxes;
4885 next_future_num_parts_in_parts->clear();
4886 for (mj_part_t i = 0; i < outP; ++i){
4887 mj_part_t p = tmpv[i];
4888 next_future_num_parts_in_parts->push_back(p);
4891 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
4893 mj_lno_t num_new_local_points = 0;
4897 this->mj_migrate_coords(
4899 num_new_local_points,
4901 coordinate_destinations,
4905 freeArray<int>(coordinate_destinations);
4907 if(this->num_local_coords != num_new_local_points){
4908 freeArray<mj_lno_t>(this->new_coordinate_permutations);
4909 freeArray<mj_lno_t>(this->coordinate_permutations);
4911 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
4912 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
4914 this->num_local_coords = num_new_local_points;
4915 this->num_global_coords = new_global_num_points;
4920 this->create_sub_communicator(processor_ranks_for_subcomm);
4921 processor_ranks_for_subcomm.clear();
4924 this->fill_permutation_array(
4944 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4947 mj_part_t num_parts,
4948 mj_scalar_t *mj_current_dim_coords,
4949 mj_scalar_t *current_concurrent_cut_coordinate,
4950 mj_lno_t coordinate_begin,
4951 mj_lno_t coordinate_end,
4952 mj_scalar_t *used_local_cut_line_weight_to_left,
4953 mj_lno_t *out_part_xadj,
4957 mj_part_t no_cuts = num_parts - 1;
4962 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
4963 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
4968 if (this->distribute_points_on_cut_lines){
4970 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
4971 for (mj_part_t i = 0; i < no_cuts; ++i){
4973 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
4975 for(
int ii = 0; ii < this->num_threads; ++ii){
4976 if(left_weight > this->sEpsilon){
4978 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
4979 if(thread_ii_weight_on_cut < left_weight){
4980 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
4983 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
4985 left_weight -= thread_ii_weight_on_cut;
4988 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
4996 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
4997 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
4998 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5001 / mj_scalar_t(SIGNIFICANCE_MUL);
5006 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5007 thread_num_points_in_parts[ii] = 0;
5019 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5023 typedef std::vector< multiSItem > multiSVector;
5024 typedef std::vector<multiSVector> multiS2Vector;
5027 std::vector<mj_scalar_t *>allocated_memory;
5030 multiS2Vector sort_vector_points_on_cut;
5033 mj_part_t different_cut_count = 1;
5038 multiSVector tmpMultiSVector;
5039 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5041 for (mj_part_t i = 1; i < no_cuts ; ++i){
5044 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5045 cut_map[i] = cut_map[i-1];
5048 cut_map[i] = different_cut_count++;
5049 multiSVector tmp2MultiSVector;
5050 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5056 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5058 mj_lno_t i = this->coordinate_permutations[ii];
5060 mj_part_t pp = this->assigned_part_ids[i];
5061 mj_part_t p = pp / 2;
5064 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5065 allocated_memory.push_back(vals);
5069 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5070 vals[val_ind++] = this->mj_coordinates[dim][i];
5072 for(
int dim = 0; dim < coordInd; ++dim){
5073 vals[val_ind++] = this->mj_coordinates[dim][i];
5075 multiSItem tempSortItem(i, this->coord_dim -1, vals);
5077 mj_part_t cmap = cut_map[p];
5078 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5082 ++thread_num_points_in_parts[p];
5083 this->assigned_part_ids[i] = p;
5088 for (mj_part_t i = 0; i < different_cut_count; ++i){
5089 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5093 mj_part_t previous_cut_map = cut_map[0];
5102 mj_scalar_t weight_stolen_from_previous_part = 0;
5103 for (mj_part_t p = 0; p < no_cuts; ++p){
5105 mj_part_t mapped_cut = cut_map[p];
5109 if (previous_cut_map != mapped_cut){
5110 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5111 for (; sort_vector_end >= 0; --sort_vector_end){
5112 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5113 mj_lno_t i = t.index;
5114 ++thread_num_points_in_parts[p];
5115 this->assigned_part_ids[i] = p;
5117 sort_vector_points_on_cut[previous_cut_map].clear();
5119 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5121 for (; sort_vector_end >= 0; --sort_vector_end){
5122 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5123 mj_lno_t i = t.index;
5124 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5128 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5129 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5132 my_local_thread_cut_weights_to_put_left[p] -= w;
5133 sort_vector_points_on_cut[mapped_cut].pop_back();
5134 ++thread_num_points_in_parts[p];
5135 this->assigned_part_ids[i] = p;
5138 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5139 if(mapped_cut == cut_map[p + 1] ){
5142 if (previous_cut_map != mapped_cut){
5143 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5148 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5152 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5161 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5162 if (previous_cut_map != mapped_cut){
5163 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5166 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5170 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5176 previous_cut_map = mapped_cut;
5179 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5180 for (; sort_vector_end >= 0; --sort_vector_end){
5181 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5182 mj_lno_t i = t.index;
5183 ++thread_num_points_in_parts[no_cuts];
5184 this->assigned_part_ids[i] = no_cuts;
5186 sort_vector_points_on_cut[previous_cut_map].clear();
5187 freeArray<mj_part_t> (cut_map);
5190 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5191 for(mj_lno_t i = 0; i < vSize; ++i){
5192 freeArray<mj_scalar_t> (allocated_memory[i]);
5196 for(mj_part_t j = 0; j < num_parts; ++j){
5197 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5198 for (
int i = 0; i < this->num_threads; ++i){
5199 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5200 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5201 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5204 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5208 for(mj_part_t j = 1; j < num_parts; ++j){
5209 out_part_xadj[j] += out_part_xadj[j - 1];
5215 for(mj_part_t j = 1; j < num_parts; ++j){
5216 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5221 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5222 mj_lno_t i = this->coordinate_permutations[ii];
5223 mj_part_t p = this->assigned_part_ids[i];
5224 this->new_coordinate_permutations[coordinate_begin +
5225 thread_num_points_in_parts[p]++] = i;
5240 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5243 mj_part_t current_num_parts,
5244 mj_part_t output_part_begin_index,
5245 RCP<mj_partBoxVector_t> &output_part_boxes,
5246 bool is_data_ever_migrated)
5248 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5250 #ifdef HAVE_ZOLTAN2_OMP 5251 #pragma omp parallel for 5253 for(mj_part_t i = 0; i < current_num_parts;++i){
5256 mj_lno_t end = this->part_xadj[i];
5258 if(i > 0) begin = this->part_xadj[i -1];
5259 mj_part_t part_to_set_index = i + output_part_begin_index;
5260 if (this->mj_keep_part_boxes){
5261 (*output_part_boxes)[i].setpId(part_to_set_index);
5263 for (mj_lno_t ii = begin; ii < end; ++ii){
5264 mj_lno_t k = this->coordinate_permutations[ii];
5265 this->assigned_part_ids[k] = part_to_set_index;
5270 if(!is_data_ever_migrated){
5277 #ifdef ENABLE_ZOLTAN_MIGRATION 5278 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5285 ZOLTAN_COMM_OBJ *plan = NULL;
5286 MPI_Comm mpi_comm = Teuchos2MPI (this->mj_problemComm);
5289 int message_tag = 7856;
5291 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5292 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5293 this->owner_of_coordinate, mpi_comm, message_tag,
5296 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5298 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5301 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5302 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5303 sizeof(mj_gno_t), (
char *) incoming_gnos);
5306 freeArray<mj_gno_t>(this->current_mj_gnos);
5307 this->current_mj_gnos = incoming_gnos;
5309 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5312 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5313 sizeof(mj_part_t), (
char *) incoming_partIds);
5315 freeArray<mj_part_t>(this->assigned_part_ids);
5316 this->assigned_part_ids = incoming_partIds;
5318 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5319 ierr = Zoltan_Comm_Destroy(&plan);
5322 this->num_local_coords = incoming;
5327 #endif // !ENABLE_ZOLTAN_MIGRATION 5330 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5331 Tpetra::Distributor distributor(this->mj_problemComm);
5332 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5333 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5334 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5336 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5338 ArrayRCP<mj_gno_t> received_gnos(incoming);
5339 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5340 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5341 freeArray<mj_gno_t>(this->current_mj_gnos);
5342 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5343 memcpy( this->current_mj_gnos,
5344 received_gnos.getRawPtr(),
5345 incoming *
sizeof(mj_gno_t));
5348 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5349 ArrayRCP<mj_part_t> received_partids(incoming);
5350 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5351 freeArray<mj_part_t>(this->assigned_part_ids);
5352 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5353 memcpy( this->assigned_part_ids,
5354 received_partids.getRawPtr(),
5355 incoming *
sizeof(mj_part_t));
5357 this->num_local_coords = incoming;
5358 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5363 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5365 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5370 if (this->mj_keep_part_boxes){
5375 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5380 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5383 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5385 for (
int i=0; i < this->coord_dim; i++){
5386 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5388 freeArray<mj_scalar_t *>(this->mj_coordinates);
5390 for (
int i=0; i < this->num_weights_per_coord; i++){
5391 freeArray<mj_scalar_t>(this->mj_weights[i]);
5393 freeArray<mj_scalar_t *>(this->mj_weights);
5395 freeArray<int>(this->owner_of_coordinate);
5397 for(
int i = 0; i < this->num_threads; ++i){
5398 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5401 freeArray<mj_lno_t *>(this->thread_point_counts);
5402 freeArray<double *> (this->thread_part_weight_work);
5404 if(this->distribute_points_on_cut_lines){
5405 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5406 for(
int i = 0; i < this->num_threads; ++i){
5407 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5409 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5410 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5411 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5414 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5416 freeArray<mj_scalar_t>(this->max_min_coords);
5418 freeArray<mj_lno_t>(this->new_part_xadj);
5420 freeArray<mj_lno_t>(this->coordinate_permutations);
5422 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5424 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5426 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5428 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5430 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5432 freeArray<mj_scalar_t>(this->target_part_weights);
5434 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5436 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5438 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5439 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5440 freeArray<bool>(this->is_cut_line_determined);
5441 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5442 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5444 for(
int i = 0; i < this->num_threads; ++i){
5445 freeArray<double>(this->thread_part_weights[i]);
5446 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5447 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5450 freeArray<double *>(this->thread_part_weights);
5451 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5452 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5454 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5464 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5467 bool distribute_points_on_cut_lines_,
5468 int max_concurrent_part_calculation_,
5469 int check_migrate_avoid_migration_option_,
5470 mj_scalar_t minimum_migration_imbalance_){
5471 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5472 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5473 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5474 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5507 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5511 const RCP<const Environment> &env,
5512 RCP<Comm<int> > &problemComm,
5514 double imbalance_tolerance_,
5515 size_t num_global_parts_,
5516 mj_part_t *part_no_array_,
5517 int recursion_depth_,
5520 mj_lno_t num_local_coords_,
5521 mj_gno_t num_global_coords_,
5522 const mj_gno_t *initial_mj_gnos_,
5523 mj_scalar_t **mj_coordinates_,
5525 int num_weights_per_coord_,
5526 bool *mj_uniform_weights_,
5527 mj_scalar_t **mj_weights_,
5528 bool *mj_uniform_parts_,
5529 mj_scalar_t **mj_part_sizes_,
5531 mj_part_t *&result_assigned_part_ids_,
5532 mj_gno_t *&result_mj_gnos_
5537 if(comm->getRank() == 0){
5538 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5539 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5540 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5545 this->mj_problemComm = problemComm;
5546 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5548 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
5549 this->mj_env->debug(3,
"In MultiJagged Jagged");
5552 this->imbalance_tolerance = imbalance_tolerance_;
5553 this->num_global_parts = num_global_parts_;
5554 this->part_no_array = part_no_array_;
5555 this->recursion_depth = recursion_depth_;
5557 this->coord_dim = coord_dim_;
5558 this->num_local_coords = num_local_coords_;
5559 this->num_global_coords = num_global_coords_;
5560 this->mj_coordinates = mj_coordinates_;
5561 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
5563 this->num_weights_per_coord = num_weights_per_coord_;
5564 this->mj_uniform_weights = mj_uniform_weights_;
5565 this->mj_weights = mj_weights_;
5566 this->mj_uniform_parts = mj_uniform_parts_;
5567 this->mj_part_sizes = mj_part_sizes_;
5569 this->num_threads = 1;
5570 #ifdef HAVE_ZOLTAN2_OMP 5571 #pragma omp parallel 5574 this->num_threads = omp_get_num_threads();
5579 this->set_part_specifications();
5581 this->allocate_set_work_memory();
5585 this->comm = this->mj_problemComm->duplicate();
5588 mj_part_t current_num_parts = 1;
5589 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
5591 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
5593 mj_part_t output_part_begin_index = 0;
5594 mj_part_t future_num_parts = this->total_num_part;
5595 bool is_data_ever_migrated =
false;
5597 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
5598 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
5599 next_future_num_parts_in_parts->push_back(this->num_global_parts);
5601 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
5602 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
5604 compute_global_box();
5605 if(this->mj_keep_part_boxes){
5606 this->init_part_boxes(output_part_boxes);
5609 for (
int i = 0; i < this->recursion_depth; ++i){
5612 std::vector <mj_part_t> num_partitioning_in_current_dim;
5626 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
5627 future_num_part_in_parts = next_future_num_parts_in_parts;
5628 next_future_num_parts_in_parts = tmpPartVect;
5633 next_future_num_parts_in_parts->clear();
5635 if(this->mj_keep_part_boxes){
5636 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5637 input_part_boxes = output_part_boxes;
5638 output_part_boxes = tmpPartBoxes;
5639 output_part_boxes->clear();
5643 mj_part_t output_part_count_in_dimension =
5644 this->update_part_num_arrays(
5645 num_partitioning_in_current_dim,
5646 future_num_part_in_parts,
5647 next_future_num_parts_in_parts,
5657 if(output_part_count_in_dimension == current_num_parts) {
5659 tmpPartVect= future_num_part_in_parts;
5660 future_num_part_in_parts = next_future_num_parts_in_parts;
5661 next_future_num_parts_in_parts = tmpPartVect;
5663 if(this->mj_keep_part_boxes){
5664 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5665 input_part_boxes = output_part_boxes;
5666 output_part_boxes = tmpPartBoxes;
5673 int coordInd = i % this->coord_dim;
5674 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
5677 std::string istring = toString<int>(i);
5678 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
5682 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
5685 mj_part_t output_part_index = 0;
5688 mj_part_t output_coordinate_end_index = 0;
5690 mj_part_t current_work_part = 0;
5691 mj_part_t current_concurrent_num_parts =
5692 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
5694 mj_part_t obtained_part_index = 0;
5697 for (; current_work_part < current_num_parts;
5698 current_work_part += current_concurrent_num_parts){
5700 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
5701 this->max_concurrent_part_calculation);
5703 mj_part_t actual_work_part_count = 0;
5707 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5708 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
5712 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
5715 ++actual_work_part_count;
5716 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
5717 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
5725 this->mj_get_local_min_max_coord_totW(
5726 coordinate_begin_index,
5727 coordinate_end_index,
5728 this->coordinate_permutations,
5729 mj_current_dim_coords,
5730 this->process_local_min_max_coord_total_weight[kk],
5731 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
5732 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
5737 if (actual_work_part_count > 0){
5739 this->mj_get_global_min_max_coord_totW(
5740 current_concurrent_num_parts,
5741 this->process_local_min_max_coord_total_weight,
5742 this->global_min_max_coord_total_weight);
5746 mj_part_t total_incomplete_cut_count = 0;
5751 mj_part_t concurrent_part_cut_shift = 0;
5752 mj_part_t concurrent_part_part_shift = 0;
5753 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5754 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
5755 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
5756 current_concurrent_num_parts];
5758 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
5759 2 * current_concurrent_num_parts];
5761 mj_part_t concurrent_current_part_index = current_work_part + kk;
5763 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
5765 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
5766 mj_scalar_t *current_target_part_weights = this->target_part_weights +
5767 concurrent_part_part_shift;
5769 concurrent_part_cut_shift += partition_count - 1;
5771 concurrent_part_part_shift += partition_count;
5776 if(partition_count > 1 && min_coordinate <= max_coordinate){
5780 total_incomplete_cut_count += partition_count - 1;
5783 this->my_incomplete_cut_count[kk] = partition_count - 1;
5786 this->mj_get_initial_cut_coords_target_weights(
5789 partition_count - 1,
5790 global_total_weight,
5792 current_target_part_weights,
5793 future_num_part_in_parts,
5794 next_future_num_parts_in_parts,
5795 concurrent_current_part_index,
5796 obtained_part_index);
5798 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
5799 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
5803 this->set_initial_coordinate_parts(
5806 concurrent_current_part_index,
5807 coordinate_begin_index, coordinate_end_index,
5808 this->coordinate_permutations,
5809 mj_current_dim_coords,
5810 this->assigned_part_ids,
5815 this->my_incomplete_cut_count[kk] = 0;
5817 obtained_part_index += partition_count;
5824 mj_scalar_t used_imbalance = 0;
5829 mj_current_dim_coords,
5832 current_concurrent_num_parts,
5833 current_cut_coordinates,
5834 total_incomplete_cut_count,
5835 num_partitioning_in_current_dim);
5840 mj_part_t output_array_shift = 0;
5841 mj_part_t cut_shift = 0;
5842 size_t tlr_shift = 0;
5843 size_t partweight_array_shift = 0;
5845 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
5846 mj_part_t current_concurrent_work_part = current_work_part + kk;
5847 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
5850 if((num_parts != 1 )
5852 this->global_min_max_coord_total_weight[kk] >
5853 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
5857 for(mj_part_t jj = 0; jj < num_parts; ++jj){
5858 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
5860 cut_shift += num_parts - 1;
5861 tlr_shift += (4 *(num_parts - 1) + 1);
5862 output_array_shift += num_parts;
5863 partweight_array_shift += (2 * (num_parts - 1) + 1);
5867 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
5868 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
5869 current_concurrent_work_part -1];
5870 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
5871 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
5876 for(
int ii = 0; ii < this->num_threads; ++ii){
5877 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
5881 if(this->mj_keep_part_boxes){
5883 for (mj_part_t j = 0; j < num_parts - 1; ++j){
5884 (*output_part_boxes)[output_array_shift + output_part_index +
5885 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
5888 (*output_part_boxes)[output_array_shift + output_part_index + j +
5889 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
5895 this->mj_create_new_partitions(
5897 mj_current_dim_coords,
5898 current_concurrent_cut_coordinate,
5901 used_local_cut_line_weight_to_left,
5902 this->thread_part_weight_work,
5903 this->new_part_xadj + output_part_index + output_array_shift
5910 mj_lno_t part_size = coordinate_end - coordinate_begin;
5911 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
5913 this->new_coordinate_permutations + coordinate_begin,
5914 this->coordinate_permutations + coordinate_begin,
5915 part_size *
sizeof(mj_lno_t));
5917 cut_shift += num_parts - 1;
5918 tlr_shift += (4 *(num_parts - 1) + 1);
5919 output_array_shift += num_parts;
5920 partweight_array_shift += (2 * (num_parts - 1) + 1);
5930 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
5931 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
5932 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
5934 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
5937 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
5939 output_part_index += num_parts ;
5946 int current_world_size = this->comm->getSize();
5947 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
5950 bool is_migrated_in_current_dimension =
false;
5955 if (future_num_parts > 1 &&
5956 this->check_migrate_avoid_migration_option >= 0 &&
5957 current_world_size > 1){
5959 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
5960 mj_part_t num_parts = output_part_count_in_dimension;
5961 if ( this->mj_perform_migration(
5964 next_future_num_parts_in_parts,
5965 output_part_begin_index,
5966 migration_reduce_all_population,
5967 this->num_local_coords / (future_num_parts * current_num_parts),
5969 input_part_boxes, output_part_boxes) ) {
5970 is_migrated_in_current_dimension =
true;
5971 is_data_ever_migrated =
true;
5972 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
5975 this->total_dim_num_reduce_all /= num_parts;
5978 is_migrated_in_current_dimension =
false;
5979 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
5984 mj_lno_t * tmp = this->coordinate_permutations;
5985 this->coordinate_permutations = this->new_coordinate_permutations;
5986 this->new_coordinate_permutations = tmp;
5988 if(!is_migrated_in_current_dimension){
5989 this->total_dim_num_reduce_all -= current_num_parts;
5990 current_num_parts = output_part_count_in_dimension;
5992 freeArray<mj_lno_t>(this->part_xadj);
5993 this->part_xadj = this->new_part_xadj;
5995 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
5999 delete future_num_part_in_parts;
6000 delete next_future_num_parts_in_parts;
6002 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6009 this->set_final_parts(
6011 output_part_begin_index,
6013 is_data_ever_migrated);
6015 result_assigned_part_ids_ = this->assigned_part_ids;
6016 result_mj_gnos_ = this->current_mj_gnos;
6018 this->free_work_memory();
6019 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6020 this->mj_env->debug(3,
"Out of MultiJagged");
6028 template <
typename Adapter>
6033 #ifndef DOXYGEN_SHOULD_SKIP_THIS 6036 typedef typename Adapter::scalar_t mj_scalar_t;
6037 typedef typename Adapter::gno_t mj_gno_t;
6038 typedef typename Adapter::lno_t mj_lno_t;
6039 typedef typename Adapter::node_t mj_node_t;
6040 typedef typename Adapter::part_t mj_part_t;
6042 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6046 RCP<const Environment> mj_env;
6047 RCP<Comm<int> > mj_problemComm;
6048 RCP<const coordinateModel_t> mj_coords;
6051 double imbalance_tolerance;
6052 size_t num_global_parts;
6053 mj_part_t *part_no_array;
6054 int recursion_depth;
6057 mj_lno_t num_local_coords;
6058 mj_gno_t num_global_coords;
6059 const mj_gno_t *initial_mj_gnos;
6060 mj_scalar_t **mj_coordinates;
6062 int num_weights_per_coord;
6063 bool *mj_uniform_weights;
6064 mj_scalar_t **mj_weights;
6065 bool *mj_uniform_parts;
6066 mj_scalar_t **mj_part_sizes;
6068 bool distribute_points_on_cut_lines;
6069 mj_part_t max_concurrent_part_calculation;
6070 int check_migrate_avoid_migration_option;
6071 mj_scalar_t minimum_migration_imbalance;
6072 int mj_keep_part_boxes;
6078 ArrayRCP<mj_part_t> comXAdj_;
6079 ArrayRCP<mj_part_t> comAdj_;
6085 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6087 void set_up_partitioning_data(
6090 void set_input_parameters(
const Teuchos::ParameterList &p);
6092 void free_work_memory();
6094 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6099 RCP<Comm<int> > &problemComm,
6100 const RCP<const coordinateModel_t> &coords) :
6101 mj_partitioner(), mj_env(env),
6102 mj_problemComm(problemComm),
6104 imbalance_tolerance(0),
6105 num_global_parts(1), part_no_array(NULL),
6107 coord_dim(0),num_local_coords(0), num_global_coords(0),
6108 initial_mj_gnos(NULL), mj_coordinates(NULL),
6109 num_weights_per_coord(0),
6110 mj_uniform_weights(NULL), mj_weights(NULL),
6111 mj_uniform_parts(NULL),
6112 mj_part_sizes(NULL),
6113 distribute_points_on_cut_lines(true),
6114 max_concurrent_part_calculation(1),
6115 check_migrate_avoid_migration_option(0),
6116 minimum_migration_imbalance(0.30),
6117 mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0),
6118 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6121 if (coordinate_ArrayRCP_holder != NULL){
6122 delete [] this->coordinate_ArrayRCP_holder;
6123 this->coordinate_ArrayRCP_holder = NULL;
6137 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6141 mj_part_t pointAssign(
int dim, mj_scalar_t *point)
const;
6143 void boxAssign(
int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6144 size_t &nPartsFound, mj_part_t **partsFound)
const;
6149 void getCommunicationGraph(
6151 ArrayRCP<mj_part_t> &comXAdj,
6152 ArrayRCP<mj_part_t> &comAdj);
6165 template <
typename Adapter>
6170 this->set_up_partitioning_data(solution);
6171 this->set_input_parameters(this->mj_env->getParameters());
6172 if (this->mj_keep_part_boxes){
6173 this->mj_partitioner.set_to_keep_part_boxes();
6175 this->mj_partitioner.set_partitioning_parameters(
6176 this->distribute_points_on_cut_lines,
6177 this->max_concurrent_part_calculation,
6178 this->check_migrate_avoid_migration_option,
6179 this->minimum_migration_imbalance);
6181 mj_part_t *result_assigned_part_ids = NULL;
6182 mj_gno_t *result_mj_gnos = NULL;
6183 this->mj_partitioner.multi_jagged_part(
6185 this->mj_problemComm,
6187 this->imbalance_tolerance,
6188 this->num_global_parts,
6189 this->part_no_array,
6190 this->recursion_depth,
6193 this->num_local_coords,
6194 this->num_global_coords,
6195 this->initial_mj_gnos,
6196 this->mj_coordinates,
6198 this->num_weights_per_coord,
6199 this->mj_uniform_weights,
6201 this->mj_uniform_parts,
6202 this->mj_part_sizes,
6204 result_assigned_part_ids,
6209 #if defined(__cplusplus) && __cplusplus >= 201103L 6210 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6211 localGidToLid.reserve(this->num_local_coords);
6212 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6213 localGidToLid[this->initial_mj_gnos[i]] = i;
6215 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6216 0, this->num_local_coords,
true);
6218 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6219 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6220 partId[origLID] = result_assigned_part_ids[i];
6224 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6225 localGidToLid(this->num_local_coords);
6226 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6227 localGidToLid.put(this->initial_mj_gnos[i], i);
6229 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6230 0, this->num_local_coords,
true);
6232 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6233 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6234 partId[origLID] = result_assigned_part_ids[i];
6237 #endif // C++11 is enabled 6239 delete [] result_mj_gnos;
6240 delete [] result_assigned_part_ids;
6242 solution->setParts(partId);
6243 this->free_work_memory();
6248 template <
typename Adapter>
6250 freeArray<mj_scalar_t *>(this->mj_coordinates);
6251 freeArray<mj_scalar_t *>(this->mj_weights);
6252 freeArray<bool>(this->mj_uniform_parts);
6253 freeArray<mj_scalar_t *>(this->mj_part_sizes);
6254 freeArray<bool>(this->mj_uniform_weights);
6260 template <
typename Adapter>
6265 this->coord_dim = this->mj_coords->getCoordinateDim();
6266 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6267 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6268 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6269 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6274 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6277 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6278 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6281 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6284 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6286 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6289 ArrayView<const mj_gno_t> gnos;
6290 ArrayView<input_t> xyz;
6291 ArrayView<input_t> wgts;
6294 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6296 this->mj_coords->getCoordinates(gnos, xyz, wgts);
6298 ArrayView<const mj_gno_t> mj_gnos = gnos;
6299 this->initial_mj_gnos = mj_gnos.getRawPtr();
6302 for (
int dim=0; dim < this->coord_dim; dim++){
6303 ArrayRCP<const mj_scalar_t> ar;
6305 this->coordinate_ArrayRCP_holder[dim] = ar;
6308 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6312 if (this->num_weights_per_coord == 0){
6313 this->mj_uniform_weights[0] =
true;
6314 this->mj_weights[0] = NULL;
6318 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6319 ArrayRCP<const mj_scalar_t> ar;
6320 wgts[wdim].getInputArray(ar);
6321 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6322 this->mj_uniform_weights[wdim] =
false;
6323 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6327 for (
int wdim = 0; wdim < criteria_dim; wdim++){
6328 if (solution->criteriaHasUniformPartSizes(wdim)){
6329 this->mj_uniform_parts[wdim] =
true;
6330 this->mj_part_sizes[wdim] = NULL;
6333 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
6342 template <
typename Adapter>
6345 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
6348 tol = pe->getValue(&tol);
6349 this->imbalance_tolerance = tol - 1.0;
6353 if (this->imbalance_tolerance <= 0)
6354 this->imbalance_tolerance= 10e-4;
6357 this->part_no_array = NULL;
6359 this->recursion_depth = 0;
6361 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
6362 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
6363 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
6364 this->mj_env->debug(2,
"mj_parts provided by user");
6368 this->distribute_points_on_cut_lines =
true;
6369 this->max_concurrent_part_calculation = 1;
6371 this->mj_run_as_rcb = 0;
6372 int mj_user_recursion_depth = -1;
6373 this->mj_keep_part_boxes = 0;
6374 this->check_migrate_avoid_migration_option = 0;
6375 this->minimum_migration_imbalance = 0.35;
6377 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
6380 imb = pe->getValue(&imb);
6381 this->minimum_migration_imbalance = imb - 1.0;
6384 pe = pl.getEntryPtr(
"mj_migration_option");
6386 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6388 this->check_migrate_avoid_migration_option = 0;
6390 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6393 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
6395 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6397 this->max_concurrent_part_calculation = 1;
6400 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
6402 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6404 this->mj_keep_part_boxes = 0;
6415 if (this->mj_keep_part_boxes == 0){
6416 pe = pl.getEntryPtr(
"mapping_type");
6418 int mapping_type = -1;
6419 mapping_type = pe->getValue(&mapping_type);
6420 if (mapping_type == 0){
6421 mj_keep_part_boxes = 1;
6427 pe = pl.getEntryPtr(
"mj_enable_rcb");
6429 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6431 this->mj_run_as_rcb = 0;
6434 pe = pl.getEntryPtr(
"mj_recursion_depth");
6436 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6438 mj_user_recursion_depth = -1;
6442 pe = pl.getEntryPtr(
"rectilinear");
6443 if (pe) val = pe->getValue(&val);
6445 this->distribute_points_on_cut_lines =
false;
6447 this->distribute_points_on_cut_lines =
true;
6450 if (this->mj_run_as_rcb){
6451 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6453 if (this->recursion_depth < 1){
6454 if (mj_user_recursion_depth > 0){
6455 this->recursion_depth = mj_user_recursion_depth;
6458 this->recursion_depth = this->coord_dim;
6462 this->num_threads = 1;
6463 #ifdef HAVE_ZOLTAN2_OMP 6464 #pragma omp parallel 6466 this->num_threads = omp_get_num_threads();
6473 template <
typename Adapter>
6476 typename Adapter::scalar_t *lower,
6477 typename Adapter::scalar_t *upper,
6478 size_t &nPartsFound,
6479 typename Adapter::part_t **partsFound)
const 6488 if (this->mj_keep_part_boxes) {
6491 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6493 size_t nBoxes = (*partBoxes).size();
6495 throw std::logic_error(
"no part boxes exist");
6499 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6501 if (globalBox->boxesOverlap(dim, lower, upper)) {
6503 std::vector<typename Adapter::part_t> partlist;
6506 for (
size_t i = 0; i < nBoxes; i++) {
6508 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
6510 partlist.push_back((*partBoxes)[i].getpId());
6531 *partsFound =
new mj_part_t[nPartsFound];
6532 for (
size_t i = 0; i < nPartsFound; i++)
6533 (*partsFound)[i] = partlist[i];
6546 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
6551 template <
typename Adapter>
6554 typename Adapter::scalar_t *point)
const 6561 if (this->mj_keep_part_boxes) {
6562 typename Adapter::part_t foundPart = -1;
6565 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6567 size_t nBoxes = (*partBoxes).size();
6569 throw std::logic_error(
"no part boxes exist");
6573 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
6575 if (globalBox->pointInBox(dim, point)) {
6579 for (i = 0; i < nBoxes; i++) {
6581 if ((*partBoxes)[i].pointInBox(dim, point)) {
6582 foundPart = (*partBoxes)[i].getpId();
6596 std::ostringstream oss;
6598 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
6599 oss <<
") not found in domain";
6600 throw std::logic_error(oss.str());
6609 size_t closestBox = 0;
6610 mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
6611 mj_scalar_t *centroid =
new mj_scalar_t[dim];
6612 for (
size_t i = 0; i < nBoxes; i++) {
6613 (*partBoxes)[i].computeCentroid(centroid);
6614 mj_scalar_t sum = 0.;
6616 for (
int j = 0; j < dim; j++) {
6617 diff = centroid[j] - point[j];
6620 if (sum < minDistance) {
6625 foundPart = (*partBoxes)[closestBox].getpId();
6632 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
6636 template <
typename Adapter>
6642 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
6643 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6644 mj_part_t ntasks = (*pBoxes).size();
6645 int dim = (*pBoxes)[0].getDim();
6654 template <
typename Adapter>
6655 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
6658 return this->mj_partitioner.get_kept_boxes();
6662 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6664 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6667 if (this->mj_keep_part_boxes)
6668 return this->kept_boxes;
6670 throw std::logic_error(
"Error: part boxes are not stored.");
6673 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6675 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
6677 RCP<mj_partBoxVector_t> &localPartBoxes
6680 mj_part_t ntasks = this->num_global_parts;
6681 int dim = (*localPartBoxes)[0].getDim();
6682 mj_scalar_t *localPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6684 memset(localPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6686 mj_scalar_t *globalPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
6687 memset(globalPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
6689 mj_scalar_t *localPartMins = localPartBoundaries;
6690 mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
6692 mj_scalar_t *globalPartMins = globalPartBoundaries;
6693 mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
6695 mj_part_t boxCount = localPartBoxes->size();
6696 for (mj_part_t i = 0; i < boxCount; ++i){
6697 mj_part_t pId = (*localPartBoxes)[i].getpId();
6700 mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
6701 mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
6703 for (
int j = 0; j < dim; ++j){
6704 localPartMins[dim * pId + j] = lmins[j];
6705 localPartMaxs[dim * pId + j] = lmaxs[j];
6717 reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
6718 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
6719 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
6720 for (mj_part_t i = 0; i < ntasks; ++i){
6722 globalPartMins + dim * i,
6723 globalPartMaxs + dim * i);
6735 delete []localPartBoundaries;
6736 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_)
Multi Jagged coordinate partitioning algorithm.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Defines Parameter related enumerators, declares functions.
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
#define imbalanceOf2(Wachieved, wExpected)
RCP< mj_partBoxVector_t > get_kept_boxes() const
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
T * allocMemory(size_t size)
Allocates memory for the given size.
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
void getInputArray(ArrayRCP< const T > &array) const
Create a contiguous array of the required type, perhaps for a TPL.
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
The StridedData class manages lists of weights or coordinates.
Algorithm defines the base class for all algorithms.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
uMultiSortItem(IT index_, CT count_, WT *vals_)
std::string toString(tt obj)
Converts the given object to string.
Defines the CoordinateModel classes.
void multi_jagged_part(const RCP< const Environment > &env, RCP< Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Multi Jagged coordinate partitioning algorithm.