#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

#include "Teuchos_ArrayViewDecl.hpp"
#include "Teuchos_ReductionOp.hpp"
#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>
namespace Teuchos {

/*! \brief Zoltan2_ReduceBestMapping Class: reduces the minimum cost
 *  mapping, breaking ties with the minimum proc id.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping :
    public Teuchos::ValueTypeReductionOp<Ordinal, T> {

private:
  T epsilon;

public:
  /*! \brief Default Constructor
   */
  Zoltan2_ReduceBestMapping():
    epsilon(std::numeric_limits<T>::epsilon()) {
  }

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface
   */
  void reduce(const Ordinal count,
              const T inBuffer[],
              T inoutBuffer[]) const {

    for (Ordinal i = 0; i < count; i++) {
      // take the incoming {cost, rank} pair if its cost is smaller,
      // or if the costs tie within epsilon and its rank is not larger
      if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
      else if (inBuffer[0] - inoutBuffer[0] < epsilon &&
               inBuffer[1] - inoutBuffer[1] < epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};

} // namespace Teuchos
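/*
  Usage sketch (hypothetical values; mirrors the reduceAll call made in
  getBestMapping further below): every rank contributes a {cost, rank}
  pair and the reduction keeps the pair with the smallest cost, breaking
  near-ties (within epsilon) in favor of the smaller rank value.

    Zoltan2_ReduceBestMapping<int, double> reduceBest;
    double localCost[2], globalCost[2];
    localCost[0] = myCost;                  // this rank's mapping cost
    localCost[1] = double(comm->getRank()); // this rank's id
    Teuchos::reduceAll<int, double>(*comm, reduceBest,
                                    2, localCost, globalCost);
    int sender = int(globalCost[1]);        // rank holding the best cost
*/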
namespace Zoltan2 {

// computes x! recursively; assumes x >= 1
template <typename it>
it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}
template <typename gno_t, typename part_t>
// returns the i-th permutation of {0, .., n-1} in lexicographic order
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm) {

  IT j, k = 0;
  IT *fact = new IT[n];

  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute the factorial code of i
  for (k = 0; k < n; ++k) {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // re-adjust the values to obtain the permutation
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}
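/*
  Worked example (illustrative): for n = 3 the permutations of {0,1,2}
  are indexed 0..5 in lexicographic order.

    int perm[3];
    ithPermutation<int>(3, 4, perm);
    // the factorial code of 4 is {2, 0, 0} (4 = 2*2! + 0*1! + 0*0!);
    // the adjustment pass turns it into the permutation {2, 0, 1}.
*/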
// builds the CRS communication graph of a Cartesian task grid:
// each task is connected to its left/right neighbor in every dimension
template <typename part_t>
void getGridCommunicationGraph(
    part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims) {

  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = new part_t[taskCount + 1];
  task_comm_adj = new part_t[taskCount * neighborCount];

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      // a neighbor is valid only if it stays within the same slab
      // along dimension j
      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
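/*
  Usage sketch (hypothetical sizes): CRS communication graph of a
  2 x 3 task grid, where task i sits at (i % 2, i / 2).

    part_t *xadj = NULL, *adj = NULL;
    std::vector<int> dims = {2, 3};
    getGridCommunicationGraph<part_t>(6, xadj, adj, dims);
    // neighbors of task i are adj[xadj[i] .. xadj[i+1]-1];
    // e.g. task 0 gets {1, 2} and task 3 gets {2, 1, 5}.
    delete [] xadj;
    delete [] adj;
*/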
// computes the global centroid of every part of the solution
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;
  typedef StridedData<lno_t, scalar_t> input_t;

  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);
  lno_t numLocalCoords = coords->getLocalNumCoordinates();

  // local and global number of coordinates falling into each part
  gno_t *point_counts = new gno_t[ntasks];
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  gno_t *global_point_counts = new gno_t[ntasks];

  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];

  ArrayRCP<ArrayRCP<const scalar_t>> ar = arcp(
    new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
  for (int dim = 0; dim < coordDim; dim++) {
    xyz[dim].getInputArray(ar[dim]);
    // coordinate values along this dimension
    multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  // add up all local coordinates in each part
  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  // get the global number of points in each part
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);

  // divide the local sums by the global counts; the following sum
  // over ranks then yields the global average
  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = new scalar_t[ntasks];
  for (int j = 0; j < coordDim; ++j) {
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);

    // swap the reduced buffer into place and reuse the old one
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  delete [] point_counts;
  delete [] global_point_counts;
  delete [] tmpCoords;
  delete [] multiJagged_coordinates;
}
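/*
  Math sketch: after the two reductions, partCenters[j][p] holds the
  global centroid component

      center_j(p) = (1 / |P_p|) * sum over coordinates i in part p
                    of coord_j(i)

  where |P_p| is the global number of coordinates assigned to part p.
*/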
// coarsens the vertex graph into a part graph: one vertex per part,
// edge weights summed over all vertex edges crossing between two parts
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;
  typedef StridedData<t_lno_t, t_scalar_t> t_input_t;

  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  // get the vertex global ids and weights
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  // get the edge ids, offsets and weights
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);

  std::vector<t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  if (numWeightPerEdge > 0) {
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      // multiple edge weights are not handled; use the first one
      edge_weights[i] = e_wgts[0][i];
    }
  }
  // the part of the second endpoint of each edge
  std::vector<part_t> e_parts(localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    // distributed: look up the part of each edge endpoint through a
    // distributed directory
    const bool bUseLocalIDs = false;  // Local IDs not needed
    const int debug_level = 0;
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
      NULL, directory_t::Update_Mode::Replace);
    directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
      NULL, NULL, false);
  }
  else
#endif
  {
    // single rank: every edge endpoint is local, read the part directly
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      e_parts[i] = parts[ei];
    }
  }

  if (comm->getSize() == 1) {
    // cluster the vertices of each part into per-part linked lists:
    // part_begins[p] is the first vertex of part p and
    // part_nexts[v] the next vertex in the same part
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    part_t nindex = 0;
    g_part_xadj[0] = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // coarsen the vertex graph into the part graph, one part at a time
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      // walk all vertices assigned to part i
      while (v != -1) {
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          part_t ep = e_parts[j];  // part of the neighboring vertex
          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }
          // record the part on first encounter, then accumulate weight
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      // flush part i's neighbor list into the CRS arrays
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
  }
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    // distributed: build the local part graphs and aggregate them
    // through the directory

    struct part_info {
      part_info() : weight(0) {
      }
      // operator += is used by the directory to aggregate entries
      // that target the same destination part
      const part_info & operator+=(const part_info & src) {
        this->weight += src.weight;
        return *this;
      }
      bool operator>(const part_info & src) {
        return (destination_part > src.destination_part);
      }
      bool operator==(const part_info & src) {
        return (destination_part == src.destination_part);
      }
      part_t destination_part;
      t_scalar_t weight;
    };

    bool bUseLocalIDs = false;
    const int debug_level = 0;
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    std::vector<part_t> part_data;
    std::vector<std::vector<part_info>> user_data;

    // cluster vertices by part, as in the serial case
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // build the local neighbor list of each part
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      while (v != -1) {
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          part_t ep = e_parts[j];
          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      // extract the ordered weights and reset the accumulator
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        part_neighbor_weights_ordered[j] =
          part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }

      if (num_neighbor_parts > 0) {
        part_data.push_back(i);  // part i has at least one neighbor
        std::vector<part_info> new_user_data(num_neighbor_parts);
        for (int q = 0; q < num_neighbor_parts; ++q) {
          part_info & info = new_user_data[q];
          info.weight = part_neighbor_weights_ordered[q];
          info.destination_part = part_neighbors[q];
        }
        user_data.push_back(new_user_data);
      }
    }

    std::vector<part_t> part_indices(np);

    for (part_t i = 0; i < np; ++i) part_indices[i] = i;

    // the directory sums the contributions of all ranks per part pair
    directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
      NULL, directory_t::Update_Mode::AggregateAdd);

    // retrieve the aggregated adjacency of every part
    std::vector<std::vector<part_info>> find_user_data(part_indices.size());
    directory.find(find_user_data.size(), &part_indices[0], NULL,
      &find_user_data[0], NULL, NULL, false);
    // assemble the global part graph in CRS form
    int get_total_length = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      get_total_length += find_user_data[n].size();
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(get_total_length);
    g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

    int track_insert_index = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      g_part_xadj[n] = track_insert_index;
      const std::vector<part_info> & user_data_vector = find_user_data[n];
      for (size_t q = 0; q < user_data_vector.size(); ++q) {
        const part_info & info = user_data_vector[q];
        g_part_adj[track_insert_index] = info.destination_part;
        g_part_ew[track_insert_index] = info.weight;
        ++track_insert_index;
      }
    }
    g_part_xadj[np] = get_total_length;
  }
#endif // HAVE_ZOLTAN2_MPI
}
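/*
  Output sketch (illustrative): for np = 3 parts where part 0 talks to
  parts 1 and 2, and parts 1 and 2 only talk back to part 0,

      g_part_xadj = {0, 2, 3, 4}
      g_part_adj  = {1, 2, 0, 0}

  and g_part_ew[k] is the summed weight of all vertex-level edges
  crossing between the owning part and g_part_adj[k].
*/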
/*! \brief KmeansHeap Class: a max-heap that holds the minimum values,
 *  i.e. the k closest points seen so far.
 */
template <class IT, class WT>
class KmeansHeap {

  IT heapSize;
  IT *indices;
  WT *values;
  WT epsilon;

public:
  ~KmeansHeap() {
    delete [] this->indices;
    delete [] this->values;
  }

  void setHeapsize(IT heapsize_) {
    this->heapSize = heapsize_;
    this->indices = new IT[heapsize_];
    this->values = new WT[heapsize_];
    this->epsilon = std::numeric_limits<WT>::epsilon();
  }

  // add the point only if its distance is smaller than the current
  // maximum, which always sits at the root
  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    if (distance >= maxVal)
      return;
    else {
      this->values[0] = distance;
      this->indices[0] = index;
      this->push_down(0);
    }
  }

  // heap push-down operation
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {
      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();
    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  // returns the total distance of the heap entries to the center
  WT getTotalDistance() {
    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];
    }
    return nc;
  }

  // recomputes each center component as the mean of the closest points;
  // returns whether any component moved by more than epsilon
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;
    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;
      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
        nc += coords[i][k];
      }
      nc /= this->heapSize;
      moved = (std::abs(center[i] - nc) > this->epsilon || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};
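/*
  Usage sketch (hypothetical values): keep the 3 closest points to a
  center while streaming index/distance pairs.

    KmeansHeap<int, double> heap;
    heap.setHeapsize(3);
    heap.initValues();        // all slots start at +max, index -1
    heap.addPoint(7, 2.5);
    heap.addPoint(1, 0.5);
    heap.addPoint(4, 9.0);    // fills the last free slot
    heap.addPoint(2, 1.0);    // evicts the 9.0 entry at the root
    // the heap now holds indices {7, 2, 1} (distances 2.5, 1.0, 0.5)
*/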
template <class IT, class WT>
class KMeansCluster {

  int dimension;
  KmeansHeap<IT, WT> closestPoints;

public:
  WT *center;

  ~KMeansCluster() {
    delete [] center;
  }

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = new WT[dimension_];
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap() {
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  // returns the distance of the coordinate to the center
  // and adds the point to the heap
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;
    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);
    return distance;
  }

  WT getDistanceToCenter() {
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};
/*! \brief KMeansAlgorithm Class that performs clustering of the
 *  coordinates and returns the closest set of coordinates.
 */
template <class IT, class WT>
class KMeansAlgorithm {

  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;

  KMeansCluster<IT, WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;

public:
  ~KMeansAlgorithm() {
    delete [] clusters;
    delete [] maxCoordinates;
    delete [] minCoordinates;
  }

  /*! \brief KMeansAlgorithm Constructor
   */
  KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_,
                  IT required_elements_):
      dim(dim_),
      numElements(numElements_),
      elementCoords(elementCoords_),
      numClusters((1 << dim_) + 1),
      required_elements(required_elements_) {

    this->clusters = new KMeansCluster<IT, WT>[this->numClusters];
    // set the dimension and the number of required elements
    // for all clusters
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = new WT[this->dim];
    this->minCoordinates = new WT[this->dim];

    // obtain the min and max coordinates for each dimension
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];

      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]) {
          this->maxCoordinates[j] = t;
        }
        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }

    // place the first 2^dim centers at the corners of the bounding box:
    // the bit pattern of the cluster index picks min or max per dimension
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j + 1));
      for (int i = 0; i < numClusters - 1; ++i) {
        WT c;
        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
        }
        else {
          c = this->minCoordinates[j];
        }
        this->clusters[i].center[j] = c;
      }
    }

    // the last cluster starts at the center of the bounding box
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }
  }

  // performs kmeans clustering of the coordinates
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }
      // feed every element to every cluster heap
      for (IT i = 0; i < this->numElements; ++i) {
        for (IT j = 0; j < this->numClusters; ++j) {
          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }
      bool moved = false;
      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  // finds the cluster with the minimum total distance to its center
  // and copies out the indices of its closest coordinates
  void getMinDistanceCluster(IT *procPermutation) {

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();
      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }
    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
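/*
  Usage sketch (hypothetical sizes): pick the "densest" subset of 4
  processors out of 10, given 3-d machine coordinates, the way
  getClosestSubset below uses this class.

    KMeansAlgorithm<int, double> kma(3, 10, procCoords, 4);
    kma.kmeans();                        // at most 10 Lloyd iterations
    int closest[4];
    kma.getMinDistanceCluster(closest);  // indices of the chosen procs
*/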
#define MINOF(a,b) (((a)<(b))?(a):(b))
/*! \brief fillContinousArray function: fills arr with 0..arrSize-1
 *  when val is NULL, otherwise with *val.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }
  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = v;
    }
  }
}
/*! \brief CommunicationModel: base class that performs mapping between
 *  the coordinate partitioning result and the processors.
 */
template <typename part_t, typename pcoord_t, typename node_t>
class CommunicationModel {
protected:
  double commCost;

public:
  part_t no_procs;  // the number of processors
  part_t no_tasks;  // the number of tasks

  CommunicationModel(part_t no_procs_, part_t no_tasks_):
    commCost(), no_procs(no_procs_), no_tasks(no_tasks_) {}
  virtual ~CommunicationModel() {}

  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight) {

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task) {
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {

        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL) {
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }
    this->commCost = totalCost;
  }

  double getCommunicationCostMetric() {
    return this->commCost;
  }
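  /*
    Cost model sketch: with hop distance d(p, q) between two ranks and
    edge weight w for a task pair, the accumulated metric is

        totalCost = sum over tasks t and neighbors t' of
                    d(proc(t), proc(t')) * w(t, t')

    with w = 1 when no task_communication_edge_weight is given.
  */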
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const = 0;

  virtual double getProcDistance(int procId1, int procId2) const = 0;
};
/*! \brief CoordinateCommunicationModel: performs mapping between the
 *  coordinate partitioning result and the processors.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t,
          typename node_t>
class CoordinateCommunicationModel :
    public CommunicationModel<part_t, pcoord_t, node_t> {
public:
  int proc_coord_dim;
  pcoord_t **proc_coords;
  int task_coord_dim;
  tcoord_t **task_coords;
  int *machine_extent;
  bool *machine_extent_wrap_around;
  const MachineRepresentation<pcoord_t, part_t> *machine;
  int num_ranks_per_node;
  bool divide_to_prime_first;
  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_partNoArray;

  //! Class Constructor
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t, part_t> *machine_ = NULL):
    CommunicationModel<part_t, pcoord_t, node_t>(no_procs_, no_tasks_),
    proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
    task_coord_dim(tcoord_dim_), task_coords(tcoords_),
    machine_extent(machine_extent_),
    machine_extent_wrap_around(machine_extent_wrap_around_),
    machine(machine_),
    num_ranks_per_node(1),
    divide_to_prime_first(false) {
  }
  /*! \brief Function is called whenever nprocs > no_tasks. Returns only
   *  the subset of processors that are closest to each other.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                        part_t ntasks) const {

    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
      minCoordDim, nprocs,
      this->proc_coords, ntasks);

    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    // unused slots are marked invalid
    for (int i = ntasks; i < nprocs; ++i) {
      proc_permutation[i] = -1;
    }
  }
  // fast pseudo random number generator, used for the visit order
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed) {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;
    d = (double) ((double) _u_umpa_seed / (double) m);
    return (part_t) (d * (double)l);
  }
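  /*
    Note: this is the Park-Miller "minimal standard" generator,
    seed' = 16807 * seed mod (2^31 - 1), computed with Schrage's
    factorization (q = m / a, r = m % a) so the product never
    overflows a 32-bit int; the final division maps the draw to
    the range 0 .. l-1.

      int seed = 12345;
      part_t v = umpa_uRandom(100, seed);  // uniform value in [0, 100)
  */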
  // distance between two processors: the machine's hop count if a
  // machine model exists, otherwise the Manhattan distance of the
  // stored coordinates (wrap-around aware)
  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    if (machine == NULL) {
      for (int i = 0; i < this->proc_coord_dim; ++i) {
        double d =
          std::abs(proc_coords[i][procId1] - proc_coords[i][procId2]);
        if (machine_extent_wrap_around && machine_extent_wrap_around[i]) {
          if (machine_extent[i] - d < d) {
            d = machine_extent[i] - d;
          }
        }
        distance += d;
      }
    }
    else {
      this->machine->getHopCount(procId1, procId2, distance);
    }
    return distance;
  }
  // randomizes the visit order of the tasks
  void update_visit_order(part_t *visitOrder, part_t n,
                          int &_u_umpa_seed, part_t rndm) {
    part_t *a = visitOrder;

    if (rndm) {
      part_t i, u, v, tmp;

      if (n <= 4)
        return;

      for (i = 0; i < n; i += 16) {
        u = umpa_uRandom(n - 4, _u_umpa_seed);
        v = umpa_uRandom(n - 4, _u_umpa_seed);

        // swap four consecutive entries at two random positions
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
      }
    }
    else {
      part_t i, end = n / 4;

      for (i = 1; i < end; i++) {
        part_t j = umpa_uRandom(n - i, _u_umpa_seed);
        part_t t = a[j];
        a[j] = a[n - i];
        a[n - i] = t;
      }
    }
  }
  /*! \brief Partitions the processors and the tasks with the same
   *  scheme and matches the resulting groups one to one.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj,
      ArrayRCP<part_t> &rcp_proc_to_task_adj,
      ArrayRCP<part_t> &rcp_task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const {

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    // CRS offsets: how many tasks are assigned to each processor
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
    // CRS adjacency: the ids of the assigned tasks
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
    // and the reverse map: the processor assigned to each task
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj,
                               this->no_procs + 1, &invalid);
    // number of recursive partitioning levels,
    // roughly log2 of the processor count
    int recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;

    // number of permutations of the task/proc coordinate dimensions;
    // capped at 8! to keep the count bounded
    int taskPerm = z2Fact<int>(8);
    int procPerm = z2Fact<int>(8);

    int permutations = taskPerm * procPerm;
    // plus the trials that partition one side along its longest dimension
    permutations += taskPerm;
    permutations += procPerm;
    // plus one where both sides use longest-dimension partitioning
    permutations += 1;
    part_t *proc_adjList = new part_t[this->no_procs];
    part_t used_num_procs = this->no_procs;
    if (this->no_procs > this->no_tasks) {
      // more processors than tasks: map only to the subset of
      // processors that are closest to each other
      this->getClosestSubset(proc_adjList, this->no_procs,
                             this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }

    // every rank tries a different dimension-permutation combination
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myTaskPerm = 0;
    int myProcPerm = 0;
    if (myPermutation == 0) {
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm) {
        proc_partition_along_longest_dim = true;
        // index of the task permutation
        myTaskPerm = myPermutation;
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm) {
          task_partition_along_longest_dim = true;
          // index of the proc permutation
          myProcPerm = myPermutation;
        }
        else {
          myPermutation -= procPerm;
          // indices of the proc and the task permutation
          myProcPerm = myPermutation % procPerm;
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }

    // decode the processor dimension ordering from the permutation index
    int *permutation =
      new int[(this->proc_coord_dim > this->task_coord_dim)
                ? this->proc_coord_dim
                : this->task_coord_dim];
    if (this->proc_coord_dim <= 8)
      ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    else
      ithPermutation<int>(8, myProcPerm, permutation);
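    /*
      Worked example (illustrative): with 3-d task and processor
      coordinates, taskPerm = procPerm = 3! = 6 and
      permutations = 6*6 + 6 + 6 + 1 = 49.  A rank with
      myPermutation = 20 is not 0 and, after the decrement and the two
      subtractions, 20 -> 19 -> 13 -> 7, so it uses
      myProcPerm = 7 % 6 = 1 and myTaskPerm = 7 / 6 = 1.
    */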
    // each part pairs one group of processors with one group of tasks
    part_t num_parts = MINOF(used_num_procs, this->no_tasks);
    part_t *proc_xadj = new part_t[num_parts + 1];

    // if the machine reports rack/group structure, the first
    // partitioning level follows it
    part_t num_group_count = 1;
    part_t *group_count = NULL;

    if (machine)
      num_group_count = machine->getNumUniqueGroups();

    if (num_group_count > 1) {
      group_count = new part_t[num_group_count];
      memset(group_count, 0, sizeof(part_t) * num_group_count);

      machine->getGroupCount(group_count);
    }

    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
    typedef typename node_t::device_type device_t;

    int procdim = this->proc_coord_dim;
    // reorder the coordinate dimensions according to the chosen
    // permutation
    pcoord_t **pcoords = new pcoord_t*[this->proc_coord_dim];
    for (int i = 0; i < this->proc_coord_dim; ++i) {
      pcoords[i] = this->proc_coords[permutation[i]];
    }

    // move the processor coordinates into a Kokkos view
    Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_pcoords("pcoords", this->no_procs, procdim);
    auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
    for (int i = 0; i < procdim; ++i) {
      for (int j = 0; j < this->no_procs; ++j) {
        host_kokkos_pcoords(j, i) = pcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_pcoords(
        "initial_selected_coords_output_permutation_pcoords",
        this->no_procs);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_pcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      host_initial_selected_coords_output_permutation_pcoords(n) =
        proc_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
      host_initial_selected_coords_output_permutation_pcoords);

    Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
      "kokkos_group_count", group_count ? num_group_count : 0);
    if (group_count) {
      for (int n = 0; n < num_group_count; ++n) {
        kokkos_group_count(n) = group_count[n];
      }
    }

    // partition the processors into num_parts groups with the
    // sequential multi-jagged partitioner
    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_procs,
      used_num_procs,
      num_parts,
      procdim,
      kokkos_pcoords,
      initial_selected_coords_output_permutation_pcoords,
      proc_xadj,
      recursion_depth,
      kokkos_partNoArray,
      proc_partition_along_longest_dim,
      num_ranks_per_node,
      divide_to_prime_first,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // bring the processor permutation back to the host
    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
      initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      proc_adjList[n] =
        host_initial_selected_coords_output_permutation_pcoords(n);
    }

    // now partition the tasks with the same scheme
    part_t *task_xadj = new part_t[num_parts + 1];
    part_t *task_adjList = new part_t[this->no_tasks];

    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    // decode the task dimension ordering from the permutation index
    if (this->task_coord_dim <= 8)
      ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
    else
      ithPermutation<int>(8, myTaskPerm, permutation);
    // reorder the task coordinate dimensions as well
    tcoord_t **tcoords = new tcoord_t*[this->task_coord_dim];
    for (int i = 0; i < this->task_coord_dim; ++i) {
      tcoords[i] = this->task_coords[permutation[i]];
    }

    // move the task coordinates into a Kokkos view
    Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
    auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);

    for (int i = 0; i < this->task_coord_dim; ++i) {
      for (int j = 0; j < this->no_tasks; ++j) {
        host_kokkos_tcoords(j, i) = tcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_tcoords(
        "initial_selected_coords_output_permutation_tcoords",
        this->no_tasks);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_tcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      host_initial_selected_coords_output_permutation_tcoords(n) =
        task_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
      host_initial_selected_coords_output_permutation_tcoords);

    // partition the tasks into num_parts groups
    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_tasks,
      this->no_tasks,
      num_parts,
      this->task_coord_dim,
      kokkos_tcoords,
      initial_selected_coords_output_permutation_tcoords,
      task_xadj,
      recursion_depth,
      kokkos_partNoArray,
      task_partition_along_longest_dim,
      num_ranks_per_node,
      divide_to_prime_first,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
    // bring the task permutation back to the host
    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
      initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      task_adjList[n] =
        host_initial_selected_coords_output_permutation_tcoords(n);
    }

    delete [] permutation;
    // check the partitioning: every part must own exactly one processor
    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i + 1];
      part_t task_end_index = task_xadj[i + 1];

      if (proc_index_end - proc_index_begin != 1) {
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to "
                  << proc_index_end - proc_index_begin << " processors."
                  << std::endl;
        std::terminate();
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    // in-place prefix sum: turn the per-processor task counts into
    // CRS offsets; the work array keeps the end offsets
    part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i) {
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i + 1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j) {
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }

    delete [] proc_to_task_xadj_work;
    delete [] task_xadj;
    delete [] task_adjList;
    delete [] proc_xadj;
    delete [] proc_adjList;
  }
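  /*
    Worked example (illustrative): if three processors own {2, 0, 1}
    tasks, the in-place scan turns proc_to_task_xadj into {0, 2, 2, 3}
    and leaves proc_to_task_xadj_work = {2, 2, 3}; the second pass then
    fills proc_to_task_adj back to front through --work[proc], so the
    tasks of processor p end up in
    proc_to_task_adj[proc_to_task_xadj[p] .. proc_to_task_xadj[p+1]-1].
  */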
};

template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;

#if defined(KOKKOS_ENABLE_CUDA)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::Cuda, Kokkos::CudaSpace>;
#elif defined(KOKKOS_ENABLE_HIP)
  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
    Kokkos::HIP, Kokkos::HIPSpace>;
#else
  using node_t = typename Adapter::node_t;
#endif

#endif
public:

  /*! \brief doMapping function: calls the getMapping function of the
   *  communicationModel object.
   */
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {

    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
        myRank,
        this->env,
        this->proc_to_task_xadj,
        this->proc_to_task_adj,
        this->task_to_proc,
        comm_);
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      exit(1);
    }
  }
  /*! \brief Creates and returns the subcommunicator for the
   *  processor group.
   */
  RCP<Comm<int> > create_subCommunicator() {
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    // number of permutation attempts, as in getMapping;
    // the dimension is capped at 8 to bound the factorial
    int taskPerm = 1;
    int procPerm = 1;
    if (taskDim <= 8)
      taskPerm = z2Fact<int>(taskDim);
    else
      taskPerm = z2Fact<int>(8);
    if (procDim <= 8)
      procPerm = z2Fact<int>(procDim);
    else
      procPerm = z2Fact<int>(8);

    int idealGroupSize = taskPerm * procPerm;
    // also account for the longest-dimension-only attempts
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();
    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    if (myGroupEnd > commSize) {
      // the undersized tail group is merged into the previous one
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = new part_t[myGroupSize];
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    delete [] myGroup;
    return subComm;
  }
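  /*
    Worked example (illustrative): with idealGroupSize = 49 and
    commSize = 120, ranks 0-48 keep group [0, 49), while for every rank
    from 49 on the two clamps above kick in and ranks 49-119 all agree
    on the merged group [49, 120): the undersized tail group [98, 120)
    is absorbed into its predecessor.
  */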
  /*! \brief Finds the lowest cost mapping and broadcasts the solution
   *  to everyone.
   */
  void getBestMapping() {
    // create the sub group
    RCP<Comm<int> > subComm = this->create_subCommunicator();

    // calculate cost
    double myCost = this->proc_task_comm->getCommunicationCostMetric();

    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);

    // broadcast the best mapping from the winning rank to the group
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }
  // write the mapping to gnuPlot code to visualize
  void writeMapping() {
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim, 3);
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i) {

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0) {
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          inpFile << proc_task_comm->proc_coords[j][i];
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
        }
        else {
          inpFile << proc_task_comm->proc_coords[j][i] << " ";
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->
              proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      for (int k = 0; k < a.size(); ++k) {
        part_t j = a[k];
        // draw an arrow from the processor to each of its tasks
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z) {
          if (z == mindim - 1) {
            inpFile << proc_task_comm->task_coords[z][j];
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else {
            inpFile << proc_task_comm->task_coords[z][j] << " ";
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->
                task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  // write the mapping to gnuPlot code to visualize
  void writeMapping2(int myRank) {
    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr + extentionS;
    std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
      *tmpproc_task_comm =
        static_cast<CoordinateCommunicationModel<
          pcoord_t, tcoord_t, part_t, node_t> *>(this->proc_task_comm);

    int mindim = tmpproc_task_comm->proc_coord_dim;
    if (mindim != 3) {
      std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
      return;
    }
    std::string ss = "";
    std::string procs = "";

    // draw each inter-node arrow only once
    std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
    for (part_t origin_rank = 0; origin_rank < this->nprocs;
         ++origin_rank) {
      ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
      if (a.size() == 0) {
        continue;
      }

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]);
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]);
        }
        else {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]) + ",";
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
              proc_coords[j][origin_rank]) + " ";
        }
      }
      procs += "\n";
      gnuPlotArrow += " to ";

      for (int k = 0; k < a.size(); ++k) {
        int origin_task = a[k];

        for (int nind = task_communication_xadj[origin_task];
             nind < task_communication_xadj[origin_task + 1]; ++nind) {
          int neighbor_task = task_communication_adj[nind];

          bool differentnode = false;
          int neighbor_rank = this->getAssignedProcForTask(neighbor_task);

          for (int j = 0; j < mindim; ++j) {
            if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
                int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
              differentnode = true;
              break;
            }
          }
          std::tuple<int, int, int, int, int, int> foo(
            int(tmpproc_task_comm->proc_coords[0][origin_rank]),
            int(tmpproc_task_comm->proc_coords[1][origin_rank]),
            int(tmpproc_task_comm->proc_coords[2][origin_rank]),
            int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));

          if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
            my_arrows.insert(foo);

            std::string gnuPlotArrow2 = "";
            for (int j = 0; j < mindim; ++j) {
              if (j == mindim - 1) {
                gnuPlotArrow2 +=
                  Teuchos::toString<float>(tmpproc_task_comm->
                    proc_coords[j][neighbor_rank]);
              }
              else {
                gnuPlotArrow2 +=
                  Teuchos::toString<float>(tmpproc_task_comm->
                    proc_coords[j][neighbor_rank]) + ",";
              }
            }
            ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
          }
        }
      }
    }

    std::ofstream procFile("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    procFile.close();

    if (mindim == 2) {
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    }
    else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }

    gnuPlotCode << ss << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      const Zoltan2::PartitioningSolution<Adapter> *soln_,
      int coordDim,
      tcoord_t **partCenters) {

    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());

    std::vector<Zoltan2::coordinateModelPartBox<tcoord_t, part_t> >
      outPartBoxes =
        ((Zoltan2::PartitioningSolution<Adapter> *)soln_)->
          getPartBoxesView();

    for (part_t i = 0; i < this->ntasks; ++i) {
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2) {
      ff << "plot \"2d.txt\"" << std::endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
    }
    mm.close();

    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
       << std::endl;

    // draw an arrow for every communicating part pair
    for (part_t i = 0; i < this->ntasks; ++i) {
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i + 1];
      for (part_t p = pb; p < pe; ++p) {
        part_t n = task_communication_adj[p];

        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
          " to ";

        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
          " as 5\n";

        ff << arrowline;
      }
    }
    ff << "replot\n pause -1" << std::endl;
    ff.close();
  }
  void getProcTask(part_t* &proc_to_task_xadj_,
                   part_t* &proc_to_task_adj_) {
    proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
    proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
  }
  // stores, for every local coordinate, the rank its part was mapped to
  void create_local_task_to_rank(
      const lno_t num_local_coords,
      const part_t *local_coord_parts,
      const ArrayRCP<part_t> task_to_proc_) {
    local_task_to_rank = ArrayRCP<part_t>(num_local_coords);

    for (lno_t i = 0; i < num_local_coords; ++i) {
      part_t local_coord_part = local_coord_parts[i];
      part_t rank_index = task_to_proc_[local_coord_part];
      local_task_to_rank[i] = rank_index;
    }
  }
  /*! \brief Constructor: computes the task communication graph from
   *  the coordinate model and the partitioning solution in order to
   *  calculate the communication metric.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const Teuchos::RCP<const Zoltan2::PartitioningSolution<Adapter> >
        soln_,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
      PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
                                soln_, envConst) {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
        input_adapter_.getRawPtr()), false));

    modelFlag_t coordFlags_, graphFlags_;

    // create the coordinate model: the adapter of a coordinate task
    // mapper must provide coordinates
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, coordFlags_));

    // if the adapter also supports a graph model, the graph is used
    // to calculate the mapping cost
    if (input_adapter_->adapterType() == MatrixAdapterType ||
        input_adapter_->adapterType() == GraphAdapterType ||
        input_adapter_->adapterType() == MeshAdapterType) {
      graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, graphFlags_));
    }

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping");
    }
    // get the machine dimension and the number of ranks
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    // get the processor coordinates
    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }

    // get the machine extent; when the machine has wrap-around links,
    // the coordinates are shifted so the largest gap sits at the ends
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    for (int i = 0; i < procDim; ++i)
      machine_extent_wrap_around[i] = false;

    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }

    // get the task information: coordinate dimension, number of parts
    int coordDim = coordinateModel_->getCoordinateDim();
    this->ntasks = soln_->getActualGlobalNumberOfParts();
    if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
      this->ntasks = soln_->getTargetGlobalNumberOfParts();
    }
    this->solution_parts = soln_->getPartListView();

    // alloc memory for the part centers
    tcoord_t **partCenters = new tcoord_t*[coordDim];
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = new tcoord_t[this->ntasks];
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // get the center coordinates of the parts
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
      envConst.getRawPtr(),
      ia_comm.getRawPtr(),
      coordinateModel_.getRawPtr(),
      this->solution_parts,
      coordDim,
      this->ntasks,
      partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
    // create the part graph
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        graph_model_.getRawPtr(),
        this->ntasks,
        this->solution_parts,
        task_communication_xadj,
        task_communication_adj,
        task_communication_edge_weight);
    }

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());
    int myRank = comm_->getRank();

    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");

    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
    if (comm_->getRank() == 0) {

      part_t taskCommCount = task_communication_xadj.size();
      std::cout << " TotalComm:"
                << task_communication_xadj[taskCommCount - 1] << std::endl;
      part_t maxN = task_communication_xadj[0];
      for (part_t i = 1; i <= taskCommCount; ++i) {
        part_t nc =
          task_communication_xadj[i] - task_communication_xadj[i - 1];
        if (maxN < nc)
          maxN = nc;
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");

    if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
        task_communication_adj.getRawPtr()) {
      this->proc_task_comm->calculateCommunicationCost(
        this->task_to_proc.getRawPtr(),
        task_communication_xadj.getRawPtr(),
        task_communication_adj.getRawPtr(),
        task_communication_edge_weight.getRawPtr());

      // every processor group picks the best mapping it has seen
      this->getBestMapping();
    }

    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    this->create_local_task_to_rank(
      coordinateModel_->getLocalNumCoordinates(),
      this->solution_parts,
      this->task_to_proc);
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      delete [] partCenters[i];
    }
    delete [] partCenters;
  }
  /*! \brief Constructor. Instead of a Solution it takes two
   *  parameters: the number of parts and the result part array.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const part_t num_parts_,
      const part_t *result_parts,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
      PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
                                num_parts_, result_parts, envConst) {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }
    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
        input_adapter_.getRawPtr()), false));

    modelFlag_t coordFlags_, graphFlags_;

    // create the coordinate model
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
      baseInputAdapter_, envConst_, ia_comm, coordFlags_));

    // use the graph model for the cost calculation if available
    if (input_adapter_->adapterType() == MatrixAdapterType ||
        input_adapter_->adapterType() == GraphAdapterType ||
        input_adapter_->adapterType() == MeshAdapterType) {
      graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, graphFlags_));
    }

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping.");
    }
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    // get the processor coordinates
    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }

    // get the machine extent; with wrap-around links the coordinates
    // are shifted so the largest gap sits at the ends
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }

    int coordDim = coordinateModel_->getCoordinateDim();
    this->ntasks = num_parts_;

    // alloc memory for the part centers
    tcoord_t **partCenters = new tcoord_t*[coordDim];
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = new tcoord_t[this->ntasks];
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // get the center coordinates of the parts
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
      envConst.getRawPtr(),
      ia_comm.getRawPtr(),
      coordinateModel_.getRawPtr(),
      result_parts,
      coordDim,
      this->ntasks,
      partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
    // create the part graph
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        graph_model_.getRawPtr(),
        this->ntasks,
        result_parts,
        task_communication_xadj,
        task_communication_adj,
        task_communication_edge_weight);
    }

    envConst->timerStart(MACRO_TIMERS,
                         "CoordinateCommunicationModel Create");

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());

    envConst->timerStop(MACRO_TIMERS,
                        "CoordinateCommunicationModel Create");
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = comm_->getRank();

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");

    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
    if (comm_->getRank() == 0) {

      part_t taskCommCount = task_communication_xadj.size();
      std::cout << " TotalComm:"
                << task_communication_xadj[taskCommCount - 1] << std::endl;
      part_t maxN = task_communication_xadj[0];
      for (part_t i = 1; i <= taskCommCount; ++i) {
        part_t nc =
          task_communication_xadj[i] - task_communication_xadj[i - 1];
        if (maxN < nc)
          maxN = nc;
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");

    if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
        task_communication_adj.getRawPtr()) {
      this->proc_task_comm->calculateCommunicationCost(
        this->task_to_proc.getRawPtr(),
        task_communication_xadj.getRawPtr(),
        task_communication_adj.getRawPtr(),
        task_communication_edge_weight.getRawPtr());

      this->getBestMapping();
    }

    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    this->create_local_task_to_rank(
      coordinateModel_->getLocalNumCoordinates(),
      result_parts,
      this->task_to_proc);
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      delete [] partCenters[i];
    }
    delete [] partCenters;
  }
  /*! \brief Constructor that performs the mapping directly on raw
   *  coordinate arrays.  The result can be queried with
   *  getAssignedProcForTask and getPartsForProc.
   */
  CoordinateTaskMapper(
      const Environment *env_const_,
      const Teuchos::Comm<int> *problemComm,
      int proc_dim,
      int num_processors,
      pcoord_t **machine_coords,
      int task_dim,
      part_t num_tasks,
      tcoord_t **task_coords,
      ArrayRCP<part_t> task_comm_xadj,
      ArrayRCP<part_t> task_comm_adj,
      pcoord_t *task_communication_edge_weight_,
      int recursion_depth,
      Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
      const part_t *machine_dimensions,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
      PartitionMapping<Adapter>(
        Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
        Teuchos::rcpFromRef<const Environment>(*env_const_)) {
    pcoord_t **virtual_machine_coordinates = machine_coords;
    bool *wrap_arounds = new bool[proc_dim];
    for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;

    if (machine_dimensions) {
      virtual_machine_coordinates =
        this->shiftMachineCoordinates(
          proc_dim,
          machine_dimensions,
          wrap_arounds,
          num_processors,
          machine_coords);
    }

    this->nprocs = num_processors;

    int coordDim = task_dim;
    this->ntasks = num_tasks;

    // the task coordinates serve directly as the part centers
    tcoord_t **partCenters = task_coords;

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
      pcoord_t, tcoord_t, part_t, node_t>(
        proc_dim,
        virtual_machine_coordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        NULL,
        NULL);

    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = problemComm->getRank();

    this->doMapping(myRank, this->comm);

    if (reduce_best_mapping && task_comm_xadj.getRawPtr() &&
        task_comm_adj.getRawPtr()) {
      this->proc_task_comm->calculateCommunicationCost(
        this->task_to_proc.getRawPtr(),
        task_comm_xadj.getRawPtr(),
        task_comm_adj.getRawPtr(),
        task_communication_edge_weight_);

      this->getBestMapping();
    }

    delete [] wrap_arounds;

    if (machine_dimensions) {
      for (int i = 0; i < proc_dim; ++i) {
        delete [] virtual_machine_coordinates[i];
      }
      delete [] virtual_machine_coordinates;
    }
    if (problemComm->getRank() == 0)
      this->writeMapping2(myRank);
  }
  /*! \brief Using the machine dimensions provided, creates virtual
   *  machine coordinates by assigning the largest gap to the
   *  wrap-around ends of each dimension.
   */
  pcoord_t **shiftMachineCoordinates(
      int machine_dim,
      const part_t *machine_dimensions,
      bool *machine_extent_wrap_around,
      part_t numProcs,
      pcoord_t **mCoords) {

    pcoord_t **result_machine_coords = NULL;
    result_machine_coords = new pcoord_t*[machine_dim];

    for (int i = 0; i < machine_dim; ++i) {
      result_machine_coords[i] = new pcoord_t[numProcs];
    }

    for (int i = 0; i < machine_dim; ++i) {
      part_t numMachinesAlongDim = machine_dimensions[i];

      // count the processors at each coordinate along this dimension
      part_t *machineCounts = new part_t[numMachinesAlongDim];
      memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);

      int *filledCoordinates = new int[numMachinesAlongDim];

      pcoord_t *coords = mCoords[i];

      for (part_t j = 0; j < numProcs; ++j) {
        part_t mc = (part_t) coords[j];
        ++machineCounts[mc];
      }

      part_t filledCoordinateCount = 0;
      for (part_t j = 0; j < numMachinesAlongDim; ++j) {
        if (machineCounts[j] > 0) {
          filledCoordinates[filledCoordinateCount++] = j;
        }
      }

      part_t firstProcCoord = filledCoordinates[0];
      part_t firstProcCount = machineCounts[firstProcCoord];

      part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
      part_t lastProcCount = machineCounts[lastProcCoord];

      // the gap crossing the wrap-around link
      part_t firstLastGap =
        numMachinesAlongDim - lastProcCoord + firstProcCoord;
      part_t firstLastGapProc = lastProcCount + firstProcCount;

      part_t leftSideProcCoord = firstProcCoord;
      part_t leftSideProcCount = firstProcCount;
      part_t biggestGap = 0;
      part_t biggestGapProc = numProcs;

      // find the largest interior gap; ties prefer fewer processors
      // at the gap ends
      part_t shiftBorderCoordinate = -1;
      for (part_t j = 1; j < filledCoordinateCount; ++j) {
        part_t rightSideProcCoord = filledCoordinates[j];
        part_t rightSideProcCount = machineCounts[rightSideProcCoord];

        part_t gap = rightSideProcCoord - leftSideProcCoord;
        part_t gapProc = rightSideProcCount + leftSideProcCount;

        if (gap > biggestGap ||
            (gap == biggestGap && biggestGapProc > gapProc)) {
          shiftBorderCoordinate = rightSideProcCoord;
          biggestGapProc = gapProc;
          biggestGap = gap;
        }
        leftSideProcCoord = rightSideProcCoord;
        leftSideProcCount = rightSideProcCount;
      }

      // shift only if the interior gap beats the wrap-around gap
      if (!(biggestGap > firstLastGap ||
            (biggestGap == firstLastGap &&
             biggestGapProc < firstLastGapProc))) {
        shiftBorderCoordinate = -1;
      }

      for (part_t j = 0; j < numProcs; ++j) {

        if (machine_extent_wrap_around[i] &&
            coords[j] < shiftBorderCoordinate) {
          result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
        }
        else {
          result_machine_coords[i][j] = coords[j];
        }
      }
      delete [] machineCounts;
      delete [] filledCoordinates;
    }

    return result_machine_coords;
  }
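  /*
    Worked example (illustrative): along a wrapped dimension of extent 8
    with occupied coordinates {0, 1, 6, 7}, the interior gap 6 - 1 = 5
    beats the wrap-around gap 8 - 7 + 0 = 1, so shiftBorderCoordinate
    becomes 6 and coordinates below it are lifted by the extent:
    {0, 1, 6, 7} maps to {8, 9, 6, 7}, turning the torus neighbors into
    the contiguous block 6..9.
  */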
  /*! \brief Returns the assigned tasks with the number of tasks.
   */
  virtual void getPartsForProc(int procId, part_t &numParts,
                               part_t *&parts) const {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    parts = this->proc_to_task_adj.getRawPtr() + task_begin;
    numParts = taskend - task_begin;
  }

  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    if (taskend - task_begin > 0) {
      ArrayView<part_t> assignedParts(
        this->proc_to_task_adj.getRawPtr() + task_begin,
        taskend - task_begin);
      return assignedParts;
    }
    else {
      ArrayView<part_t> assignedParts;
      return assignedParts;
    }
  }
};
/*! \brief The interface function that calls CoordinateTaskMapper,
 *  which will also perform the mapping operation, and copies the
 *  resulting proc-to-task CRS arrays into the caller's buffers.
 */
template <typename part_t, typename pcoord_t, typename tcoord_t>
void coordinateTaskMapperInterface(
    RCP<const Teuchos::Comm<int> > problemComm,
    int proc_dim,
    int num_processors,
    pcoord_t **machine_coords,
    int task_dim,
    part_t num_tasks,
    tcoord_t **task_coords,
    part_t *task_comm_xadj,
    part_t *task_comm_adj,
    pcoord_t *task_communication_edge_weight_,
    part_t *proc_to_task_xadj,
    part_t *proc_to_task_adj,
    int recursion_depth,
    Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
    const part_t *machine_dimensions,
    int num_ranks_per_node = 1,
    bool divide_to_prime_first = false) {

  const Environment *envConst_ = new Environment(problemComm);

  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;

  Teuchos::ArrayRCP<part_t> task_communication_xadj(
    task_comm_xadj, 0, num_tasks + 1, false);

  Teuchos::ArrayRCP<part_t> task_communication_adj;
  if (task_comm_xadj) {
    Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
      task_comm_adj, 0, task_comm_xadj[num_tasks], false);
    task_communication_adj = tmp_task_communication_adj;
  }

  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>
    *ctm = new CoordinateTaskMapper<
      XpetraMultiVectorAdapter<tMVector_t>, part_t>(
        envConst_,
        problemComm.getRawPtr(),
        proc_dim,
        num_processors,
        machine_coords,
        task_dim,
        num_tasks,
        task_coords,
        task_communication_xadj,
        task_communication_adj,
        task_communication_edge_weight_,
        recursion_depth,
        part_no_array,
        machine_dimensions,
        num_ranks_per_node,
        divide_to_prime_first);

  part_t* proc_to_task_xadj_;
  part_t* proc_to_task_adj_;

  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);

  // copy the result into the caller's arrays
  for (part_t i = 0; i <= num_processors; ++i) {
    proc_to_task_xadj[i] = proc_to_task_xadj_[i];
  }

  for (part_t i = 0; i < num_tasks; ++i) {
    proc_to_task_adj[i] = proc_to_task_adj_[i];
  }
  delete ctm;
}
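/*
  Usage sketch (hypothetical sizes and caller-allocated arrays): map 4
  tasks with 2-d coordinates onto 4 processors of a 2-d machine.

    // machine_coords[2][4] and task_coords[2][4] hold the coordinates;
    // task_comm_xadj/task_comm_adj describe the task graph in CRS form;
    // proc_to_task_xadj (size 5) and proc_to_task_adj (size 4) receive
    // the result.
    Zoltan2::coordinateTaskMapperInterface<int, double, double>(
      problemComm,           // RCP<const Teuchos::Comm<int> >
      2, 4, machine_coords,
      2, 4, task_coords,
      task_comm_xadj, task_comm_adj,
      NULL,                  // no edge weights
      proc_to_task_xadj,
      proc_to_task_adj,
      3,                     // recursion_depth
      Kokkos::View<int *, Kokkos::HostSpace>(),  // empty part_no_array
      NULL);                 // no machine_dimensions
*/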
// writes gnuplot code that draws the task communication arrows on top
// of the machine coordinates (3-d machines only)
template <typename proc_coord_t, typename v_lno_t>
void visualize_mapping(
    int myRank,
    const int machine_coord_dim,
    const int num_ranks,
    proc_coord_t **machine_coords,
    const v_lno_t num_tasks,
    const v_lno_t *task_communication_xadj,
    const v_lno_t *task_communication_adj,
    const int *task_to_rank) {

  std::string rankStr = Teuchos::toString<int>(myRank);
  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
  std::string outF = gnuPlots + rankStr + extentionS;
  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

  if (machine_coord_dim != 3) {
    std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
    return;
  }
  std::string ss = "";
  std::string procs = "";

  // draw each inter-node arrow only once
  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;

  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
    int origin_rank = task_to_rank[origin_task];
    std::string gnuPlotArrow = "set arrow from ";

    for (int j = 0; j < machine_coord_dim; ++j) {
      if (j == machine_coord_dim - 1) {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
      }
      else {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + ",";
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + " ";
      }
    }
    procs += "\n";

    gnuPlotArrow += " to ";

    for (int nind = task_communication_xadj[origin_task];
         nind < task_communication_xadj[origin_task + 1]; ++nind) {

      int neighbor_task = task_communication_adj[nind];

      bool differentnode = false;
      int neighbor_rank = task_to_rank[neighbor_task];

      for (int j = 0; j < machine_coord_dim; ++j) {
        if (int(machine_coords[j][origin_rank]) !=
            int(machine_coords[j][neighbor_rank])) {
          differentnode = true;
          break;
        }
      }

      std::tuple<int, int, int, int, int, int> foo(
        (int)(machine_coords[0][origin_rank]),
        (int)(machine_coords[1][origin_rank]),
        (int)(machine_coords[2][origin_rank]),
        (int)(machine_coords[0][neighbor_rank]),
        (int)(machine_coords[1][neighbor_rank]),
        (int)(machine_coords[2][neighbor_rank]));

      if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
        my_arrows.insert(foo);

        std::string gnuPlotArrow2 = "";
        for (int j = 0; j < machine_coord_dim; ++j) {
          if (j == machine_coord_dim - 1) {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
          }
          else {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank])
              + ",";
          }
        }
        ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
      }
    }
  }

  std::ofstream procFile("procPlot.plot", std::ofstream::out);
  procFile << procs << "\n";
  procFile.close();

  if (machine_coord_dim == 2) {
    gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
  }
  else {
    gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
  }

  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
  gnuPlotCode.close();
}

} // namespace Zoltan2

#endif