#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

// ...

#include "Teuchos_ArrayViewDecl.hpp"
// ...
#include "Teuchos_ReductionOp.hpp"
// ...
#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>
// ...

// Zoltan2_ReduceBestMapping: reduces (cost, rank) pairs to the
// minimum-cost entry, breaking ties with the minimum rank.
template <typename Ordinal, typename T>
// ... (class wrapper and _EPSILON member elided)

/*! \brief Implement Teuchos::ValueTypeReductionOp interface.
 */
void reduce(const Ordinal count,
            const T inBuffer[],
            T inoutBuffer[]) const {

  for (Ordinal i = 0; i < count; i++) {
    // inBuffer[0] holds a cost, inBuffer[1] the rank proposing it
    if (inBuffer[0] - inoutBuffer[0] < -_EPSILON) {
      // strictly smaller cost wins
      inoutBuffer[0] = inBuffer[0];
      inoutBuffer[1] = inBuffer[1];
    }
    else if (inBuffer[0] - inoutBuffer[0] < _EPSILON &&
             inBuffer[1] - inoutBuffer[1] < _EPSILON) {
      // equal cost within _EPSILON: the lower rank wins
      inoutBuffer[0] = inBuffer[0];
      inoutBuffer[1] = inBuffer[1];
    }
  }
}
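// Usage sketch, inferred from getBestMapping() later in this file (the
// names localCost/globalCost are only assumptions for this example):
// pack (cost, rank) into a 2-element buffer and reduce it across the
// communicator, so the cheapest mapping and its owner rank survive.
//
//   Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   double localCost[2] = { myCost, double(comm->getRank()) };
//   double globalCost[2];
//   Teuchos::reduceAll<int, double>(*comm, reduceBest,
//                                   2, localCost, globalCost);
//   int winner = int(globalCost[1]);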
// Returns x! (factorial); assumes x >= 1 (the recursion has no base
// case for 0).
template <typename it>
it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}
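// For example, z2Fact<int>(4) == 24. In this file the argument is a
// coordinate dimensionality (e.g., 3 for a 3-D machine), so the
// factorials stay tiny and overflow is not a practical concern.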
// ...
template <typename gno_t, typename part_t>
// ... (helper elided)

// Writes the i-th lexicographic permutation of {0, ..., n-1} into perm,
// via the factorial number system.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm) {

  IT j, k = 0;
  IT *fact = new IT[n];

  // compute the factorials
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute the factoradic digits of i
  for (k = 0; k < n; ++k) {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust the digits to obtain the permutation: walk from the end
  // and bump each entry past the earlier entries that are not larger
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}
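// Worked example: n = 3, i = 3. The factoradic digits are {1, 1, 0};
// after the readjustment pass perm = {1, 2, 0}, which is indeed the
// fourth (i = 3, counting from 0) of the six lexicographic permutations
// 012, 021, 102, 120, 201, 210.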
// Builds the communication graph of a dense Cartesian task grid in CSR
// form: task i's neighbors are task_comm_adj[task_comm_xadj[i]] ..
// task_comm_adj[task_comm_xadj[i+1] - 1].
template <typename part_t>
void getGridCommunicationGraph(
    part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims) {

  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = allocMemory<part_t>(taskCount + 1);
  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;

  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];

      // the division test rejects candidates that wrap into another
      // row of the grid along dimension j
      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
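// For a 2x2 grid (taskCount = 4, grid_dims = {2, 2}) this produces
//   task_comm_xadj = {0, 2, 4, 6, 8}
//   task_comm_adj  = {1, 2,  0, 3,  3, 0,  2, 1}
// i.e., every task sees its left/right neighbor along each dimension
// that actually exists inside the grid.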
// Computes the centroid of the coordinates assigned to each part,
// across all ranks.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;
  // ... (input_t typedef and numLocalCoords =
  //      coords->getLocalNumCoordinates() elided)

  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  // ... (filled by coords->getCoordinates(gnos, xyz, wgts))

  // count the local points of each part
  gno_t *point_counts = allocMemory<gno_t>(ntasks);
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  // global number of points in each part
  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);

  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);

  for (int dim = 0; dim < coordDim; dim++) {
    ArrayRCP<const scalar_t> ar;
    xyz[dim].getInputArray(ar);
    // ...
    multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  // sum up the local coordinates of each part
  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  // obtain the global point counts
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);

  // turn the local sums into local contributions to the average
  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
  for (int j = 0; j < coordDim; ++j) {
    // sum the partial averages into the global centers
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);

    // swap buffers so partCenters[j] holds the reduced values
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  freeArray<gno_t>(point_counts);
  freeArray<gno_t>(global_point_counts);
  freeArray<scalar_t>(tmpCoords);
  freeArray<scalar_t *>(multiJagged_coordinates);
}
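// Note on the reduction pattern above: each rank divides its local sums
// by the *global* counts before the REDUCE_SUM, which still yields the
// true average because
//   sum_over_ranks(localSum / globalCount) = globalSum / globalCount.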
// Builds the coarsened part graph: vertices are parts, edges connect
// parts whose vertices share graph edges, and edge weights accumulate
// the underlying edge weights.
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;
  // ... (t_input_t typedef plus localNumVertices =
  //      graph->getLocalNumVertices(), localNumEdges =
  //      graph->getLocalNumEdges(), numWeightPerEdge =
  //      graph->getNumWeightsPerEdge(), elided)

  // vertex ids and their optional weights
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  // ...

  // the edge lists, in CSR form
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  // ...

  std::vector<t_scalar_t> edge_weights;
  // ...
  if (numWeightPerEdge > 0) {
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      edge_weights[i] = e_wgts[0][i];
    }
  }

  // the part of each edge's target vertex
  std::vector<part_t> e_parts(localNumEdges);
#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) {
    // distributed case: a directory maps each global vertex id to its
    // part, so edge targets owned by other ranks can be resolved
    const bool bUseLocalIDs = false;
    // ... (directory_t typedef and debug_level elided)
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
                     NULL, directory_t::Update_Mode::Replace);
    directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
                   NULL, NULL, false);
  }
  else
#endif // HAVE_ZOLTAN2_MPI
  {
    // serial case: resolve every edge target directly
    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      // ... (e_parts[i] = parts[ei];)
    }

    // bucket the local vertices by part, as linked lists threaded
    // through part_begins/part_nexts
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    // ... (part_t nindex = 0; g_part_xadj[0] = 0;)

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // coarsen the graph, one part at a time
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      // ... (walk the linked list of part i's vertices)
      for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
        // ... (part_t ep = e_parts[j]; t_scalar_t ew = 1;)
        if (numWeightPerEdge > 0) {
          ew = edge_weights[j];
        }
        // a part not seen yet still has (near-)zero accumulated weight
        if (part_neighbor_weights[ep] < 0.00001) {
          part_neighbors[num_neighbor_parts++] = ep;
        }
        part_neighbor_weights[ep] += ew;
      }
      // ... (advance v = part_nexts[v] until the list is exhausted)

      // flush part i's accumulated neighbors into the CSR arrays
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
    // ... (the serial path returns here; the code below runs only in
    //      the distributed case)
  }
  // Accumulator merged by the directory: entries with the same
  // destination_part compare equal, and their weights add up.
  struct part_info {
    part_info() : weight(0) {
    }

    const part_info & operator+=(const part_info & src) {
      this->weight += src.weight;
      return *this;
    }

    bool operator>(const part_info & src) {
      return (destination_part > src.destination_part);
    }

    bool operator==(const part_info & src) {
      return (destination_part == src.destination_part);
    }

    part_t destination_part;
    t_scalar_t weight;
  };
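  // These operators are what directory_t::Update_Mode::AggregateAdd
  // (used below) relies on: operator== identifies entries for the same
  // destination part arriving from different ranks, and operator+=
  // merges them by summing their weights, so each part's remote
  // adjacency lists are combined into a single weighted list.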
  // distributed case: aggregate each part's neighbor list through a
  // directory keyed by part id
  bool bUseLocalIDs = false;
  const int debug_level = 0;
  const RCP<const Comm<int> > rcp_comm(comm, false);
  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);

  std::vector<part_t> part_data;
  std::vector<std::vector<part_info>> user_data;
  // ...

  // bucket the local vertices by part (same linked-list scheme as above)
  std::vector<t_lno_t> part_begins(np, -1);
  std::vector<t_lno_t> part_nexts(localNumVertices, -1);

  for (t_lno_t i = 0; i < localNumVertices; ++i) {
    part_t ap = parts[i];
    part_nexts[i] = part_begins[ap];
    part_begins[ap] = i;
  }

  std::vector<part_t> part_neighbors(np);
  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

  // accumulate the local neighbor weights of each part
  for (t_lno_t i = 0; i < np; ++i) {
    part_t num_neighbor_parts = 0;
    t_lno_t v = part_begins[i];
    // ... (walk the linked list of part i's vertices)
    for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
      // ... (part_t ep = e_parts[j]; t_scalar_t ew = 1;)
      if (numWeightPerEdge > 0) {
        ew = edge_weights[j];
      }
      // ...
      if (part_neighbor_weights[ep] < 0.00001) {
        part_neighbors[num_neighbor_parts++] = ep;
      }
      part_neighbor_weights[ep] += ew;
    }
    // ...

    // pack the local result for part i
    for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
      part_t neighbor_part = part_neighbors[j];
      part_neighbor_weights_ordered[j] =
        part_neighbor_weights[neighbor_part];
      part_neighbor_weights[neighbor_part] = 0;
    }

    if (num_neighbor_parts > 0) {
      part_data.push_back(i);
      std::vector<part_info> new_user_data(num_neighbor_parts);
      for (int q = 0; q < num_neighbor_parts; ++q) {
        part_info & info = new_user_data[q];
        info.weight = part_neighbor_weights_ordered[q];
        info.destination_part = part_neighbors[q];
      }
      user_data.push_back(new_user_data);
    }
  }

  // every rank asks for every part
  std::vector<part_t> part_indices(np);
  for (part_t i = 0; i < np; ++i) part_indices[i] = i;

  // merge the per-rank lists: AggregateAdd sums the weights of entries
  // with equal destination_part (see part_info::operator+= above)
  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
                   NULL, directory_t::Update_Mode::AggregateAdd);

  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
  directory.find(find_user_data.size(), &part_indices[0], NULL,
                 &find_user_data[0], NULL, NULL, false);

  // total length of the aggregated adjacency lists
  int get_total_length = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    get_total_length += find_user_data[n].size();
  }

  // unpack the directory's answer into CSR form
  g_part_xadj = ArrayRCP<part_t>(np + 1);
  g_part_adj = ArrayRCP<part_t>(get_total_length);
  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

  int track_insert_index = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    g_part_xadj[n] = track_insert_index;
    const std::vector<part_info> & user_data_vector = find_user_data[n];
    for (size_t q = 0; q < user_data_vector.size(); ++q) {
      const part_info & info = user_data_vector[q];
      g_part_adj[track_insert_index] = info.destination_part;
      g_part_ew[track_insert_index] = info.weight;
      ++track_insert_index;
    }
  }
  g_part_xadj[np] = get_total_length;
}
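// After this routine, (g_part_xadj, g_part_adj, g_part_ew) describe the
// part graph in CSR form. For instance, with np = 3 parts where parts 0
// and 1 exchange weight 5.0 and parts 1 and 2 exchange weight 2.0, one
// would get (values for illustration only):
//   g_part_xadj = {0, 1, 3, 4}
//   g_part_adj  = {1,   0, 2,   1}
//   g_part_ew   = {5.0, 5.0, 2.0, 2.0}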
// KmeansHeap: a fixed-size max-heap that ends up holding the heapSize
// smallest distances seen, together with their point indices.
template <class IT, class WT>
class KmeansHeap {
  // ... (members: IT heapSize; IT *indices; WT *values; WT _EPSILON;)

public:
  ~KmeansHeap() {
    freeArray<IT>(this->indices);
    freeArray<WT>(this->values);
  }

  void setHeapsize(IT heapsize_) {
    this->heapSize = heapsize_;
    this->indices = allocMemory<IT>(heapsize_);
    this->values = allocMemory<WT>(heapsize_);
    // ... (_EPSILON initialization elided)
  }

  // replaces the current maximum when the new distance is smaller
  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    // ...
    if (distance >= maxVal)
      return;
    this->values[0] = distance;
    this->indices[0] = index;
    this->push_down(0);
  }

  // classic sift-down to restore the max-heap property
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {
      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }

    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();
    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  // total distance of the kept points to the center
  WT getTotalDistance() {
    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];
    }
    return nc;
  }

  // recomputes the center from the kept points; returns whether it moved
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;
    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;
      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
        nc += coords[i][k];
      }
      nc /= this->heapSize;
      moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};
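// Design note: although this is a *max*-heap (the largest kept distance
// sits at the root), it is used to retain the heapSize *smallest*
// distances: a new point only displaces the root when it is closer, so
// after streaming all candidates through addPoint() the heap holds the
// closest ones. This is the standard top-k selection trick.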
// KMeansCluster: a center plus the heap of its closest points.
template <class IT, class WT>
class KMeansCluster {
  // ... (members: int dimension; KmeansHeap<IT,WT> closestPoints;)

public:
  WT *center;
  // ...

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = allocMemory<WT>(dimension_);
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap() {
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  // returns the distance of the coordinate to the center, and offers
  // the point to the heap of closest points
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;
    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    // note: the exponent is 1/dimension rather than 1/2; distances are
    // only compared with each other, and any monotone map of the
    // squared distance preserves that ordering
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);
    return distance;
  }

  WT getDistanceToCenter() {
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};
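// Minimal usage sketch (numPoints and elementCoords are hypothetical
// placeholders for this example):
//
//   KMeansCluster<int, double> c;
//   c.setParams(/*dimension_=*/3, /*heapsize=*/4);
//   c.clearHeap();
//   for (int p = 0; p < numPoints; ++p)
//     c.getDistance(p, elementCoords);   // feeds the closest-point heap
//   bool moved = c.getNewCenters(elementCoords);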
// KMeansAlgorithm: clusters the coordinates and can return the cluster
// whose members are closest to each other.
template <class IT, class WT>
class KMeansAlgorithm {
  // ... (members: int dim; IT numElements; WT **elementCoords;
  //      IT numClusters;)
  IT required_elements;
  // ... (KMeansCluster<IT,WT> *clusters; WT *maxCoordinates;
  //      WT *minCoordinates;)

public:
  ~KMeansAlgorithm() {
    freeArray<KMeansCluster<IT,WT> >(clusters);
    freeArray<WT>(maxCoordinates);
    freeArray<WT>(minCoordinates);
  }

  /*! \brief KMeansAlgorithm Constructor
   */
  KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_,
                  IT required_elements_) :
      dim(dim_),
      numElements(numElements_),
      elementCoords(elementCoords_),
      numClusters((1 << dim_) + 1),
      required_elements(required_elements_) {

    this->clusters = allocMemory<KMeansCluster<IT,WT> >(this->numClusters);

    // set the dimension and the number of required elements for all
    // clusters
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = allocMemory<WT>(this->dim);
    this->minCoordinates = allocMemory<WT>(this->dim);

    // obtain the min and max coordinate along each dimension
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];

      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]) {
          this->maxCoordinates[j] = t;
        }
        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }

    // the first 2^dim clusters start at the corners of the bounding
    // box: along dimension j, cluster i takes the max coordinate when
    // (i % 2^(j+1)) < 2^j and the min coordinate otherwise
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j + 1));
      for (int i = 0; i < numClusters - 1; ++i) {
        WT c;
        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
        }
        else {
          c = this->minCoordinates[j];
        }
        this->clusters[i].center[j] = c;
      }
    }

    // the last cluster starts at the center of the bounding box
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }
  }
  // performs the k-means clustering: at most 10 iterations of
  // assign-points / recompute-centers
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }
      for (IT i = 0; i < this->numElements; ++i) {
        // offer point i to every cluster; each cluster's heap keeps
        // its own closest points
        for (IT j = 0; j < this->numClusters; ++j) {
          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }
      bool moved = false;
      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  // finds the cluster whose members are closest to each other and
  // copies their indices into procPermutation
  void getMinDistanceCluster(IT *procPermutation) {

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();
      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
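// How this is used here (see getClosestSubset below): when there are
// more processors than tasks, the processor coordinates are clustered
// into 2^dim + 1 candidate groups, each required to hold ntasks
// members, and the tightest group becomes the processor subset that
// actually receives tasks.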
#define MINOF(a,b) (((a)<(b))?(a):(b))

/*! \brief fillContinousArray function: fills the array either with the
 *  given value, or (when val == NULL) with the sequence 0, 1, 2, ...
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }
  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = v;
    }
  }
}
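// E.g. fillContinousArray<part_t>(proc_adjList, no_procs, NULL), as used
// by getMapping below, produces the identity permutation
// 0, 1, ..., no_procs - 1, which the recursive partitioner then
// reorders in place.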
// ...

/*! \brief CommunicationModel: base class that performs the mapping
 *  between the coordinate partitioning result and the processors.
 */
template <typename part_t, typename pcoord_t>
// ... (class wrapper elided)

// total cost = sum over all directed task edges of
// hop-distance(owner(task), owner(neighbor)) * edge weight
// (weight 1 when no edge weights are given)
void calculateCommunicationCost(
    part_t *task_to_proc,
    part_t *task_communication_xadj,
    part_t *task_communication_adj,
    pcoord_t *task_communication_edge_weight) {

  double totalCost = 0;

  part_t commCount = 0;
  for (part_t task = 0; task < this->no_tasks; ++task) {
    int assigned_proc = task_to_proc[task];

    part_t task_adj_begin = task_communication_xadj[task];
    part_t task_adj_end = task_communication_xadj[task + 1];

    commCount += task_adj_end - task_adj_begin;

    for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
      part_t neighborTask = task_communication_adj[task2];
      int neighborProc = task_to_proc[neighborTask];
      double distance = getProcDistance(assigned_proc, neighborProc);

      if (task_communication_edge_weight == NULL) {
        totalCost += distance;
      }
      else {
        totalCost += distance * task_communication_edge_weight[task2];
      }
    }
  }
  // ... (the total is stored as the model's communication cost)
}
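// Tiny worked example (illustrative numbers): two tasks mapped to procs
// 0 and 3 with hop distance 2 and edge weight 5 contribute 2 * 5 = 10
// per direction, i.e. 20 to totalCost, since the CSR task graph stores
// each undirected edge twice.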
// pure virtual mapping function: fills the proc-to-task and
// task-to-proc arrays
virtual void getMapping(
    int myRank,
    const RCP<const Environment> &env,
    ArrayRCP<part_t> &proc_to_task_xadj,
    ArrayRCP<part_t> &proc_to_task_adj,
    ArrayRCP<part_t> &task_to_proc,
    const Teuchos::RCP<const Teuchos::Comm<int> > comm_
) const = 0;
// ...

/*! \brief CoordinateCommunicationModel: maps the coordinate
 *  partitioning result to the processors using the machine coordinates.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t>
// ... (class declaration elided)
/*! \brief Class Constructor:
 */
CoordinateCommunicationModel(
    int pcoord_dim_,
    pcoord_t **pcoords_,
    int tcoord_dim_,
    tcoord_t **tcoords_,
    part_t no_procs_,
    part_t no_tasks_,
    int *machine_extent_,
    bool *machine_extent_wrap_around_,
    const MachineRepresentation<pcoord_t, part_t> *machine_ = NULL)
// ... (member initialization elided)

/*! \brief Called whenever nprocs > no_tasks: selects, via k-means, the
 *  subset of processors that will actually receive tasks.
 */
void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                      part_t ntasks) const {
  // ... (minCoordDim = MINOF(task_coord_dim, proc_coord_dim))
  KMeansAlgorithm<part_t, pcoord_t> kma(
      minCoordDim, nprocs,
      this->proc_coords, ntasks);
  // ... (kma.kmeans(); kma.getMinDistanceCluster(proc_permutation);)

  // the processors that were not selected are marked with -1
  for (int i = ntasks; i < nprocs; ++i) {
    proc_permutation[i] = -1;
  }
}

// temporary: a Lehmer/Park-Miller style generator (Schrage's method,
// modulus 2^31 - 1) used to randomize the visit orders
static part_t umpa_uRandom(part_t l, int &_u_umpa_seed) {
  int a = 16807;
  int m = 2147483647;  // 2^31 - 1
  int q = 127773;      // m / a
  int r = 2836;        // m % a
  int lo, hi, test;
  double d;

  lo = _u_umpa_seed % q;
  hi = _u_umpa_seed / q;
  test = (a * lo) - (r * hi);
  if (test > 0)
    _u_umpa_seed = test;
  else
    _u_umpa_seed = test + m;
  d = (double)((double)_u_umpa_seed / (double)m);
  return (part_t)(d * (double)l);
}
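// This is the classic MINSTD recurrence: seed <- 16807 * seed
// mod (2^31 - 1), computed with Schrage's trick to avoid 32-bit
// overflow, then scaled into [0, l). It only needs to be cheap and
// deterministic, not statistically strong, since it merely shuffles
// visit orders.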
virtual double getProcDistance(int procId1, int procId2) const {
  pcoord_t distance = 0;
  // ... (without a machine model, the distance is computed from the
  //      processor coordinates, honoring wrap-around extents; with
  //      one, the machine reports the hop count:)
  this->machine->getHopCount(procId1, procId2, distance);
  // ...
  return distance;
}

// fills the task/processor visit orders, optionally shuffled with
// umpa_uRandom
void update_visit_order(part_t *visitOrder, part_t n,
                        int &_u_umpa_seed, part_t rndm) {
  // ...
  for (i = 0; i < n; i += 16) {
    // ... (blockwise random swaps of visitOrder entries)
  }
  // ...
  for (i = 1; i < end; i++) {
    // ...
  }
  // ...
}
/*! \brief Maps tasks to processors: both the processor and the task
 *  coordinates are recursively partitioned with multi-jagged, and the
 *  resulting buckets are matched one to one.
 */
virtual void getMapping(
    int myRank,
    const RCP<const Environment> &env,
    ArrayRCP<part_t> &rcp_proc_to_task_xadj,
    ArrayRCP<part_t> &rcp_proc_to_task_adj,
    ArrayRCP<part_t> &rcp_task_to_proc,
    const Teuchos::RCP<const Teuchos::Comm<int> > comm_
) const {

  rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
  rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
  rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

  // holds the pointer to the task number that is in the processor
  part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
  part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
  // holds the processors mapped to tasks
  part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

  // initialize proc_to_task_xadj with zeros
  part_t invalid = 0;
  fillContinousArray<part_t>(proc_to_task_xadj,
                             this->no_procs + 1, &invalid);

  // ... (recursion_depth defaults from the part array when one is
  //      given; otherwise it is estimated from the processor count)
  if (this->divide_to_prime_first) {
    // deeper recursion: dividing by primes first gives uneven branches
    recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
  }
  else {
    recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
  }
  // ... (taskPerm/procPerm are the numbers of dimension-order
  //      permutations for the tasks and processors, via z2Fact)

  // the number of permutations to spread over the ranks: every
  // (task permutation) x (proc permutation) pair ...
  int permutations = taskPerm * procPerm;
  // ... plus the ones where the processors are split along their
  // longest dimension but the tasks in permuted order ...
  permutations += taskPerm;
  // ... plus the ones where the tasks are split along their longest
  // dimension but the processors in permuted order ...
  permutations += procPerm;
  // ... (plus one where both sides use the longest dimension)

  // holds the pointers into proc_adjList
  part_t *proc_xadj = allocMemory<part_t>(num_parts + 1);
  part_t *proc_adjList = allocMemory<part_t>(this->no_procs);
  // ...
  if (this->no_procs > this->no_tasks) {
    // more processors than tasks: keep a permutation of all processors;
    // the unused ones are filtered out later
    // ...
    fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
  }
  // ...

  // each rank tries a different permutation
  int myPermutation = myRank % permutations;
  bool task_partition_along_longest_dim = false;
  bool proc_partition_along_longest_dim = false;

  int myTaskPerm = 0;
  int myProcPerm = 0;

  if (myPermutation == 0) {
    task_partition_along_longest_dim = true;
    proc_partition_along_longest_dim = true;
  }
  else {
    --myPermutation;
    if (myPermutation < taskPerm) {
      proc_partition_along_longest_dim = true;
      // ...
      myTaskPerm = myPermutation;
    }
    else {
      myPermutation -= taskPerm;
      if (myPermutation < procPerm) {
        task_partition_along_longest_dim = true;
        // ...
        myProcPerm = myPermutation;
      }
      else {
        myPermutation -= procPerm;
        // ...
        myProcPerm = myPermutation % procPerm;
        // ...
        myTaskPerm = myPermutation / procPerm;
      }
    }
  }

  // ... (permutation was allocated in elided lines)
  // permute the dimension order in which the processors are divided
  ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
  // ...
  part_t num_group_count = 1;
  part_t *group_count = NULL;

  if (machine)
    num_group_count = machine->getNumUniqueGroups();

  if (num_group_count > 1) {
    group_count = new part_t[num_group_count];
    memset(group_count, 0, sizeof(part_t) * num_group_count);
    machine->getGroupCount(group_count);
  }

  // ...
  env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
  // split the processor coordinates into num_parts buckets
  // (mj_partitioner: the multi-jagged partitioner instance, declared in
  // elided lines)
  mj_partitioner.sequential_task_partitioning(
      // ... (env, coordinate counts, num_parts, the permuted processor
      //      coordinates, proc_adjList, proc_xadj, recursion_depth,
      //      part counts,)
      proc_partition_along_longest_dim
      // ... (, num_ranks_per_node, divide_to_prime_first, group counts)
  );
  env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");

  // ...
  part_t *task_xadj = allocMemory<part_t>(num_parts + 1);
  // ... (task_adjList allocation elided)
  // fill task_adjList st: task_adjList[i] <- i
  fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

  // ...
  // permute the dimension order in which the tasks are divided
  ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);

  // reorder the task coordinate dimensions accordingly
  tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
  // ... (tcoords[i] = this->task_coords[permutation[i]])

  env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
  // split the task centroids into the same number of buckets
  mj_partitioner.sequential_task_partitioning(
      // ... (env, this->no_tasks, ..., num_parts,)
      this->task_coord_dim,
      // ... (tcoords, task_adjList, task_xadj, recursion_depth,
      //      part counts,)
      task_partition_along_longest_dim
      // ...
  );
  env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
  freeArray<pcoord_t *>(tcoords);
  freeArray<int>(permutation);

  // processors and tasks are now bucketed consistently: bucket i of the
  // processors receives bucket i of the tasks
  for (part_t i = 0; i < num_parts; ++i) {

    part_t proc_index_begin = proc_xadj[i];
    part_t task_begin_index = task_xadj[i];
    part_t proc_index_end = proc_xadj[i + 1];
    part_t task_end_index = task_xadj[i + 1];

    if (proc_index_end - proc_index_begin != 1) {
      std::cerr << "Error at partitioning of processors" << std::endl;
      std::cerr << "PART:" << i << " is assigned to "
                << proc_index_end - proc_index_begin << " processors."
                << std::endl;
      // ... (fatal: the run is aborted)
    }
    part_t assigned_proc = proc_adjList[proc_index_begin];
    proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
  }

  // holds the pointer to the task number that is in the processor
  part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);

  // prefix-sum the per-processor task counts into CSR offsets
  part_t sum = 0;
  for (part_t i = 0; i < this->no_procs; ++i) {
    part_t tmp = proc_to_task_xadj[i];
    proc_to_task_xadj[i] = sum;
    sum += tmp;
    proc_to_task_xadj_work[i] = sum;
  }
  proc_to_task_xadj[this->no_procs] = sum;

  for (part_t i = 0; i < num_parts; ++i) {

    part_t proc_index_begin = proc_xadj[i];
    part_t task_begin_index = task_xadj[i];
    part_t task_end_index = task_xadj[i + 1];

    part_t assigned_proc = proc_adjList[proc_index_begin];

    for (part_t j = task_begin_index; j < task_end_index; ++j) {
      part_t taskId = task_adjList[j];

      task_to_proc[taskId] = assigned_proc;

      proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
    }
  }

  // ...
  freeArray<part_t>(proc_to_task_xadj_work);
  freeArray<part_t>(task_xadj);
  freeArray<part_t>(task_adjList);
  freeArray<part_t>(proc_xadj);
  freeArray<part_t>(proc_adjList);
}
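// Summary of the scheme above: both point sets (processor coordinates
// and task centroids) are split by the same recursive multi-jagged
// partitioner into num_parts buckets, with each rank trying a different
// dimension-order permutation. Matching bucket i of the processors to
// bucket i of the tasks yields a mapping in which nearby tasks land on
// nearby processors; getBestMapping() below then keeps the cheapest
// mapping over all ranks' permutations.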
template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
#ifndef DOXYGEN_SHOULD_SKIP_THIS
  // ...
  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;
  // ...
#endif
  // ...

  /*! \brief doMapping function, calls getMapping function of the
   *  communicationModel object.
   */
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {

    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
          myRank,
          this->env,
          this->proc_to_task_xadj,
          this->proc_to_task_adj,
          this->task_to_proc,
          comm_);
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      // ... (exit)
    }
  }

  /*! \brief creates and returns the subcommunicator for the processor
   *  group that shares one full set of permutations.
   */
  RCP<Comm<int> > create_subCommunicator() {
    // ... (procDim/taskDim are the model's coordinate dimensions)

    // number of permutations of the task and processor dimensions
    int taskPerm = z2Fact<int>(procDim);
    int procPerm = z2Fact<int>(taskDim);
    // total number of permutation combinations
    int idealGroupSize = taskPerm * procPerm;
    // plus the longest-dimension variants
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    // an incomplete trailing group is merged into its predecessor
    if (myGroupEnd > commSize) {
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = allocMemory<part_t>(myGroupSize);
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    freeArray<part_t>(myGroup);
    return subComm;
  }

  /*! \brief finds the lowest cost mapping and broadcasts the solution
   *  to everyone.
   */
  void getBestMapping() {
    // create the sub group
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    // calculate this rank's cost
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
    // ...
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);
    // ...

    // broadcast the solution of the lowest cost
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }
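  // The pattern here mirrors the reducer defined at the top of the
  // file: every rank in the permutation group proposes its own mapping
  // with its cost, the (cost, rank) reduce elects the cheapest
  // proposal, and the winning rank broadcasts its task_to_proc /
  // proc_to_task arrays so the whole group ends up with the identical
  // best mapping.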
  // writes the proc->task mapping as gnuplot arrow commands, one data
  // file per processor
  void writeMapping() {
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
    // ... (mindim is the number of plotted dimensions)
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i) {

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0) {
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          // last coordinate: no trailing separator
          // ... (writes proc_coords[j][i] to inpFile and gnuPlotArrow)
        }
        else {
          // ...
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->
                                     proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";
      // ...
      inpFile << std::endl;

      // ... (a = this->getAssignedTasksForProc(i);)
      for (int k = 0; k < a.size(); ++k) {
        // ... (int j = a[k];)
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z) {
          if (z == mindim - 1) {
            // ... (appends task_coords[z][j] without separator)
          }
          else {
            // ...
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->
                                       task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      // ...
    }
    // ...
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  // writes the mapping as gnuplot arrows between nodes; only arrows
  // between different nodes are drawn, each at most once
  void writeMapping2(int myRank) {
    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr + extentionS;
    std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>
      *tmpproc_task_comm =
        // ... (cast of this->proc_task_comm)

    // ... (mindim = MINOF(proc_coord_dim, 3))
    if (mindim != 3) {
      std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
      return;
    }
    std::string ss = "";
    std::string procs = "";

    // (origin node, neighbor node) tuples already drawn
    std::set<std::tuple<int,int,int,int,int,int> > my_arrows;

    for (part_t origin_rank = 0; origin_rank < this->nprocs;
         ++origin_rank) {
      // ... (a = this->getAssignedTasksForProc(origin_rank);)
      if (a.size() == 0) {
        continue;
      }

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
                                     proc_coords[j][origin_rank]);
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
                                     proc_coords[j][origin_rank]);
        }
        else {
          gnuPlotArrow +=
            Teuchos::toString<float>(tmpproc_task_comm->
                                     proc_coords[j][origin_rank]) + ",";
          procs +=
            Teuchos::toString<float>(tmpproc_task_comm->
                                     proc_coords[j][origin_rank]) + " ";
        }
      }
      procs += "\n";

      gnuPlotArrow += " to ";

      for (int k = 0; k < a.size(); ++k) {
        int origin_task = a[k];
        // ... (loop over origin_task's neighbors in the communication
        //      graph; neighbor_rank is the rank of each neighbor task)

        bool differentnode = false;
        // ...
        // only draw arrows between different nodes
        for (int j = 0; j < mindim; ++j) {
          if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
              int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
            differentnode = true;
            break;
          }
        }

        std::tuple<int,int,int, int, int, int> foo(
            int(tmpproc_task_comm->proc_coords[0][origin_rank]),
            int(tmpproc_task_comm->proc_coords[1][origin_rank]),
            int(tmpproc_task_comm->proc_coords[2][origin_rank]),
            int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
            int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));

        if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
          my_arrows.insert(foo);

          std::string gnuPlotArrow2 = "";
          for (int j = 0; j < mindim; ++j) {
            if (j == mindim - 1) {
              gnuPlotArrow2 +=
                Teuchos::toString<float>(tmpproc_task_comm->
                                         proc_coords[j][neighbor_rank]);
            }
            else {
              gnuPlotArrow2 +=
                Teuchos::toString<float>(tmpproc_task_comm->
                                         proc_coords[j][neighbor_rank])
                + ",";
            }
          }
          ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
        }
        // ...
      }
    }

    std::ofstream procFile("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    // ... (procFile.close();)

    if (mindim == 2) {
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    }
    else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }

    gnuPlotCode << ss << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  // writes the part centers and the task communication graph as a
  // gnuplot script, one file per rank
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      const Zoltan2::PartitioningSolution<Adapter> *soln_,
      int coordDim,
      tcoord_t **partCenters) {

    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());
    // ...
    std::vector<Zoltan2::coordinateModelPartBox<tcoord_t, part_t> >
      outPartBoxes =
        // ... (obtained from the solution's part boxes view)

    for (part_t i = 0; i < this->ntasks; ++i) {
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2) {
      ff << "plot \"2d.txt\"" << std::endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
    }
    // ... (mm.close();)

    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
       << std::endl;

    // one arrow per task-graph edge, from the neighbor's center to the
    // task's center
    for (part_t i = 0; i < this->ntasks; ++i) {
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i + 1];
      for (part_t p = pb; p < pe; ++p) {
        part_t n = task_communication_adj[p];

        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
          " to ";

        for (int j = 0; j < coordDim - 1; ++j) {
          arrowline +=
            Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline +=
          Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
          " as 5\n";

        ff << arrowline;
      }
    }

    ff << "replot\n pause -1" << std::endl;
    ff.close();
  }
  void getProcTask(part_t* &proc_to_task_xadj_,
                   part_t* &proc_to_task_adj_) {
    proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
    proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
  }
  // ...

  // precomputes, for every local coordinate, the rank its part was
  // mapped to
  void create_local_task_to_rank(
      const lno_t num_local_coords,
      const part_t *local_coord_parts,
      const ArrayRCP<part_t> task_to_proc_) {
    // ... (local_task_to_rank = ArrayRCP<part_t>(num_local_coords);)
    for (lno_t i = 0; i < num_local_coords; ++i) {
      part_t local_coord_part = local_coord_parts[i];
      part_t rank_index = task_to_proc_[local_coord_part];
      local_task_to_rank[i] = rank_index;
    }
  }
  /*! \brief Constructor. When this constructor is used, the
   *  communication metric is computed from a partitioning solution.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const Teuchos::RCP<const Zoltan2::PartitioningSolution<Adapter> >
        soln_,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      // ... (base class and members initialized from the arguments)
  {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
            input_adapter_.getRawPtr()), false));

    // ... (modelFlag_t coordFlags_, graphFlags_;)

    // a coordinate task mapper requires the adapter to provide
    // coordinates
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
          baseInputAdapter_, envConst_, ia_comm, coordFlags_));

    // when the adapter also provides a graph, a graph model is built
    // and later used for the communication cost
    // ... (guarded by the adapter type:)
      graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
            baseInputAdapter_, envConst_, ia_comm,
            graphFlags_));
    // ...

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping");
    }

    // get the machine dimensions and processor coordinates
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }

    // ...
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    for (int i = 0; i < procDim; ++i)
      machine_extent_wrap_around[i] = false;

    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    // if the machine is a torus, shift the coordinates so the largest
    // gap sits at the boundary
    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
            procDim,
            machine_extent,
            machine_extent_wrap_around,
            this->nprocs,
            procCoordinates);
    }

    // get the task information, such as the coordinate dimension
    int coordDim = coordinateModel_->getCoordinateDim();
    // ... (this->ntasks is set from the solution's number of parts)

    // get the part centers
    tcoord_t **partCenters = NULL;
    partCenters = allocMemory<tcoord_t *>(coordDim);
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        coordinateModel_.getRawPtr(),
        // ... (the part array, coordDim, this->ntasks, partCenters)
    );

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // create the coarsened part graph when a graph model is available
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
          envConst.getRawPtr(),
          ia_comm.getRawPtr(),
          graph_model_.getRawPtr(),
          // ... (this->ntasks, the parts, and the output CSR arrays
          //      task_communication_xadj/adj/edge_weight)
      );
    }

    // create the communication model holding both coordinate sets
    this->proc_task_comm =
      new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
          procDim,
          procCoordinates,
          coordDim,
          partCenters,
          this->nprocs,
          this->ntasks,
          machine_extent,
          machine_extent_wrap_around,
          machine_.getRawPtr());

    int myRank = comm_->getRank();
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ... (the task communication graph is built here)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

    if (comm_->getRank() == 0) {
      // ... (taskCommCount = the size of the task CSR offsets)
      std::cout << " TotalComm:"
                // ... (<< the total edge count)
                << std::endl;
      // find the most connected task
      for (part_t i = 1; i <= taskCommCount; ++i) {
        // ... (maxN = max over i of xadj[i] - xadj[i - 1])
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ... (calculateCommunicationCost / getBestMapping, as configured)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    // ...
    this->create_local_task_to_rank(
        coordinateModel_->getLocalNumCoordinates(),
        // ... (the local part array and this->task_to_proc)
    );

    // ...
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      // the shifted coordinates were allocated here; free them
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      freeArray<tcoord_t>(partCenters[i]);
    }
    freeArray<tcoord_t *>(partCenters);
  }
  /*! \brief Constructor. Instead of a solution, this variant takes the
   *  number of parts and the part array directly.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> >
        machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const part_t num_parts_,
      const part_t *result_parts,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      PartitionMapping<Adapter>(
        comm_, machine_, input_adapter_,
        num_parts_, result_parts, envConst),
      // ...
  {
    using namespace Teuchos;
    typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed) {
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
      rcp(dynamic_cast<const ctm_base_adapter_t *>(
            input_adapter_.getRawPtr()), false));

    // ... (modelFlag_t coordFlags_, graphFlags_;)

    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
          baseInputAdapter_, envConst_, ia_comm, coordFlags_));

    // ... (when the adapter provides a graph:)
      graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
            baseInputAdapter_, envConst_, ia_comm,
            graphFlags_));
    // ...

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide "
                               "coordinates for coordinate task mapping.");
    }

    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
                               "getAllMachineCoordinatesView");
    }

    // ...
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    bool haveWrapArounds =
      machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      procCoordinates =
        this->shiftMachineCoordinates(
            procDim,
            machine_extent,
            machine_extent_wrap_around,
            this->nprocs,
            procCoordinates);
    }

    int coordDim = coordinateModel_->getCoordinateDim();
    // ...
    this->ntasks = num_parts_;

    // get the part centers
    tcoord_t **partCenters = NULL;
    partCenters = allocMemory<tcoord_t *>(coordDim);
    for (int i = 0; i < coordDim; ++i) {
      partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        coordinateModel_.getRawPtr(),
        // ... (result_parts, coordDim, this->ntasks, partCenters)
    );

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // create the coarsened part graph when a graph model is available
    if (graph_model_.getRawPtr() != NULL) {
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
          envConst.getRawPtr(),
          ia_comm.getRawPtr(),
          graph_model_.getRawPtr(),
          // ... (this->ntasks, result_parts, and the output CSR arrays)
      );
    }

    envConst->timerStart(MACRO_TIMERS,
                         "CoordinateCommunicationModel Create");
    this->proc_task_comm =
      new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
          procDim,
          procCoordinates,
          coordDim,
          partCenters,
          this->nprocs,
          this->ntasks,
          machine_extent,
          machine_extent_wrap_around,
          machine_.getRawPtr());
    envConst->timerStop(MACRO_TIMERS,
                        "CoordinateCommunicationModel Create");

    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = comm_->getRank();

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ... (the task communication graph is built here)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

    if (comm_->getRank() == 0) {
      // ...
      std::cout << " TotalComm:"
                // ... (<< the total edge count)
                << std::endl;
      for (part_t i = 1; i <= taskCommCount; ++i) {
        // ... (maxN = max over i of xadj[i] - xadj[i - 1])
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ... (calculateCommunicationCost / getBestMapping, as configured)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    // ...
    this->create_local_task_to_rank(
        coordinateModel_->getLocalNumCoordinates(),
        // ... (result_parts and this->task_to_proc)
    );

    // ...
    delete [] machine_extent_wrap_around;

    if (machine_->getMachineExtent(machine_extent) &&
        haveWrapArounds) {
      for (int i = 0; i < procDim; ++i) {
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i) {
      freeArray<tcoord_t>(partCenters[i]);
    }
    // ...
    freeArray<tcoord_t *>(partCenters);
  }
  /*! \brief Constructor for raw arrays. The mapping is performed here;
   *  the result can be queried with getAssignedProcForTask.
   */
  CoordinateTaskMapper(
      const Environment *env_const_,
      const Teuchos::Comm<int> *problemComm,
      int proc_dim,
      int num_processors,
      pcoord_t **machine_coords,
      int task_dim,
      part_t num_tasks,
      tcoord_t **task_coords,
      ArrayRCP<part_t> task_comm_xadj,
      ArrayRCP<part_t> task_comm_adj,
      pcoord_t *task_communication_edge_weight_,
      int recursion_depth,
      part_t *part_no_array,
      const part_t *machine_dimensions,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true) :
      PartitionMapping<Adapter>(
        Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
        Teuchos::rcpFromRef<const Environment>(*env_const_)),
      // ... (remaining members initialized from the arguments)
  {
    // ...
    pcoord_t **virtual_machine_coordinates = machine_coords;
    bool *wrap_arounds = new bool[proc_dim];
    for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;

    if (machine_dimensions) {
      virtual_machine_coordinates =
        this->shiftMachineCoordinates(
            proc_dim,
            machine_dimensions,
            wrap_arounds,
            num_processors,
            machine_coords);
    }

    this->nprocs = num_processors;

    int coordDim = task_dim;
    this->ntasks = num_tasks;

    // alloc memory for part centers
    tcoord_t **partCenters = task_coords;

    // create the communication model over the (possibly shifted)
    // machine coordinates and the task coordinates
    this->proc_task_comm =
      new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
          proc_dim,
          virtual_machine_coordinates,
          coordDim,
          partCenters,
          this->nprocs,
          this->ntasks,
          NULL, NULL);  // extent arguments: an assumption; elided in
                        // the source fragment

    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    // ...
    int myRank = problemComm->getRank();
    // ... (this->doMapping(myRank, this->comm);)

    // evaluate the cost of the produced mapping
    this->proc_task_comm->calculateCommunicationCost(
        // ... (task_to_proc and the task communication CSR arrays,)
        task_communication_edge_weight_
    );
    // ...

    delete [] wrap_arounds;

    if (machine_dimensions) {
      for (int i = 0; i < proc_dim; ++i) {
        delete [] virtual_machine_coordinates[i];
      }
      delete [] virtual_machine_coordinates;
    }

    // ...
    if (problemComm->getRank() == 0)
      // ... (the mapping is optionally written out)
  }
  /*! \brief Using the machine dimensions provided, create virtual
   *  machine coordinates by placing the largest gap at the wrap-around
   *  boundary of each dimension.
   */
  pcoord_t **shiftMachineCoordinates(
      int machine_dim,
      const part_t *machine_dimensions,
      bool *machine_extent_wrap_around,
      part_t numProcs,
      pcoord_t **mCoords) {

    pcoord_t **result_machine_coords = NULL;
    result_machine_coords = new pcoord_t*[machine_dim];

    for (int i = 0; i < machine_dim; ++i) {
      result_machine_coords[i] = new pcoord_t[numProcs];
    }

    for (int i = 0; i < machine_dim; ++i) {
      part_t numMachinesAlongDim = machine_dimensions[i];

      // how many processors sit at each coordinate along this dimension
      part_t *machineCounts = new part_t[numMachinesAlongDim];
      memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);

      int *filledCoordinates = new int[numMachinesAlongDim];

      pcoord_t *coords = mCoords[i];

      for (part_t j = 0; j < numProcs; ++j) {
        part_t mc = (part_t)coords[j];
        ++machineCounts[mc];
      }

      part_t filledCoordinateCount = 0;
      for (part_t j = 0; j < numMachinesAlongDim; ++j) {
        if (machineCounts[j] > 0) {
          filledCoordinates[filledCoordinateCount++] = j;
        }
      }

      part_t firstProcCoord = filledCoordinates[0];
      part_t firstProcCount = machineCounts[firstProcCoord];

      part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
      part_t lastProcCount = machineCounts[lastProcCoord];

      // the gap across the wrap-around boundary
      part_t firstLastGap =
        numMachinesAlongDim - lastProcCoord + firstProcCoord;
      part_t firstLastGapProc = lastProcCount + firstProcCount;

      part_t leftSideProcCoord = firstProcCoord;
      part_t leftSideProcCount = firstProcCount;
      part_t biggestGap = 0;
      part_t biggestGapProc = numProcs;

      // find the largest interior gap; ties favor the lighter border
      part_t shiftBorderCoordinate = -1;
      for (part_t j = 1; j < filledCoordinateCount; ++j) {
        part_t rightSideProcCoord = filledCoordinates[j];
        part_t rightSideProcCount = machineCounts[rightSideProcCoord];

        part_t gap = rightSideProcCoord - leftSideProcCoord;
        part_t gapProc = rightSideProcCount + leftSideProcCount;

        if (gap > biggestGap ||
            (gap == biggestGap && biggestGapProc > gapProc)) {
          shiftBorderCoordinate = rightSideProcCoord;
          biggestGapProc = gapProc;
          biggestGap = gap;
        }
        leftSideProcCoord = rightSideProcCoord;
        leftSideProcCount = rightSideProcCount;
      }

      // if the wrap-around gap is at least as good, do not shift
      if (!(biggestGap > firstLastGap ||
            (biggestGap == firstLastGap &&
             biggestGapProc < firstLastGapProc))) {
        shiftBorderCoordinate = -1;
      }

      for (part_t j = 0; j < numProcs; ++j) {
        if (machine_extent_wrap_around[i] &&
            coords[j] < shiftBorderCoordinate) {
          result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
        }
        else {
          result_machine_coords[i][j] = coords[j];
        }
      }
      delete [] machineCounts;
      delete [] filledCoordinates;
    }

    return result_machine_coords;
  }
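  // Example of the shift, for one wrap-around dimension of extent 8
  // with occupied coordinates {0, 7}: the interior gap (7) beats the
  // wrap-around gap (8 - 7 + 0 = 1), so the border is set at 7 and
  // coordinate 0 becomes 0 + 8 = 8. The virtual coordinates {8, 7} are
  // then adjacent, matching their true single-hop distance on the torus.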
  // ...
  virtual void getPartsForProc(int procId, part_t &numParts,
                               part_t *&parts) const {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    parts = this->proc_to_task_adj.getRawPtr() + task_begin;
    numParts = taskend - task_begin;
  }

  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    if (taskend - task_begin > 0) {
      ArrayView<part_t> assignedParts(
          this->proc_to_task_adj.getRawPtr() + task_begin,
          taskend - task_begin);
      return assignedParts;
    }
    else {
      // an empty view when this processor received no tasks
      ArrayView<part_t> assignedParts;
      return assignedParts;
    }
  }
/*! \brief The interface function that calls CoordinateTaskMapper,
 *  performs the mapping, and copies the result into the caller's
 *  proc_to_task arrays.
 */
template <typename part_t, typename pcoord_t, typename tcoord_t>
void coordinateTaskMapperInterface(
    RCP<const Teuchos::Comm<int> > problemComm,
    int proc_dim,
    int num_processors,
    pcoord_t **machine_coords,
    int task_dim,
    part_t num_tasks,
    tcoord_t **task_coords,
    part_t *task_comm_xadj,
    part_t *task_comm_adj,
    pcoord_t *task_communication_edge_weight_,
    part_t *proc_to_task_xadj,  /* output */
    part_t *proc_to_task_adj,   /* output */
    int recursion_depth,
    part_t *part_no_array,
    const part_t *machine_dimensions,
    int num_ranks_per_node = 1,
    bool divide_to_prime_first = false) {

  // ...
  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;

  // wrap the raw CSR arrays without taking ownership
  Teuchos::ArrayRCP<part_t> task_communication_xadj(
      task_comm_xadj, 0, num_tasks + 1, false);

  Teuchos::ArrayRCP<part_t> task_communication_adj;
  if (task_comm_xadj) {
    Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
        task_comm_adj, 0, task_comm_xadj[num_tasks], false);
    task_communication_adj = tmp_task_communication_adj;
  }

  // construct the mapper; the raw-array constructor performs the
  // mapping immediately
  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
    new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
      // ... (an Environment created from problemComm,)
      problemComm.getRawPtr(),
      proc_dim,
      num_processors,
      machine_coords,
      task_dim,
      num_tasks,
      task_coords,
      task_communication_xadj,
      task_communication_adj,
      task_communication_edge_weight_,
      recursion_depth,
      part_no_array,
      machine_dimensions,
      num_ranks_per_node,
      divide_to_prime_first);

  part_t* proc_to_task_xadj_;
  part_t* proc_to_task_adj_;

  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);

  // copy the mapping out before the mapper is destroyed
  for (part_t i = 0; i <= num_processors; ++i) {
    proc_to_task_xadj[i] = proc_to_task_xadj_[i];
  }
  for (part_t i = 0; i < num_tasks; ++i) {
    proc_to_task_adj[i] = proc_to_task_adj_[i];
  }
  // ... (delete the mapper and the environment)
}
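// Hedged usage sketch (all names and sizes are hypothetical
// placeholders; assumes the enclosing Zoltan2 namespace and
// caller-owned arrays):
//
//   std::vector<part_t> p2t_xadj(num_processors + 1);
//   std::vector<part_t> p2t_adj(num_tasks);
//   coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
//       problemComm, proc_dim, num_processors, machine_coords,
//       task_dim, num_tasks, task_coords,
//       task_comm_xadj, task_comm_adj,
//       NULL,                       // no edge weights
//       p2t_xadj.data(), p2t_adj.data(),
//       recursion_depth, NULL, NULL);
//
// On return, p2t_xadj/p2t_adj hold the processor-to-task mapping in CSR
// form, independent of the mapper object's lifetime.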
// visualizes a given task-to-rank mapping on a 3-D machine as gnuplot
// arrows between nodes
template <typename proc_coord_t, typename v_lno_t>
void visualize_mapping(
    int myRank,
    const int machine_coord_dim,
    const int num_ranks,
    proc_coord_t **machine_coords,
    const v_lno_t num_tasks,
    const v_lno_t *task_communication_xadj,
    const v_lno_t *task_communication_adj,
    const int *task_to_rank) {

  std::string rankStr = Teuchos::toString<int>(myRank);
  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
  std::string outF = gnuPlots + rankStr + extentionS;
  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

  if (machine_coord_dim != 3) {
    std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
    return;
  }
  std::string ss = "";
  std::string procs = "";

  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;

  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
    int origin_rank = task_to_rank[origin_task];
    std::string gnuPlotArrow = "set arrow from ";

    for (int j = 0; j < machine_coord_dim; ++j) {
      if (j == machine_coord_dim - 1) {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
      }
      else {
        gnuPlotArrow +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + ",";
        procs +=
          Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
          + " ";
      }
    }
    procs += "\n";

    gnuPlotArrow += " to ";

    for (int nind = task_communication_xadj[origin_task];
         nind < task_communication_xadj[origin_task + 1]; ++nind) {

      int neighbor_task = task_communication_adj[nind];

      bool differentnode = false;
      int neighbor_rank = task_to_rank[neighbor_task];

      // only tasks on different nodes produce arrows
      for (int j = 0; j < machine_coord_dim; ++j) {
        if (int(machine_coords[j][origin_rank]) !=
            int(machine_coords[j][neighbor_rank])) {
          differentnode = true;
          break;
        }
      }

      std::tuple<int,int,int, int, int, int> foo(
          (int)(machine_coords[0][origin_rank]),
          (int)(machine_coords[1][origin_rank]),
          (int)(machine_coords[2][origin_rank]),
          (int)(machine_coords[0][neighbor_rank]),
          (int)(machine_coords[1][neighbor_rank]),
          (int)(machine_coords[2][neighbor_rank]));

      if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
        my_arrows.insert(foo);

        std::string gnuPlotArrow2 = "";
        for (int j = 0; j < machine_coord_dim; ++j) {
          if (j == machine_coord_dim - 1) {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
          }
          else {
            gnuPlotArrow2 +=
              Teuchos::toString<float>(machine_coords[j][neighbor_rank])
              + ",";
          }
        }
        ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
      }
    }
  }

  std::ofstream procFile("procPlot.plot", std::ofstream::out);
  procFile << procs << "\n";
  procFile.close();

  if (machine_coord_dim == 2) {
    gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
  }
  else {
    gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
  }

  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
  gnuPlotCode.close();
}
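// The function emits two artifacts per rank: gnuPlot<rank>.plot (one
// arrow per inter-node task edge, deduplicated through my_arrows) and
// procPlot.plot with the node coordinates; loading the .plot file in
// gnuplot overlays the communication arrows on the machine layout.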
// ...

#endif // _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_