#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#include "Teuchos_ArrayViewDecl.hpp"
#include "Teuchos_ReductionOp.hpp"
#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>
/*! \brief Zoltan2_ReduceBestMapping: reduces (cost, rank) pairs to the
 *  minimum-cost mapping, breaking ties with the minimum rank.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping : public Teuchos::ValueTypeReductionOp<Ordinal, T>
{
private:
  T _EPSILON;

public:
  /*! \brief Default Constructor. */
  Zoltan2_ReduceBestMapping() : _EPSILON(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface. */
  void reduce(
      const Ordinal count,
      const T inBuffer[],
      T inoutBuffer[]) const
  {
    for (Ordinal i = 0; i < count; i++){
      if (inBuffer[0] - inoutBuffer[0] < -_EPSILON){
        // incoming cost is strictly smaller: take its (cost, rank) pair
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
      else if (inBuffer[0] - inoutBuffer[0] < _EPSILON &&
               inBuffer[1] - inoutBuffer[1] < _EPSILON){
        // costs equal within _EPSILON: break the tie with the lower rank
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};
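// A minimal usage sketch (illustrative, not part of the original file):
// this reduction is how getBestMapping() below selects the lowest-cost
// mapping across a group of ranks, with ties going to the smaller rank.
//
//   double localCost[2] = { myCost, double(comm->getRank()) };
//   double globalCost[2] = { std::numeric_limits<double>::max(),
//                            std::numeric_limits<double>::max() };
//   Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   Teuchos::reduceAll<int, double>(*comm, reduceBest, 2,
//                                   localCost, globalCost);
//   // globalCost[0] is the minimum cost, globalCost[1] the winning rank.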
//! Computes x! recursively; expects x >= 1.
template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}
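// For example, z2Fact<int>(4) evaluates to 24. The recursion bottoms out at
// x == 1, so the argument must be at least 1.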
template <typename gno_t, typename part_t>
class GNO_LNO_PAIR{
public:
  gno_t gno;
  part_t part;
};
//! Returns the ith permutation of {0, ..., n-1} in lexicographic order.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  IT j, k = 0;
  IT *fact = new IT[n];
  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;
  // compute the factorial code of i
  for (k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }
  // readjust values to obtain the permutation: walk from the end and bump
  // entries whose predecessors are not larger
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;
  delete [] fact;
}
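// For example, with n = 3 and i = 4, the factorial code is (2, 0, 0) and the
// readjustment step yields perm = {2, 0, 1}, the fifth (0-based) permutation
// of {0, 1, 2} in lexicographic order.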
//! Builds a CRS communication graph for tasks laid out on a regular grid:
//! each task is connected to its (up to) 2 * dim axis neighbors.
template <typename part_t>
void getGridCommunicationGraph(
    part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims)
{
  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = allocMemory<part_t>(taskCount + 1);
  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i){
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j){
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      // a neighbor is valid only if it stays in range and within the same
      // slab of the current dimension (same quotient by prevDimMul)
      if (lNeighbor >= 0 && lNeighbor / prevDimMul == i / prevDimMul && lNeighbor < taskCount){
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 && rNeighbor / prevDimMul == i / prevDimMul && rNeighbor < taskCount){
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
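// A minimal usage sketch (illustrative): the communication graph of a 4x4
// grid of 16 tasks. Interior tasks get 4 neighbors, corner tasks get 2.
//
//   part_t *xadj = NULL, *adj = NULL;
//   std::vector<int> dims(2, 4);  // a 4x4 grid
//   getGridCommunicationGraph<part_t>(16, xadj, adj, dims);
//   // neighbors of task i: adj[xadj[i]] .. adj[xadj[i+1] - 1]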
//! Computes the center coordinate of each part of the solution.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters){

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;

  typedef StridedData<lno_t, scalar_t> input_t;
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);

  // local number of points in each part
  lno_t numLocalCoords = coords->getLocalNumCoordinates();
  gno_t *point_counts = allocMemory<gno_t>(ntasks);
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  // global number of points in each part
  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);

  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);

  for (int dim = 0; dim < coordDim; dim++){
    ArrayRCP<const scalar_t> ar;
    xyz[dim].getInputArray(ar);
    // coordinate values along this dimension
    multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  // add up the coordinates of the points in each part
  for (lno_t i = 0; i < numLocalCoords; i++){
    part_t p = parts[i];
    for (int j = 0; j < coordDim; ++j){
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  // obtain the global sums of the per-part point counts
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
      ntasks, point_counts, global_point_counts);

  // divide the local sums by the global counts
  for (int j = 0; j < coordDim; ++j){
    for (part_t i = 0; i < ntasks; ++i){
      partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
  for (int j = 0; j < coordDim; ++j){
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
        ntasks, partCenters[j], tmpCoords);

    // swap the reduced buffer into place and reuse the old one
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  freeArray<gno_t>(point_counts);
  freeArray<gno_t>(global_point_counts);

  freeArray<scalar_t>(tmpCoords);
  freeArray<scalar_t *>(multiJagged_coordinates);
}
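// Note the order of operations above: each rank divides its *local* part sums
// by the *global* point counts, so the final REDUCE_SUM over ranks directly
// yields the global averages. E.g., if part p has one point at x = 1 on rank
// 0 and one at x = 3 on rank 1, both ranks divide by the global count 2 and
// the sum reduction gives the true center (1/2 + 3/2) = 2.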
/*! \brief Coarsens the vertex adjacency graph into a part graph: vertices of
 *  the same part are merged, and parallel edges between two parts are
 *  combined with their weights summed.
 */
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew
    ){

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;
  typedef typename Zoltan2::GraphModel<
      typename Adapter::base_adapter_t>::input_t t_input_t;
  // get the vertices and the edges of the local graph
  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);

  // only the first edge weight is used for the coarsened graph
  std::vector<t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  if (numWeightPerEdge > 0){
    edge_weights = std::vector<t_scalar_t>(localNumEdges);
    for (t_lno_t i = 0; i < localNumEdges; ++i){
      edge_weights[i] = e_wgts[0][i];
    }
  }
295 std::vector<part_t> e_parts(localNumEdges);
296 #ifdef HAVE_ZOLTAN2_MPI
297 if (comm->getSize() > 1)
299 const bool bUseLocalIDs =
false;
302 const RCP<const Comm<int> > rcp_comm(comm,
false);
303 directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
304 directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
305 NULL, directory_t::Update_Mode::Replace);
306 directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
319 for (t_lno_t i = 0; i < localNumEdges; ++i){
320 t_gno_t ei = edgeIds[i];
  if (comm->getSize() < 2){
    // single rank: build the coarsened part graph directly.
    // part_begins/part_nexts chain the vertices of each part into lists.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    // cluster vertices according to their parts
    for (t_lno_t i = 0; i < localNumVertices; ++i){
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);

    part_t nindex = 0;
    g_part_xadj[0] = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    // coarsen the vertices of each part in order
    for (t_lno_t i = 0; i < np; ++i){
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      // walk the chain of vertices in part i
      while (v != -1){
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j){
          part_t ep = e_parts[j];
          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0){
            ew = edge_weights[j];
          }
          // record ep the first time it is seen as a neighbor of part i
          if (part_neighbor_weights[ep] < 0.00001){
            part_neighbors[num_neighbor_parts++] = ep;
          }
          part_neighbor_weights[ep] += ew;
        }
        v = part_nexts[v];
      }

      // flush part i's accumulated neighbor list into the CRS arrays
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
    return;
  }
  // user data for the distributed directory: a neighbor part and the
  // accumulated edge weight to it; AggregateAdd merges entries that compare
  // equal (same destination_part) with operator+=
  struct part_info {
    part_info() : weight(0) {
    }
    const part_info & operator+=(const part_info & src) {
      this->weight += src.weight;
      return *this;
    }
    bool operator>(const part_info & src) {
      return (destination_part > src.destination_part);
    }
    bool operator==(const part_info & src) {
      return (destination_part == src.destination_part);
    }
    part_t destination_part;
    t_scalar_t weight;
  };
  // distributed: merge each rank's local neighbor lists through a directory
  // whose user data is a vector of part_info entries
  bool bUseLocalIDs = false;
  const int debug_level = 0;
  typedef Zoltan2_Directory_Vector<part_t, t_lno_t, std::vector<part_info> >
      directory_t;
  const RCP<const Comm<int> > rcp_comm(comm, false);
  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
  std::vector<part_t> part_data;
  std::vector<std::vector<part_info> > user_data;

  // as in the serial case, chain the local vertices of each part
  std::vector<t_lno_t> part_begins(np, -1);
  std::vector<t_lno_t> part_nexts(localNumVertices, -1);

  for (t_lno_t i = 0; i < localNumVertices; ++i){
    part_t ap = parts[i];
    part_nexts[i] = part_begins[ap];
    part_begins[ap] = i;
  }

  std::vector<part_t> part_neighbors(np);
  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

  // accumulate the local (neighbor part, weight) pairs of each part
  for (t_lno_t i = 0; i < np; ++i){
    part_t num_neighbor_parts = 0;
    t_lno_t v = part_begins[i];
    while (v != -1){
      for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j){
        part_t ep = e_parts[j];
        t_scalar_t ew = 1;
        if (numWeightPerEdge > 0){
          ew = edge_weights[j];
        }
        // record ep the first time it is seen as a neighbor of part i
        if (part_neighbor_weights[ep] < 0.00001){
          part_neighbors[num_neighbor_parts++] = ep;
        }
        part_neighbor_weights[ep] += ew;
      }
      v = part_nexts[v];
    }

    // compact the weights in neighbor order and reset the accumulator
    for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
      part_t neighbor_part = part_neighbors[j];
      part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part];
      part_neighbor_weights[neighbor_part] = 0;
    }

    // register (part id -> local neighbor list) for the directory
    if (num_neighbor_parts > 0){
      part_data.push_back(i);
      std::vector<part_info> new_user_data(num_neighbor_parts);
      for (int q = 0; q < num_neighbor_parts; ++q) {
        part_info & info = new_user_data[q];
        info.weight = part_neighbor_weights_ordered[q];
        info.destination_part = part_neighbors[q];
      }
      user_data.push_back(new_user_data);
    }
  }

  std::vector<part_t> part_indices(np);
  for (part_t i = 0; i < np; ++i) part_indices[i] = i;

  // AggregateAdd sums the weights of entries with equal destination_part
  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
      NULL, directory_t::Update_Mode::AggregateAdd);

  // every rank retrieves the merged neighbor list of every part
  std::vector<std::vector<part_info> > find_user_data(part_indices.size());
  directory.find(find_user_data.size(), &part_indices[0], NULL,
      &find_user_data[0], NULL, NULL, false);
  // the merged neighbor lists define the final CRS size
  int get_total_length = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    get_total_length += find_user_data[n].size();
  }

  g_part_xadj = ArrayRCP<part_t>(np + 1);
  g_part_adj = ArrayRCP<part_t>(get_total_length);
  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

  // fill the CRS arrays from the directory results
  int track_insert_index = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    g_part_xadj[n] = track_insert_index;
    const std::vector<part_info> & user_data_vector = find_user_data[n];
    for (size_t q = 0; q < user_data_vector.size(); ++q) {
      const part_info & info = user_data_vector[q];
      g_part_adj[track_insert_index] = info.destination_part;
      g_part_ew[track_insert_index] = info.weight;
      ++track_insert_index;
    }
  }
  g_part_xadj[np] = get_total_length;
}
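// The coarsened graph is returned in CRS form: the parts adjacent to part p
// are g_part_adj[g_part_xadj[p] .. g_part_xadj[p+1] - 1], and g_part_ew holds
// the matching edge weights, aggregated over all ranks by the AggregateAdd
// directory mode above.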
/*! \brief KmeansHeap: a max-heap on distances, so it retains the minimum
 *  values seen so far.
 */
template <class IT, class WT>
class KmeansHeap{
  IT heapSize;
  IT *indices;
  WT *values;
  WT _EPSILON;

public:
  void setHeapsize(IT heapsize_){
    this->heapSize = heapsize_;
    this->indices = allocMemory<IT>(heapsize_);
    this->values = allocMemory<WT>(heapsize_);
  }

  ~KmeansHeap(){
    freeArray<IT>(this->indices);
    freeArray<WT>(this->values);
  }

  void addPoint(IT index, WT distance){
    WT maxVal = this->values[0];
    // the point is discarded if it is farther than the current root
    if (distance >= maxVal)
      return;
    this->values[0] = distance;
    this->indices[0] = index;
    this->push_down(0);
  }

  // heapify: push the value at index_on_heap down toward the leaves
  void push_down(IT index_on_heap){
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize){
      if (this->values[child_index1] < this->values[child_index2]){
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize){
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize){
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 && this->values[biggerIndex] > this->values[index_on_heap]){
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues(){
    WT MAXVAL = std::numeric_limits<WT>::max();
    for (IT i = 0; i < this->heapSize; ++i){
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  // returns the total distance of the held points to the cluster center
  WT getTotalDistance(){
    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j){
      nc += this->values[j];
    }
    return nc;
  }

  // recomputes the center as the mean of the held points; returns whether
  // any coordinate moved by more than _EPSILON
  bool getNewCenters(WT *center, WT **coords, int dimension){
    bool moved = false;
    for (int i = 0; i < dimension; ++i){
      WT nc = 0;
      for (IT j = 0; j < this->heapSize; ++j){
        IT k = this->indices[j];
        nc += coords[i][k];
      }
      nc /= this->heapSize;
      moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation){
    for (IT i = 0; i < this->heapSize; ++i){
      permutation[i] = this->indices[i];
    }
  }
};
template <class IT, class WT>
class KMeansCluster{
  int dimension;
  KmeansHeap<IT, WT> closestPoints;

public:
  WT *center;

  void setParams(int dimension_, int heapsize){
    this->dimension = dimension_;
    this->center = allocMemory<WT>(dimension_);
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap(){
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords){
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  // returns the distance of the point to the center, and pushes the point
  // onto the heap of closest points
  WT getDistance(IT index, WT **elementCoords){
    WT distance = 0;
    for (int i = 0; i < this->dimension; ++i){
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    // note: the exponent is 1/dimension rather than 1/2
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);
    return distance;
  }

  WT getDistanceToCenter(){
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation){
    closestPoints.copyCoordinates(permutation);
  }
};
/*! \brief KMeansAlgorithm: clusters the coordinates and returns the cluster
 *  whose points are closest to its center.
 */
template <class IT, class WT>
class KMeansAlgorithm{
  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT, WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;

public:
  ~KMeansAlgorithm(){
    freeArray<KMeansCluster<IT, WT> >(clusters);
    freeArray<WT>(maxCoordinates);
    freeArray<WT>(minCoordinates);
  }

  /*! \brief KMeansAlgorithm Constructor.
   */
  KMeansAlgorithm(
      int dim_,
      IT numElements_,
      WT **elementCoords_,
      IT required_elements_):
        dim(dim_),
        numElements(numElements_),
        elementCoords(elementCoords_),
        numClusters((1 << dim_) + 1),
        required_elements(required_elements_)
  {
    this->clusters = allocMemory<KMeansCluster<IT, WT> >(this->numClusters);
    // set the dimension and the required point count of every cluster
    for (int i = 0; i < numClusters; ++i){
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = allocMemory<WT>(this->dim);
    this->minCoordinates = allocMemory<WT>(this->dim);

    // obtain the min and max coordinate along each dimension
    for (int j = 0; j < dim; ++j){
      this->minCoordinates[j] = this->maxCoordinates[j] = this->elementCoords[j][0];
      for (IT i = 1; i < numElements; ++i){
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]){
          this->maxCoordinates[j] = t;
        }
        if (t < minCoordinates[j]){
          this->minCoordinates[j] = t;
        }
      }
    }

    // place the first 2^dim cluster centers at the bounding-box corners
    for (int j = 0; j < dim; ++j){
      int mod = (1 << (j+1));
      for (int i = 0; i < numClusters - 1; ++i){
        WT c;
        if ((i % mod) < mod / 2){
          c = this->maxCoordinates[j];
        }
        else {
          c = this->minCoordinates[j];
        }
        this->clusters[i].center[j] = c;
      }
    }

    // and the last cluster center at the middle of the bounding box
    for (int j = 0; j < dim; ++j){
      this->clusters[numClusters - 1].center[j] = (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }
  }

  // performs kmeans clustering of the coordinates
  void kmeans(){
    for (int it = 0; it < 10; ++it){
      for (IT j = 0; j < this->numClusters; ++j){
        this->clusters[j].clearHeap();
      }
      for (IT i = 0; i < this->numElements; ++i){
        // each cluster pushes point i onto its heap of closest points
        for (IT j = 0; j < this->numClusters; ++j){
          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }
      bool moved = false;
      for (IT j = 0; j < this->numClusters; ++j){
        moved = (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved){
        break;
      }
    }
  }

  // finds the cluster whose points are closest to its center and copies
  // their indices into procPermutation
  void getMinDistanceCluster(IT *procPermutation){
    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

    for (IT j = 1; j < this->numClusters; ++j){
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();
      if (minTmpDistance < minDistance){
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
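// A minimal usage sketch (illustrative): select the `ntasks` processors whose
// machine coordinates cluster most tightly, as getClosestSubset() below does.
//
//   KMeansAlgorithm<part_t, pcoord_t> kma(dim, nprocs, proc_coords, ntasks);
//   kma.kmeans();                            // at most 10 refinement rounds
//   kma.getMinDistanceCluster(permutation);  // ids of the tightest cluster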
#define MINOF(a,b) (((a)<(b))?(a):(b))
/*! \brief fillContinousArray function: fills arr with *val, or with
 *  0..arrSize-1 when val is NULL.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val){
  if (val == NULL){
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i){
      arr[i] = i;
    }
  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i){
      arr[i] = v;
    }
  }
}
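// Usage note: with val == NULL the array becomes 0, 1, 2, ... (an identity
// permutation, used below for the initial proc/task visit orders); with a
// non-NULL val every entry is set to *val, which getMapping() uses to
// zero-initialize proc_to_task_xadj.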
/*! \brief CommunicationModel: base class that performs mapping between the
 *  coordinate partitioning result and the processors.
 */
template <typename part_t, typename pcoord_t>
class CommunicationModel{
protected:
  double commCost;

public:
  part_t no_procs; // the number of processors
  part_t no_tasks; // the number of tasks
  // ...

  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight){

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task){
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2){
        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = getProcDistance(assigned_proc, neighborProc);

        // unweighted edges cost their distance; weighted edges scale the
        // distance by the communication volume
        if (task_communication_edge_weight == NULL){
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }

    this->commCost = totalCost;
  }
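  // In effect the accumulated metric is
  //   totalCost = sum over directed task pairs (t, n) of
  //               dist(proc(t), proc(n)) * w(t, n),
  // with w(t, n) = 1 when no edge weights are supplied, and dist() the hop
  // count (or coordinate distance) between the assigned processors.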
  /*! \brief Pure virtual: performs the mapping of tasks to processors.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const = 0;
};
/*! \brief CoordinateCommunicationModel: performs the mapping between the
 *  coordinate partitioning result and the processors, using the machine
 *  coordinates and the task (part center) coordinates.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t>
class CoordinateCommunicationModel :
    public CommunicationModel<part_t, pcoord_t> {
public:
  // ...

  /*! \brief Class Constructor.
   */
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t, part_t> *machine_ = NULL)
      // ... (member initializers and constructor body elided)
  /*! \brief Called whenever nprocs > no_tasks; returns the subset of
   *  processors that are closest to each other.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const {
    // cluster the machine coordinates and keep the tightest cluster
    part_t minCoordDim = MINOF(this->proc_coord_dim, this->task_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
        minCoordDim, nprocs,
        this->proc_coords, ntasks);
    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    // mark the processors that are left out
    for (int i = ntasks; i < nprocs; ++i){
      proc_permutation[i] = -1;
    }
  }
  // Park-Miller style linear congruential generator via Schrage's method
  // (the standard minimal-standard constants); returns a value in [0, l)
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
  {
    int a = 16807, m = 2147483647, q = 127773, r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;
    d = (double)((double)_u_umpa_seed / (double)m);
    return (part_t)(d * (double)l);
  }
  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    // ... (without a machine model, the distance is computed from
    //      proc_coords, honoring machine_extent wrap-around; with one,
    //      the machine reports the hop count directly)
    this->machine->getHopCount(procId1, procId2, distance);
    return distance;
  }
  // Excerpt of update_visit_order(visitOrder, n, _u_umpa_seed, rndm), which
  // shuffles the visit order with umpa_uRandom. The random mode swaps
  // entries at every 16th position:
  //   for (i = 0; i < n; i += 16) { ... }
  // and the other mode applies a partial random permutation pass:
  //   for (i = 1; i < end; i++) { ... }
  /*! \brief Maps the processors to the tasks; results are returned through
   *  the ArrayRCP output parameters.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj,
      ArrayRCP<part_t> &rcp_proc_to_task_adj,
      ArrayRCP<part_t> &rcp_task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const {

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    // raw pointers to the output arrays
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    // initialize the per-processor task counts to zero
    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs + 1, &invalid);
    // estimate the recursion depth from the processor count; dividing to
    // primes first may need roughly twice as many levels in the worst case
    if (this->divide_to_prime_first){
      recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
    }
    else {
      recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
    }

    // number of coordinate-dimension orderings to try for tasks and procs
    int taskPerm = z2Fact<int>(this->task_coord_dim);
    int procPerm = z2Fact<int>(this->proc_coord_dim);
    int permutations = taskPerm * procPerm;
    // plus the runs where one side is cut along its longest dimension
    permutations += taskPerm;
    permutations += procPerm;
    // plus the run where both sides are cut along their longest dimension
    permutations += 1;
    // holds the processors in parts, and the part boundaries
    part_t *proc_xadj = allocMemory<part_t>(num_parts + 1);
    part_t *proc_adjList = allocMemory<part_t>(this->no_procs);
    // ...
    if (this->no_procs > this->no_tasks){
      // more processors than tasks: restrict the mapping to the subset of
      // processors closest to each other (see getClosestSubset)
      // ...
    }
    else {
      // all processors are used; visit them in natural order
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }
    // each rank in the group tries a different permutation combination
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0){
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm){
        proc_partition_along_longest_dim = true;
        myTaskPerm = myPermutation; // the task dimension ordering to use
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm){
          task_partition_along_longest_dim = true;
          myProcPerm = myPermutation; // the proc dimension ordering to use
        }
        else {
          myPermutation -= procPerm;
          // both sides use specific dimension orderings
          myProcPerm = myPermutation % procPerm;
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }
    // obtain this rank's ordering of the machine coordinate dimensions
    ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    // ... (the machine coordinates are rewritten in the permuted order)

    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // ... sequential multi-jagged partitioning of the processors into
    //     num_parts parts (see sequential_task_partitioning); the flag
    //       proc_partition_along_longest_dim
    //     asks MJ to cut along the longest dimension instead of the
    //     permuted dimension order
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // partition the tasks into num_parts parts in the same way
    part_t *task_xadj = allocMemory<part_t>(num_parts + 1);
    part_t *task_adjList = allocMemory<part_t>(this->no_tasks);
    // tasks are visited in natural order initially
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    // obtain this rank's ordering of the task coordinate dimensions
    ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);

    // the task coordinates, rewritten in the permuted order
    tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
    // ...

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
    // ... sequential multi-jagged partitioning of the tasks; among its
    //     arguments are
    //       this->task_coord_dim,
    //     and the flag
    //       task_partition_along_longest_dim
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

    // release the memory that was allocated for the permuted coordinates
    freeArray<tcoord_t *>(tcoords);
    freeArray<int>(permutation);
    // assign the tasks of each part to the single processor of that part
    for (part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i+1];
      part_t task_end_index = task_xadj[i+1];

      if (proc_index_end - proc_index_begin != 1){
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to "
            << proc_index_end - proc_index_begin << " processors." << std::endl;
        // ... (fatal: each part must receive exactly one processor)
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    // convert the per-processor task counts into CRS begin offsets; the
    // work array keeps the end offsets for the back-to-front fill below
    part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i){
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    for (part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i+1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j){
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        // fill from the back; the work array ends at the begin offsets
        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }
    // ...
    freeArray<part_t>(proc_to_task_xadj_work);
    freeArray<part_t>(task_xadj);
    freeArray<part_t>(task_adjList);
    freeArray<part_t>(proc_xadj);
    freeArray<part_t>(proc_adjList);
  }
/*! \brief CoordinateTaskMapper: maps the coordinate partitioning result to
 *  the MPI ranks, minimizing the communication cost.
 */
template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;

#endif // DOXYGEN_SHOULD_SKIP_THIS
  /*! \brief doMapping function: calls the getMapping function of the
   *  communicationModel object.
   */
  void doMapping(
      int myRank,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_){

    if (this->proc_task_comm){
      this->proc_task_comm->getMapping(
          myRank,
          this->env,
          this->proc_to_task_xadj, // begin offsets of the tasks of each proc
          this->proc_to_task_adj,  // the tasks, indexed by proc_to_task_xadj
          this->task_to_proc,      // the processor mapped to each task
          comm_);
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper" << std::endl;
      exit(1);
    }
  }
  /*! \brief Creates and returns the subcommunicator for this rank's
   *  permutation group.
   */
  RCP<Comm<int> > create_subCommunicator(){
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    // number of dimension orderings to try for tasks and processors
    int taskPerm = z2Fact<int>(procDim);
    int procPerm = z2Fact<int>(taskDim);
    // all combinations, plus the longest-dimension variants for either
    // side alone and for both sides together
    int idealGroupSize = taskPerm * procPerm;
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    // the group is [myGroupBegin, myGroupEnd); boundary groups are merged
    // so no group extends past commSize
    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    if (myGroupEnd > commSize){
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize){
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = allocMemory<part_t>(myGroupSize);
    for (int i = 0; i < myGroupSize; ++i){
      myGroup[i] = myGroupBegin + i;
    }

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
        this->comm->createSubcommunicator(myGroupView);
    freeArray<part_t>(myGroup);
    return subComm;
  }
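  // For example, with 3-dimensional machine and task coordinates,
  // taskPerm = procPerm = 3! = 6, so idealGroupSize = 6*6 + 6 + 6 + 1 = 49:
  // each group of 49 consecutive ranks covers every dimension-ordering
  // combination plus the longest-dimension variants exactly once.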
  /*! \brief Finds the lowest-cost mapping within the group and broadcasts
   *  the winner's solution to everyone.
   */
  void getBestMapping(){
    // create the sub group
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    // calculate this rank's mapping cost
    double myCost = this->proc_task_comm->getCommunicationCostMetric();

    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
        2, localCost, globalCost);

    int sender = int(globalCost[1]);
    // ... (the winning rank broadcasts its mapping arrays to the group)
  }
  // writes the proc/task coordinates and the mapping arrows to a gnuplot
  // script plus one data file per processor
  void writeMapping(){
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim, proc_task_comm->task_coord_dim);
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i){

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0){
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          // no trailing comma after the last coordinate
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
        }
        else {
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]) + ",";
        }
        // ... (the proc coordinates are also written to inpFile)
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      for (int k = 0; k < a.size(); ++k){
        part_t j = a[k];
        // an arrow from processor i to each of its assigned tasks
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z){
          if (z == mindim - 1){
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else {
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  // writes a gnuplot file visualizing the machine, with arrows between the
  // ranks of communicating tasks that sit on different nodes
  void writeMapping2(int myRank){
    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr + extentionS;
    std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

    // view of the communication model, for its coordinate arrays
    CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> *tmpproc_task_comm =
        this->proc_task_comm;

    int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
        tmpproc_task_comm->task_coord_dim);
    if (mindim != 3) {
      std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
      return;
    }
    std::string ss = "";
    std::string procs = "";

    // remember drawn inter-node arrows to avoid duplicates
    std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
    for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank){
      ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);
        }
        else {
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]) + ",";
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]) + " ";
        }
      }
      procs += "\n";

      gnuPlotArrow += " to ";

      for (int k = 0; k < a.size(); ++k){
        int origin_task = a[k];

        for (int nind = task_communication_xadj[origin_task];
             nind < task_communication_xadj[origin_task + 1]; ++nind){
          int neighbor_task = task_communication_adj[nind];

          bool differentnode = false;
          int neighbor_rank = this->getAssignedProcForTask(neighbor_task);

          // only draw arrows between ranks on different nodes
          for (int j = 0; j < mindim; ++j){
            if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
                int(tmpproc_task_comm->proc_coords[j][neighbor_rank])){
              differentnode = true;
              break;
            }
          }
          std::tuple<int, int, int, int, int, int> foo(
              int(tmpproc_task_comm->proc_coords[0][origin_rank]),
              int(tmpproc_task_comm->proc_coords[1][origin_rank]),
              int(tmpproc_task_comm->proc_coords[2][origin_rank]),
              int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
              int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
              int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));

          if (differentnode && my_arrows.find(foo) == my_arrows.end()){
            my_arrows.insert(foo);

            std::string gnuPlotArrow2 = "";
            for (int j = 0; j < mindim; ++j){
              if (j == mindim - 1){
                gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]);
              }
              else {
                gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]) + ",";
              }
            }
            ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
          }
        }
      }
    }

    std::ofstream procFile("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    procFile.close();

    // a 2d plot for 2d machines, a 3d splot otherwise
    if (mindim == 2){
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    }
    else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }

    gnuPlotCode << ss << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
  // writes the partition boxes and the arrows between communicating part
  // centers to gnuplot files
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      const Zoltan2::PartitioningSolution<Adapter> *soln_,
      int coordDim,
      tcoord_t **partCenters){

    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());

    // ... (obtain the part boxes outPartBoxes from the solution)
    for (part_t i = 0; i < this->ntasks; ++i){
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2){
      ff << "plot \"2d.txt\"" << std::endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
    }

    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1" << std::endl;

    for (part_t i = 0; i < this->ntasks; ++i){
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i + 1];
      for (part_t p = pb; p < pe; ++p){
        part_t n = task_communication_adj[p];

        // an arrow from the center of part n to the center of part i
        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) + " to ";

        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) + " as 5\n";

        ff << arrowline;
      }
    }
    ff << "replot\n pause -1" << std::endl;
    ff.close();
    mm.close();
  }
  // maps each local coordinate's part to its assigned rank
  void create_local_task_to_rank(
      const lno_t num_local_coords,
      const part_t *local_coord_parts,
      const ArrayRCP<part_t> task_to_proc_){
    local_task_to_rank = ArrayRCP<part_t>(num_local_coords);

    for (lno_t i = 0; i < num_local_coords; ++i){
      part_t local_coord_part = local_coord_parts[i];
      part_t rank_index = task_to_proc_[local_coord_part];
      local_task_to_rank[i] = rank_index;
    }
  }
  /*! \brief Constructor. When this constructor is called, the task adjacency
   *  graph is coarsened to the part graph in order to calculate the
   *  communication metric.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> > machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const Teuchos::RCP<const Zoltan2::PartitioningSolution<Adapter> > soln_,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
        PartitionMapping<Adapter>(comm_, machine_, input_adapter_, soln_, envConst)
        // ... (remaining member initializers elided)
  {
    using namespace Teuchos;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed){
      // a serial communicator is used when the input adapter is local
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
        rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()),
            false));

    // flags for the coordinate and graph models
    modelFlag_t coordFlags_, graphFlags_;

    // ...
    // create the coordinate model from the base adapter
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, coordFlags_));
    // ...
    // create the graph model when the adapter provides a graph
    graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, graphFlags_));
    // ...

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide coordinates "
          "for coordinate task mapping");
    }

    // get the processor coordinates
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
          "getAllMachineCoordinatesView");
    }

    // get the machine extent and wrap-around information
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    for (int i = 0; i < procDim; ++i)
      machine_extent_wrap_around[i] = false;
    machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    // if the machine has a wrap-around topology, shift the coordinates so
    // that the used nodes become contiguous (see shiftMachineCoordinates)
    if (machine_->getMachineExtent(machine_extent)) {
      procCoordinates = this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }

    // get the coordinate dimension of the tasks
    int coordDim = coordinateModel_->getCoordinateDim();
    // ... (this->ntasks is set from the solution's number of parts)

    // allocate the storage for the part centers
    tcoord_t **partCenters = NULL;
    partCenters = allocMemory<tcoord_t *>(coordDim);
    for (int i = 0; i < coordDim; ++i){
      partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // compute the center coordinate of each part
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        coordinateModel_.getRawPtr(),
        this->solution_parts,
        coordDim,
        this->ntasks,
        partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // coarsen the part graph when a graph model is available
    if (graph_model_.getRawPtr() != NULL){
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
          envConst.getRawPtr(),
          ia_comm.getRawPtr(),
          graph_model_.getRawPtr(),
          this->ntasks,
          this->solution_parts,
          this->task_communication_xadj,
          this->task_communication_adj,
          this->task_communication_edge_weight);
    }

    // create the communication model that maps the part centers onto the
    // machine coordinates
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());

    int myRank = comm_->getRank();
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

#ifdef gnuPlot
    if (comm_->getRank() == 0){

      part_t taskCommCount = task_communication_xadj.size();
      part_t maxN = 0;
      for (part_t i = 1; i <= taskCommCount; ++i){
        part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
        if (maxN < nc) maxN = nc;
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
#endif

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ... (calculateCommunicationCost over the mapped tasks, and
    //      getBestMapping() when reduce_best_mapping is enabled)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    // store the mapping of the local coordinates to ranks
    this->create_local_task_to_rank(
        coordinateModel_->getLocalNumCoordinates(),
        this->solution_parts,
        this->task_to_proc);

    // cleanup
    delete [] machine_extent_wrap_around;
    if (machine_->getMachineExtent(machine_extent)){
      // shiftMachineCoordinates allocated fresh arrays; free them
      for (int i = 0; i < procDim; ++i){
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i){
      freeArray<tcoord_t>(partCenters[i]);
    }
    freeArray<tcoord_t *>(partCenters);
  }
  /*! \brief Constructor. Instead of a Solution, this version takes the
   *  number of parts and the part array directly.
   */
  CoordinateTaskMapper(
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_,
      const Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> > machine_,
      const Teuchos::RCP<const Adapter> input_adapter_,
      const part_t num_parts_,
      const part_t *result_parts,
      const Teuchos::RCP<const Environment> envConst,
      bool is_input_adapter_distributed = true,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
        PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
            num_parts_, result_parts, envConst)
        // ... (remaining member initializers elided)
  {
    using namespace Teuchos;

    RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
    RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_;

    RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
    RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
    if (!is_input_adapter_distributed){
      ia_comm = Teuchos::createSerialComm<int>();
    }

    RCP<const Environment> envConst_ = envConst;

    RCP<const ctm_base_adapter_t> baseInputAdapter_(
        rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()),
            false));

    // flags for the coordinate and graph models
    modelFlag_t coordFlags_, graphFlags_;

    // ...
    // create the coordinate model from the base adapter
    coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, coordFlags_));
    // ...
    // create the graph model when the adapter provides a graph
    graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
        baseInputAdapter_, envConst_, ia_comm, graphFlags_));
    // ...

    if (!machine_->hasMachineCoordinates()) {
      throw std::runtime_error("Existing machine does not provide coordinates "
          "for coordinate task mapping");
    }

    // get the processor coordinates
    int procDim = machine_->getMachineDim();
    this->nprocs = machine_->getNumRanks();

    pcoord_t **procCoordinates = NULL;
    if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
      throw std::runtime_error("Existing machine does not implement "
          "getAllMachineCoordinatesView");
    }

    // get the machine extent and wrap-around information
    std::vector<int> machine_extent_vec(procDim);
    int *machine_extent = &(machine_extent_vec[0]);
    bool *machine_extent_wrap_around = new bool[procDim];
    machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);

    // if the machine has a wrap-around topology, shift the coordinates so
    // that the used nodes become contiguous
    if (machine_->getMachineExtent(machine_extent)) {
      procCoordinates = this->shiftMachineCoordinates(
          procDim,
          machine_extent,
          machine_extent_wrap_around,
          this->nprocs,
          procCoordinates);
    }

    // get the coordinate dimension and the number of tasks
    int coordDim = coordinateModel_->getCoordinateDim();
    this->ntasks = num_parts_;

    // allocate the storage for the part centers
    tcoord_t **partCenters = NULL;
    partCenters = allocMemory<tcoord_t *>(coordDim);
    for (int i = 0; i < coordDim; ++i){
      partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
    }

    typedef typename Adapter::scalar_t t_scalar_t;

    envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");

    // compute the center coordinate of each part
    getSolutionCenterCoordinates<Adapter, t_scalar_t, part_t>(
        envConst.getRawPtr(),
        ia_comm.getRawPtr(),
        coordinateModel_.getRawPtr(),
        this->solution_parts,
        coordDim,
        this->ntasks,
        partCenters);

    envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");

    // coarsen the part graph when a graph model is available
    if (graph_model_.getRawPtr() != NULL){
      getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
          envConst.getRawPtr(),
          ia_comm.getRawPtr(),
          graph_model_.getRawPtr(),
          this->ntasks,
          this->solution_parts,
          this->task_communication_xadj,
          this->task_communication_adj,
          this->task_communication_edge_weight);
    }

    envConst->timerStart(MACRO_TIMERS, "CoordinateCommunicationModel Create");

    // create the communication model that maps the part centers onto the
    // machine coordinates
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
        procDim,
        procCoordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks,
        machine_extent,
        machine_extent_wrap_around,
        machine_.getRawPtr());

    envConst->timerStop(MACRO_TIMERS, "CoordinateCommunicationModel Create");

    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = comm_->getRank();

    envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
    this->doMapping(myRank, comm_);
    envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
    // ...
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");

#ifdef gnuPlot
    if (comm_->getRank() == 0){

      part_t taskCommCount = task_communication_xadj.size();
      part_t maxN = 0;
      for (part_t i = 1; i <= taskCommCount; ++i){
        part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
        if (maxN < nc) maxN = nc;
      }
      std::cout << " maxNeighbor:" << maxN << std::endl;
    }

    this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
#endif

    envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
    // ... (calculateCommunicationCost over the mapped tasks, and
    //      getBestMapping() when reduce_best_mapping is enabled)
    envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");

    // store the mapping of the local coordinates to ranks
    this->create_local_task_to_rank(
        coordinateModel_->getLocalNumCoordinates(),
        this->solution_parts,
        this->task_to_proc);

    // cleanup
    delete [] machine_extent_wrap_around;
    if (machine_->getMachineExtent(machine_extent)){
      for (int i = 0; i < procDim; ++i){
        delete [] procCoordinates[i];
      }
      delete [] procCoordinates;
    }

    for (int i = 0; i < coordDim; ++i){
      freeArray<tcoord_t>(partCenters[i]);
    }
    freeArray<tcoord_t *>(partCenters);
  }
  /*! \brief Constructor. This mapping constructor also performs the mapping
   *  operation; results can be queried with getAssignedProcForTask (the
   *  processor of a given task) and getPartsForProc (the tasks of a given
   *  processor).
   */
  CoordinateTaskMapper(
      const Environment *env_const_,
      const Teuchos::Comm<int> *problemComm,
      int proc_dim,
      int num_processors,
      pcoord_t **machine_coords,
      int task_dim,
      part_t num_tasks,
      tcoord_t **task_coords,
      ArrayRCP<part_t> task_comm_xadj,
      ArrayRCP<part_t> task_comm_adj,
      pcoord_t *task_communication_edge_weight_,
      int recursion_depth,
      part_t *part_no_array,
      const part_t *machine_dimensions,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false,
      bool reduce_best_mapping = true):
        PartitionMapping<Adapter>(
            Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
            Teuchos::rcpFromRef<const Environment>(*env_const_))
        // ... (remaining member initializers elided)
  {
    // if actual machine dimensions are provided, shift the coordinates of
    // wrap-around dimensions so that the allocation becomes contiguous
    pcoord_t **virtual_machine_coordinates = machine_coords;
    bool *wrap_arounds = new bool[proc_dim];
    for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;

    if (machine_dimensions){
      virtual_machine_coordinates =
          this->shiftMachineCoordinates(
              proc_dim,
              machine_dimensions,
              wrap_arounds,
              num_processors,
              machine_coords);
    }

    this->nprocs = num_processors;

    int coordDim = task_dim;
    this->ntasks = num_tasks;

    // the task coordinates serve directly as the part centers
    tcoord_t **partCenters = task_coords;

    // create the communication model
    this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
        proc_dim,
        virtual_machine_coordinates,
        // ... (task dimension, part centers, counts, extent information)
        );
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    int myRank = problemComm->getRank();

    this->doMapping(myRank, this->comm);
    // ...
    this->proc_task_comm->calculateCommunicationCost(
        this->task_to_proc.getRawPtr(),
        this->task_communication_xadj.getRawPtr(),
        this->task_communication_adj.getRawPtr(),
        task_communication_edge_weight_);
    // ...
    delete [] wrap_arounds;

    if (machine_dimensions){
      for (int i = 0; i < proc_dim; ++i){
        delete [] virtual_machine_coordinates[i];
      }
      delete [] virtual_machine_coordinates;
    }
#ifdef gnuPlot
    if (problemComm->getRank() == 0)
      writeMapping2(myRank);
#endif
  }
  /*! \brief Using the machine dimensions provided, create virtual machine
   *  coordinates by shifting the largest gap of each wrap-around dimension
   *  past the end, so that the allocated nodes form a contiguous block.
   */
  pcoord_t **shiftMachineCoordinates(
      int machine_dim,
      const part_t *machine_dimensions,
      bool *machine_extent_wrap_around,
      part_t numProcs,
      pcoord_t **mCoords){

    pcoord_t **result_machine_coords = NULL;
    result_machine_coords = new pcoord_t*[machine_dim];
    for (int i = 0; i < machine_dim; ++i){
      result_machine_coords[i] = new pcoord_t[numProcs];
    }

    for (int i = 0; i < machine_dim; ++i){
      part_t numMachinesAlongDim = machine_dimensions[i];

      // number of processors at each coordinate along this dimension
      part_t *machineCounts = new part_t[numMachinesAlongDim];
      memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);

      // the coordinates along this dimension that actually hold processors
      int *filledCoordinates = new int[numMachinesAlongDim];

      pcoord_t *coords = mCoords[i];
      for (part_t j = 0; j < numProcs; ++j){
        part_t mc = (part_t)coords[j];
        ++machineCounts[mc];
      }

      part_t filledCoordinateCount = 0;
      for (part_t j = 0; j < numMachinesAlongDim; ++j){
        if (machineCounts[j] > 0){
          filledCoordinates[filledCoordinateCount++] = j;
        }
      }

      part_t firstProcCoord = filledCoordinates[0];
      part_t firstProcCount = machineCounts[firstProcCoord];

      part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
      part_t lastProcCount = machineCounts[lastProcCoord];

      // the gap between the last and the first used coordinate, going
      // through the wrap-around link
      part_t firstLastGap = numMachinesAlongDim - lastProcCoord + firstProcCoord;
      part_t firstLastGapProc = lastProcCount + firstProcCount;

      part_t leftSideProcCoord = firstProcCoord;
      part_t leftSideProcCount = firstProcCount;

      part_t biggestGap = 0;
      part_t biggestGapProc = numProcs;

      // find the largest interior gap; ties break toward fewer processors
      // bordering the gap
      part_t shiftBorderCoordinate = -1;
      for (part_t j = 1; j < filledCoordinateCount; ++j){
        part_t rightSideProcCoord = filledCoordinates[j];
        part_t rightSideProcCount = machineCounts[rightSideProcCoord];

        part_t gap = rightSideProcCoord - leftSideProcCoord;
        part_t gapProc = rightSideProcCount + leftSideProcCount;

        if (gap > biggestGap || (gap == biggestGap && biggestGapProc > gapProc)){
          shiftBorderCoordinate = rightSideProcCoord;
          biggestGapProc = gapProc;
          biggestGap = gap;
        }
        leftSideProcCoord = rightSideProcCoord;
        leftSideProcCount = rightSideProcCount;
      }

      // shift only if the interior gap beats the wrap-around gap
      if (!(biggestGap > firstLastGap ||
            (biggestGap == firstLastGap && biggestGapProc < firstLastGapProc))){
        shiftBorderCoordinate = -1;
      }

      for (part_t j = 0; j < numProcs; ++j){
        if (machine_extent_wrap_around[i] && coords[j] < shiftBorderCoordinate){
          result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
        }
        else {
          result_machine_coords[i][j] = coords[j];
        }
      }
      delete [] machineCounts;
      delete [] filledCoordinates;
    }

    return result_machine_coords;
  }
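  // Worked example (illustrative): suppose a wrap-around dimension has
  // extent 10 and the allocated nodes sit at coordinates {0, 1, 8, 9}. The
  // largest interior gap lies between 1 and 8, so shiftBorderCoordinate = 8
  // and every coordinate below 8 is shifted up by the extent, giving
  // {10, 11, 8, 9}. The group is now contiguous (8..11), so coordinate
  // distances reflect actual hop counts across the torus.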
  /*! \brief Returns the number of tasks and the task list assigned to the
   *  given processor.
   */
  virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const {
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];
    parts = this->proc_to_task_adj.getRawPtr() + task_begin;
    numParts = taskend - task_begin;
  }

  ArrayView<part_t> getAssignedTasksForProc(part_t procId){
    part_t task_begin = this->proc_to_task_xadj[procId];
    part_t taskend = this->proc_to_task_xadj[procId + 1];

    if (taskend - task_begin > 0){
      ArrayView<part_t> assignedParts(
          this->proc_to_task_adj.getRawPtr() + task_begin,
          taskend - task_begin);
      return assignedParts;
    }
    else {
      ArrayView<part_t> assignedParts;
      return assignedParts;
    }
  }
/*! \brief The interface function that constructs a CoordinateTaskMapper,
 *  which performs the mapping; the result is copied into the caller's
 *  preallocated proc_to_task arrays.
 */
template <typename part_t, typename pcoord_t, typename tcoord_t>
void coordinateTaskMapperInterface(
    RCP<const Teuchos::Comm<int> > problemComm,
    int proc_dim,
    int num_processors,
    pcoord_t **machine_coords,
    int task_dim,
    part_t num_tasks,
    tcoord_t **task_coords,
    part_t *task_comm_xadj,
    part_t *task_comm_adj,
    pcoord_t *task_communication_edge_weight_,
    part_t *proc_to_task_xadj,
    part_t *proc_to_task_adj,
    int recursion_depth,
    part_t *part_no_array,
    const part_t *machine_dimensions,
    int num_ranks_per_node = 1,
    bool divide_to_prime_first = false){

  // ... (an Environment env_const_ is created for this call)
  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;

  // wrap the raw communication graph arrays without taking ownership
  Teuchos::ArrayRCP<part_t> task_communication_xadj(
      task_comm_xadj, 0, num_tasks + 1, false);

  Teuchos::ArrayRCP<part_t> task_communication_adj;
  if (task_comm_xadj){
    Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
        task_comm_adj, 0, task_comm_xadj[num_tasks], false);
    task_communication_adj = tmp_task_communication_adj;
  }

  // construct the mapper; its constructor performs the mapping
  typedef Zoltan2::XpetraMultiVectorAdapter<tMVector_t> inputAdapter_t;
  CoordinateTaskMapper<inputAdapter_t, part_t> *ctm =
      new CoordinateTaskMapper<inputAdapter_t, part_t>(
          env_const_,
          problemComm.getRawPtr(),
          proc_dim,
          num_processors,
          machine_coords,
          task_dim,
          num_tasks,
          task_coords,
          task_communication_xadj,
          task_communication_adj,
          task_communication_edge_weight_,
          recursion_depth,
          part_no_array,
          machine_dimensions,
          num_ranks_per_node,
          divide_to_prime_first);

  // copy the mapping into the caller's output arrays
  part_t *proc_to_task_xadj_;
  part_t *proc_to_task_adj_;

  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);

  for (part_t i = 0; i <= num_processors; ++i){
    proc_to_task_xadj[i] = proc_to_task_xadj_[i];
  }
  for (part_t i = 0; i < num_tasks; ++i){
    proc_to_task_adj[i] = proc_to_task_adj_[i];
  }
  delete ctm;
}
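// A minimal usage sketch (illustrative, values are placeholders): mapping 8
// tasks onto 4 processors from raw coordinate arrays. proc_to_task_xadj and
// proc_to_task_adj must be preallocated by the caller with
// num_processors + 1 and num_tasks entries respectively.
//
//   Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
//       problemComm, 3, 4, machine_coords, 2, 8, task_coords,
//       task_comm_xadj, task_comm_adj, NULL /*edge weights*/,
//       proc_to_task_xadj, proc_to_task_adj,
//       recursion_depth, NULL /*part_no_array*/, NULL /*machine dims*/);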
/*! \brief Writes a gnuplot visualization of the mapping: the machine nodes
 *  as points, with arrows between the ranks of communicating tasks that sit
 *  on different nodes.
 */
template <typename proc_coord_t, typename v_lno_t>
inline void visualize_mapping(
    int myRank,
    const int machine_coord_dim,
    const int num_ranks,
    proc_coord_t **machine_coords,
    const v_lno_t num_tasks,
    const v_lno_t *task_communication_xadj,
    const v_lno_t *task_communication_adj,
    const int *task_to_rank){

  std::string rankStr = Teuchos::toString<int>(myRank);
  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
  std::string outF = gnuPlots + rankStr + extentionS;
  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);

  if (machine_coord_dim != 3) {
    std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
    return;
  }
  std::string ss = "";
  std::string procs = "";

  // remember drawn inter-node arrows to avoid duplicates
  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task){
    int origin_rank = task_to_rank[origin_task];
    std::string gnuPlotArrow = "set arrow from ";

    for (int j = 0; j < machine_coord_dim; ++j){
      if (j == machine_coord_dim - 1){
        gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
        procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
      }
      else {
        gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]) + ",";
        procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]) + " ";
      }
    }
    procs += "\n";

    gnuPlotArrow += " to ";

    for (int nind = task_communication_xadj[origin_task];
         nind < task_communication_xadj[origin_task + 1]; ++nind){

      int neighbor_task = task_communication_adj[nind];

      bool differentnode = false;
      int neighbor_rank = task_to_rank[neighbor_task];

      // only draw arrows between tasks on different nodes
      for (int j = 0; j < machine_coord_dim; ++j){
        if (int(machine_coords[j][origin_rank]) != int(machine_coords[j][neighbor_rank])){
          differentnode = true;
          break;
        }
      }
      std::tuple<int, int, int, int, int, int> foo(
          (int)(machine_coords[0][origin_rank]),
          (int)(machine_coords[1][origin_rank]),
          (int)(machine_coords[2][origin_rank]),
          (int)(machine_coords[0][neighbor_rank]),
          (int)(machine_coords[1][neighbor_rank]),
          (int)(machine_coords[2][neighbor_rank]));

      if (differentnode && my_arrows.find(foo) == my_arrows.end()){
        my_arrows.insert(foo);

        std::string gnuPlotArrow2 = "";
        for (int j = 0; j < machine_coord_dim; ++j){
          if (j == machine_coord_dim - 1){
            gnuPlotArrow2 += Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
          }
          else {
            gnuPlotArrow2 += Teuchos::toString<float>(machine_coords[j][neighbor_rank]) + ",";
          }
        }
        ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
      }
    }
  }

  std::ofstream procFile("procPlot.plot", std::ofstream::out);
  procFile << procs << "\n";
  procFile.close();

  // a 2d plot for 2d machines, a 3d splot otherwise
  if (machine_coord_dim == 2){
    gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
  }
  else {
    gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
  }

  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
  gnuPlotCode.close();
}
#endif // _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_