Zoltan2_TaskMapping.hpp
1 
2 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
3 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
4 
5 #include <fstream>
6 #include <ctime>
7 #include <vector>
8 #include <set>
9 #include <tuple>
10 
11 #include "Zoltan2_Standards.hpp"
12 #include "Zoltan2_AlgMultiJagged.hpp"
13 #include "Teuchos_ArrayViewDecl.hpp"
14 #include "Zoltan2_PartitionMapping.hpp"
15 #include "Zoltan2_MachineRepresentation.hpp"
16 #include "Teuchos_ReductionOp.hpp"
17 
18 #include "Zoltan2_MappingSolution.hpp"
19 
20 #include "Zoltan2_GraphModel.hpp"
21 
22 #include <Zoltan2_TPLTraits.hpp>
23 
24 #include "Teuchos_Comm.hpp"
25 #ifdef HAVE_ZOLTAN2_MPI
26 #include "Teuchos_DefaultMpiComm.hpp"
27 #endif // HAVE_ZOLTAN2_MPI
28 #include <Teuchos_DefaultSerialComm.hpp>
29 
30 //#define gnuPlot
31 #include "Zoltan2_Directory_Impl.hpp"
32 
33 
34 namespace Teuchos{
35 
36 /*! \brief Zoltan2_ReduceBestMapping Class, reduces the minimum cost
37  *  mapping, ties breaking with minimum proc id.
38  */
39 template <typename Ordinal, typename T>
40 class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
41 {
42 
43 private:
44  T epsilon;
45 
46 public:
47  /*! \brief Default Constructor
48  */
49  Zoltan2_ReduceBestMapping() : epsilon(std::numeric_limits<T>::epsilon()) {}
50 
51  /*! \brief Implement Teuchos::ValueTypeReductionOp interface
52  */
53  void reduce(const Ordinal count,
54  const T inBuffer[],
55  T inoutBuffer[]) const {
56 
57  for (Ordinal i = 0; i < count; i++) {
58  if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
59  inoutBuffer[0] = inBuffer[0];
60  inoutBuffer[1] = inBuffer[1];
61  } else if(inBuffer[0] - inoutBuffer[0] < epsilon &&
62  inBuffer[1] - inoutBuffer[1] < epsilon) {
63  inoutBuffer[0] = inBuffer[0];
64  inoutBuffer[1] = inBuffer[1];
65  }
66  }
67  }
68 };
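/* Illustrative usage sketch (mirrors getBestMapping() further below;
   variable names here are assumptions, not part of this header): every
   rank contributes a (cost, rank) pair, and the reduction keeps the pair
   with the smaller cost, breaking near-ties (within epsilon) in favor
   of the smaller rank value.

     double localCost[2]  = {myCost, double(comm->getRank())};
     double globalCost[2] = {std::numeric_limits<double>::max(),
                             std::numeric_limits<double>::max()};
     Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
     Teuchos::reduceAll<int, double>(*comm, reduceBest,
                                     2, localCost, globalCost);
     // globalCost[0]: best cost, globalCost[1]: rank that produced it
*/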
69 
70 } // namespace Teuchos
71 
72 
73 namespace Zoltan2{
74 
75 template <typename it>
76 inline it z2Fact(it x) {
77  return (x == 1 ? x : x * z2Fact<it>(x - 1));
78 }
79 
80 template <typename gno_t, typename part_t>
81 class GNO_LNO_PAIR {
82 public:
83  gno_t gno;
84  part_t part;
85 };
86 
87 //returns the ith permutation indices.
88 template <typename IT>
89 void ithPermutation(const IT n, IT i, IT *perm)
90 {
91  IT j, k = 0;
92  IT *fact = new IT[n];
93 
94 
95  // compute factorial numbers
96  fact[k] = 1;
97  while (++k < n)
98  fact[k] = fact[k - 1] * k;
99 
100  // compute factorial code
101  for (k = 0; k < n; ++k)
102  {
103  perm[k] = i / fact[n - 1 - k];
104  i = i % fact[n - 1 - k];
105  }
106 
107  // readjust values to obtain the permutation
108  // start from the end and check if preceding values are lower
109  for (k = n - 1; k > 0; --k)
110  for (j = k - 1; j >= 0; --j)
111  if (perm[j] <= perm[k])
112  perm[k]++;
113 
114  delete [] fact;
115 }
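/* Worked example (hand-checked against the code above): for n = 3 the
   lexicographic permutations of {0,1,2} are indexed
   0:{0,1,2} 1:{0,2,1} 2:{1,0,2} 3:{1,2,0} 4:{2,0,1} 5:{2,1,0}, so

     int perm[3];
     ithPermutation<int>(3, 3, perm);   // perm = {1, 2, 0}

   z2Fact<it>(n) gives the number n! of valid permutation indices.
*/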
116 
117 template <typename part_t>
118 void getGridCommunicationGraph(part_t taskCount,
119  part_t *&task_comm_xadj,
120  part_t *&task_comm_adj,
121  std::vector<int> grid_dims) {
122  int dim = grid_dims.size();
123  int neighborCount = 2 * dim;
124  task_comm_xadj = new part_t[taskCount + 1];
125  task_comm_adj = new part_t[taskCount * neighborCount];
126 
127  part_t neighBorIndex = 0;
128  task_comm_xadj[0] = 0;
129  for (part_t i = 0; i < taskCount; ++i) {
130  part_t prevDimMul = 1;
131  for (int j = 0; j < dim; ++j) {
132  part_t lNeighbor = i - prevDimMul;
133  part_t rNeighbor = i + prevDimMul;
134  prevDimMul *= grid_dims[j];
135  if (lNeighbor >= 0 &&
136  lNeighbor/ prevDimMul == i / prevDimMul &&
137  lNeighbor < taskCount) {
138  task_comm_adj[neighBorIndex++] = lNeighbor;
139  }
140  if (rNeighbor >= 0 &&
141  rNeighbor/ prevDimMul == i / prevDimMul &&
142  rNeighbor < taskCount) {
143  task_comm_adj[neighBorIndex++] = rNeighbor;
144  }
145  }
146  task_comm_xadj[i + 1] = neighBorIndex;
147  }
148 
149 }
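/* Worked example (hand-checked): for a 2x2 grid, i.e. taskCount = 4
   and grid_dims = {2, 2}, the routine produces the CSR pair

     task_comm_xadj = {0, 2, 4, 6, 8}
     task_comm_adj  = {1, 2,  0, 3,  3, 0,  2, 1}

   so task 0 neighbors tasks 1 (+x) and 2 (+y), and so on. Both arrays
   are allocated with new[] here, so the caller must delete[] them.
*/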
150 //returns the center of the parts.
151 template <typename Adapter, typename scalar_t, typename part_t>
152 void getSolutionCenterCoordinates(
153  const Environment *envConst,
154  const Teuchos::Comm<int> *comm,
155  const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
156  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
157  const part_t *parts,
158  int coordDim,
159  part_t ntasks,
160  scalar_t **partCenters) {
161 
162  typedef typename Adapter::lno_t lno_t;
163  typedef typename Adapter::gno_t gno_t;
164 
165  typedef StridedData<lno_t, scalar_t> input_t;
166  ArrayView<const gno_t> gnos;
167  ArrayView<input_t> xyz;
168  ArrayView<input_t> wgts;
169  coords->getCoordinates(gnos, xyz, wgts);
170 
171  //local and global num coordinates.
172  lno_t numLocalCoords = coords->getLocalNumCoordinates();
173  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();
174 
175  //local number of points in each part.
176  gno_t *point_counts = new gno_t[ntasks];
177  memset(point_counts, 0, sizeof(gno_t) * ntasks);
178 
179  //global number of points in each part.
180  gno_t *global_point_counts = new gno_t[ntasks];
181 
182  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];
183 
184  ArrayRCP<ArrayRCP<const scalar_t>> ar = arcp(new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
185  for (int dim=0; dim < coordDim; dim++){
186  xyz[dim].getInputArray(ar[dim]);
187  //multiJagged coordinate values assignment
188  multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
189  memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
190  }
191 
192  //get parts with parallel gnos.
193  //const part_t *parts = soln_->getPartListView();
194 
195  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");
196 
197  for (lno_t i=0; i < numLocalCoords; i++) {
198  part_t p = parts[i];
199  //add up all coordinates in each part.
200  for(int j = 0; j < coordDim; ++j) {
201  scalar_t c = multiJagged_coordinates[j][i];
202  partCenters[j][p] += c;
203  }
204  ++point_counts[p];
205  }
206 
207  //get global number of points in each part.
208  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
209  ntasks, point_counts, global_point_counts);
210 
211  for(int j = 0; j < coordDim; ++j) {
212  for (part_t i=0; i < ntasks; ++i) {
213  if (global_point_counts[i] > 0)
214  partCenters[j][i] /= global_point_counts[i];
215  }
216  }
217 
218  scalar_t *tmpCoords = new scalar_t[ntasks];
219  for(int j = 0; j < coordDim; ++j) {
220  reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
221  ntasks, partCenters[j], tmpCoords);
222 
223  scalar_t *tmp = partCenters[j];
224  partCenters[j] = tmpCoords;
225  tmpCoords = tmp;
226  }
227 
228  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");
229 
230  delete [] point_counts;
231  delete [] global_point_counts;
232 
233  delete [] tmpCoords;
234  delete [] multiJagged_coordinates;
235 }
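/* Illustrative sketch: each part center is the global arithmetic mean
   of the coordinates assigned to that part. With coordDim = 1,
   parts = {0, 0, 1} and coordinates {0.0, 4.0, 5.0} on one rank,
   partCenters[0] becomes {2.0, 5.0}; per-part coordinate sums and
   point counts are each combined across ranks with Teuchos::REDUCE_SUM
   before dividing.
*/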
236 
237 //returns the coarsened part graph.
238 template <typename Adapter, typename scalar_t, typename part_t>
239 void getCoarsenedPartGraph(
240  const Environment *envConst,
241  const Teuchos::Comm<int> *comm,
242  const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
243  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
244  part_t np,
245  const part_t *parts,
246  ArrayRCP<part_t> &g_part_xadj,
247  ArrayRCP<part_t> &g_part_adj,
248  ArrayRCP<scalar_t> &g_part_ew) {
249 
250  typedef typename Adapter::lno_t t_lno_t;
251  typedef typename Adapter::gno_t t_gno_t;
252  typedef typename Adapter::scalar_t t_scalar_t;
253  typedef typename Adapter::offset_t t_offset_t;
254  typedef typename Zoltan2::GraphModel<
255  typename Adapter::base_adapter_t>::input_t t_input_t;
256 
257  //int numRanks = comm->getSize();
258  //int myRank = comm->getRank();
259 
260  //get parts with parallel gnos.
261  /*
262  const part_t *parts = soln_->getPartListView();
263 
264  part_t np = soln_->getActualGlobalNumberOfParts();
265  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
266  np = soln_->getTargetGlobalNumberOfParts();
267  }
268  */
269 
270 
271  t_lno_t localNumVertices = graph->getLocalNumVertices();
272  t_lno_t localNumEdges = graph->getLocalNumEdges();
273 
274  //get the vertex global ids, and weights
275  ArrayView<const t_gno_t> Ids;
276  ArrayView<t_input_t> v_wghts;
277  graph->getVertexList(Ids, v_wghts);
278 
279  //get the edge ids, and weights
280  ArrayView<const t_gno_t> edgeIds;
281  ArrayView<const t_offset_t> offsets;
282  ArrayView<t_input_t> e_wgts;
283  graph->getEdgeList(edgeIds, offsets, e_wgts);
284 
285  std::vector<t_scalar_t> edge_weights;
286  int numWeightPerEdge = graph->getNumWeightsPerEdge();
287 
288  if (numWeightPerEdge > 0) {
289  edge_weights = std::vector<t_scalar_t>(localNumEdges);
290  for (t_lno_t i = 0; i < localNumEdges; ++i) {
291  edge_weights[i] = e_wgts[0][i];
292  }
293  }
294 
295  //create a zoltan dictionary to get the parts of the vertices
296  //at the other end of edges
297  std::vector<part_t> e_parts(localNumEdges);
298 #ifdef HAVE_ZOLTAN2_MPI
299  if (comm->getSize() > 1) {
300  const bool bUseLocalIDs = false; // Local IDs not needed
301  typedef Zoltan2_Directory_Simple<t_gno_t,t_lno_t,part_t> directory_t;
302  int debug_level = 0;
303  const RCP<const Comm<int> > rcp_comm(comm,false);
304  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
305  directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
306  NULL, directory_t::Update_Mode::Replace);
307  directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
308  NULL, NULL, false);
309  } else
310 #endif
311  {
312 
313  /*
314  std::cout << "localNumVertices:" << localNumVertices
315  << " np:" << np
316  << " globalNumVertices:" << graph->getGlobalNumVertices()
317  << " localNumEdges:" << localNumEdges << std::endl;
318  */
319 
320  for (t_lno_t i = 0; i < localNumEdges; ++i) {
321  t_gno_t ei = edgeIds[i];
322  part_t p = parts[ei];
323  e_parts[i] = p;
324  }
325 
326  //get the vertices in each part in my part.
327  std::vector<t_lno_t> part_begins(np, -1);
328  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
329 
330  //cluster vertices according to their parts.
331  //create local part graph.
332  for (t_lno_t i = 0; i < localNumVertices; ++i) {
333  part_t ap = parts[i];
334  part_nexts[i] = part_begins[ap];
335  part_begins[ap] = i;
336  }
337 
338 
339  g_part_xadj = ArrayRCP<part_t>(np + 1);
340  g_part_adj = ArrayRCP<part_t>(localNumEdges);
341  g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
342  part_t nindex = 0;
343  g_part_xadj[0] = 0;
344  std::vector<part_t> part_neighbors(np);
345  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
346  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
347 
348  //coarsen for all vertices in my part in order with parts.
349  for (t_lno_t i = 0; i < np; ++i) {
350  part_t num_neighbor_parts = 0;
351  t_lno_t v = part_begins[i];
352  //get part i, and first vertex in this part v.
353  while (v != -1) {
354  //now get the neighbors of v.
355  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
356  //get the part of the second vertex.
357  part_t ep = e_parts[j];
358 
359  t_scalar_t ew = 1;
360  if (numWeightPerEdge > 0) {
361  ew = edge_weights[j];
362  }
363 
364 // std::cout << "part:" << i << " v:" << v
365 // << " part2:" << ep << " v2:" << edgeIds[j]
366 // << " w:" << ew << std::endl;
367 
368  if (part_neighbor_weights[ep] < 0.00001) {
369  part_neighbors[num_neighbor_parts++] = ep;
370  }
371 
372  part_neighbor_weights[ep] += ew;
373  }
374 
375  v = part_nexts[v];
376  }
377 
378 
379  //now get the part list.
380  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
381  part_t neighbor_part = part_neighbors[j];
382  g_part_adj[nindex] = neighbor_part;
383  g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
384  part_neighbor_weights[neighbor_part] = 0;
385  }
386  g_part_xadj[i + 1] = nindex;
387  }
388  }
389 
390 #ifdef HAVE_ZOLTAN2_MPI
391  if(comm->getSize() > 1) { // Otherwise it's already handled above and done
392 
393  // struct for directory data - note more extensive comments in
394  // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
395  // because this is in progress. Similar concept there.
396  struct part_info {
397  part_info() : weight(0) {
398  }
399 
400  const part_info & operator+=(const part_info & src) {
401  this->weight += src.weight;
402  return *this;
403  }
404 
405  bool operator>(const part_info & src) {
406  return (destination_part > src.destination_part);
407  }
408 
409  bool operator==(const part_info & src) {
410  return (destination_part == src.destination_part);
411  }
412 
413  part_t destination_part;
414  scalar_t weight;
415  };
416 
417  bool bUseLocalIDs = false;
418  const int debug_level = 0;
419  typedef Zoltan2_Directory_Vector<part_t,int,std::vector<part_info>>
420  directory_t;
421  const RCP<const Comm<int> > rcp_comm(comm,false);
422  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
423  std::vector<part_t> part_data;
424  std::vector<std::vector<part_info>> user_data;
425 
426  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
427  {
428  //get the vertices in each part in my part.
429  std::vector<t_lno_t> part_begins(np, -1);
430  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
431 
432  //cluster vertices according to their parts.
433  //create local part graph.
434  for (t_lno_t i = 0; i < localNumVertices; ++i) {
435  part_t ap = parts[i];
436  part_nexts[i] = part_begins[ap];
437  part_begins[ap] = i;
438  }
439 
440  std::vector<part_t> part_neighbors(np);
441  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
442  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
443 
444  //coarsen for all vertices in my part in order with parts.
445  for (t_lno_t i = 0; i < np; ++i) {
446  part_t num_neighbor_parts = 0;
447  t_lno_t v = part_begins[i];
448 
449  //get part i, and first vertex in this part v.
450  while (v != -1) {
451  //now get the neighbors of v.
452  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
453  //get the part of the second vertex.
454  part_t ep = e_parts[j];
455 
456  t_scalar_t ew = 1;
457  if (numWeightPerEdge > 0) {
458  ew = edge_weights[j];
459  }
460 
461  //add it to my local part neighbors for part i.
462  if (part_neighbor_weights[ep] < 0.00001) {
463  part_neighbors[num_neighbor_parts++] = ep;
464  }
465 
466  part_neighbor_weights[ep] += ew;
467  }
468 
469  v = part_nexts[v];
470  }
471 
472  //now get the part list.
473  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
474  part_t neighbor_part = part_neighbors[j];
475  part_neighbor_weights_ordered[j] =
476  part_neighbor_weights[neighbor_part];
477  part_neighbor_weights[neighbor_part] = 0;
478  }
479 
480  //insert it to tpetra crsmatrix.
481  if (num_neighbor_parts > 0) {
482  part_data.push_back(i); // TODO: optimize to avoid push_back
483  std::vector<part_info> new_user_data(num_neighbor_parts);
484  for(int q = 0; q < num_neighbor_parts; ++q) {
485  part_info & info = new_user_data[q];
486  info.weight = part_neighbor_weights_ordered[q];
487  info.destination_part = part_neighbors[q];
488  }
489  // TODO: optimize to avoid push_back
490  user_data.push_back(new_user_data);
491  }
492  }
493  }
494  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
495 
496  std::vector<part_t> part_indices(np);
497 
498  for (part_t i = 0; i < np; ++i) part_indices[i] = i;
499 
500  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
501  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
502  NULL, directory_t::Update_Mode::AggregateAdd);
503  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");
504 
505  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
506  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
507  directory.find(find_user_data.size(), &part_indices[0], NULL,
508  &find_user_data[0], NULL, NULL, false);
509  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");
510 
511  // Now reconstruct the output data from the directory find data
512  // This code was designed to reproduce the exact format of the original
513  // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
514  // further changes I wanted to verify if this formatting should be
515  // preserved or potentially changed further.
516 
517  // first thing is get the total number of elements
518  int get_total_length = 0;
519  for (size_t n = 0; n < find_user_data.size(); ++n) {
520  get_total_length += find_user_data[n].size();
521  }
522 
523  // setup data space
524  g_part_xadj = ArrayRCP<part_t>(np + 1);
525  g_part_adj = ArrayRCP<part_t>(get_total_length);
526  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);
527 
528  // loop through again and fill to match the original formatting
529  int track_insert_index = 0;
530  for(size_t n = 0; n < find_user_data.size(); ++n) {
531  g_part_xadj[n] = track_insert_index;
532  const std::vector<part_info> & user_data_vector = find_user_data[n];
533  for(size_t q = 0; q < user_data_vector.size(); ++q) {
534  const part_info & info = user_data_vector[q];
535  g_part_adj[track_insert_index] = info.destination_part;
536  g_part_ew[track_insert_index] = info.weight;
537  ++track_insert_index;
538  }
539  }
540  g_part_xadj[np] = get_total_length; // complete the series
541  }
542 #endif // HAVE_ZOLTAN2_MPI
543 }
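/* Illustrative sketch (serial branch, hand-checked): for the path
   graph 0-1-2-3 with parts = {0, 0, 1, 1}, unit edge weights and
   np = 2, the coarsened CSR output is

     g_part_xadj = {0, 2, 4}
     g_part_adj  = {0, 1,  1, 0}
     g_part_ew   = {2, 1,  2, 1}

   The weight-2 self edges come from the intra-part edges 0-1 and 2-3
   (counted once per endpoint); the single cut edge 1-2 contributes
   weight 1 in each direction.
*/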
544 
545 
546 /*! \brief KmeansHeap Class, max heap, but holds the minimum values.
547  */
548 template <class IT, class WT>
549 class KmeansHeap{
550 
551  IT heapSize;
552  IT *indices;
553  WT *values;
554  WT epsilon;
555 
556 
557 public:
558  ~KmeansHeap(){
559  delete [] this->indices;
560  delete [] this->values;
561  }
562 
563  void setHeapsize(IT heapsize_){
564  this->heapSize = heapsize_;
565  this->indices = new IT[heapsize_];
566  this->values = new WT[heapsize_];
567  this->epsilon = std::numeric_limits<WT>::epsilon();
568  }
569 
570  void addPoint(IT index, WT distance) {
571  WT maxVal = this->values[0];
572  //add only if the distance is smaller than the maximum distance.
573 // std::cout << "indeX:" << index
574 // << " distance:" << distance
575 // << " maxVal:" << maxVal << endl;
576  if (distance >= maxVal)
577  return;
578  else {
579  this->values[0] = distance;
580  this->indices[0] = index;
581  this->push_down(0);
582  }
583  }
584 
585  //heap push down operation
586  void push_down(IT index_on_heap) {
587  IT child_index1 = 2 * index_on_heap + 1;
588  IT child_index2 = 2 * index_on_heap + 2;
589 
590  IT biggerIndex = -1;
591  if(child_index1 < this->heapSize && child_index2 < this->heapSize) {
592 
593  if (this->values[child_index1] < this->values[child_index2]) {
594  biggerIndex = child_index2;
595  }
596  else {
597  biggerIndex = child_index1;
598  }
599  }
600  else if(child_index1 < this->heapSize) {
601  biggerIndex = child_index1;
602 
603  }
604  else if(child_index2 < this->heapSize) {
605  biggerIndex = child_index2;
606  }
607  if (biggerIndex >= 0 &&
608  this->values[biggerIndex] > this->values[index_on_heap]) {
609  WT tmpVal = this->values[biggerIndex];
610  this->values[biggerIndex] = this->values[index_on_heap];
611  this->values[index_on_heap] = tmpVal;
612 
613  IT tmpIndex = this->indices[biggerIndex];
614  this->indices[biggerIndex] = this->indices[index_on_heap];
615  this->indices[index_on_heap] = tmpIndex;
616  this->push_down(biggerIndex);
617  }
618  }
619 
620  void initValues() {
621  WT MAXVAL = std::numeric_limits<WT>::max();
622 
623  for(IT i = 0; i < this->heapSize; ++i) {
624  this->values[i] = MAXVAL;
625  this->indices[i] = -1;
626  }
627  }
628 
629  //returns the total distance to center in the cluster.
630  WT getTotalDistance(){
631 
632  WT nc = 0;
633  for(IT j = 0; j < this->heapSize; ++j) {
634  nc += this->values[j];
635 
636 // std::cout << "index:" << this->indices[j]
637 // << " distance:" << this->values[j] << endl;
638  }
639  return nc;
640  }
641 
642  //returns the new center of the cluster.
643  bool getNewCenters(WT *center, WT **coords, int dimension) {
644  bool moved = false;
645 
646  for(int i = 0; i < dimension; ++i) {
647  WT nc = 0;
648 
649  for(IT j = 0; j < this->heapSize; ++j) {
650  IT k = this->indices[j];
651 // std::cout << "i:" << i
652 // << " dim:" << dimension
653 // << " k:" << k
654 // << " heapSize:" << heapSize << endl; nc += coords[i][k];
655  nc += coords[i][k];
656  }
657 
658  nc /= this->heapSize;
659  moved = (std::abs(center[i] - nc) > this->epsilon || moved );
660  center[i] = nc;
661  }
662  return moved;
663  }
664 
665  void copyCoordinates(IT *permutation) {
666  for (IT i = 0; i < this->heapSize; ++i) {
667  permutation[i] = this->indices[i];
668  }
669  }
670 };
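/* Illustrative sketch: despite the name, this is a fixed-size max-heap
   on distances, so it retains the heapSize smallest distances seen.
   A hypothetical standalone use:

     KmeansHeap<int, double> h;
     h.setHeapsize(2);
     h.initValues();           // all slots start at +infinity
     h.addPoint(7, 3.0);
     h.addPoint(4, 1.0);
     h.addPoint(9, 2.0);       // 2.0 < current max 3.0: evicts index 7
     // h now holds the two closest points, indices 4 and 9
*/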
671 
672 /*! \brief KMeansCluster Class
673  */
674 template <class IT, class WT>
675 class KMeansCluster{
676 
677  int dimension;
678  KmeansHeap<IT,WT> closestPoints;
679 
680 public:
681 
682  WT *center;
683 
684  ~KMeansCluster(){
685  delete [] center;
686  }
687 
688  void setParams(int dimension_, int heapsize) {
689  this->dimension = dimension_;
690  this->center = new WT[dimension_];
691  this->closestPoints.setHeapsize(heapsize);
692  }
693 
694  void clearHeap(){
695  this->closestPoints.initValues();
696  }
697 
698  bool getNewCenters( WT **coords) {
699  return this->closestPoints.getNewCenters(center, coords, dimension);
700  }
701 
702  //returns the distance of the coordinate to the center.
703  //also adds it to the heap.
704  WT getDistance(IT index, WT **elementCoords) {
705  WT distance = 0;
706 
707  for (int i = 0; i < this->dimension; ++i) {
708  WT d = (center[i] - elementCoords[i][index]);
709  distance += d * d;
710  }
711  distance = pow(distance, WT(1.0 / this->dimension));
712  closestPoints.addPoint(index, distance);
713 
714  return distance;
715  }
716 
717  WT getDistanceToCenter(){
718  return closestPoints.getTotalDistance();
719  }
720 
721  void copyCoordinates(IT *permutation) {
722  closestPoints.copyCoordinates(permutation);
723  }
724 };
725 
732 template <class IT, class WT>
733 class KMeansAlgorithm{
734 
735  int dim;
736  IT numElements;
737  WT **elementCoords;
738  IT numClusters;
739  IT required_elements;
740  KMeansCluster<IT,WT> *clusters;
741  WT *maxCoordinates;
742  WT *minCoordinates;
743 public:
744 
745  ~KMeansAlgorithm(){
746  delete [] clusters;
747  delete [] maxCoordinates;
748  delete [] minCoordinates;
749  }
750 
751  /*! \brief KMeansAlgorithm Constructor
752  */
753  KMeansAlgorithm(
754  int dim_ ,
755  IT numElements_,
756  WT **elementCoords_,
757  IT required_elements_):
758  dim(dim_),
759  numElements(numElements_),
760  elementCoords(elementCoords_),
761  numClusters((1 << dim_) + 1),
762  required_elements(required_elements_) {
763  this->clusters = new KMeansCluster<IT,WT>[this->numClusters];
764 
765  //set dimension and the number of required elements for all clusters.
766  for (int i = 0; i < numClusters; ++i) {
767  this->clusters[i].setParams(this->dim, this->required_elements);
768  }
769 
770  this->maxCoordinates = new WT[this->dim];
771  this->minCoordinates = new WT[this->dim];
772 
773  //obtain the min and max coordinates for each dimension.
774  for (int j = 0; j < dim; ++j) {
775  this->minCoordinates[j] = this->elementCoords[j][0];
776  this->maxCoordinates[j] = this->elementCoords[j][0];
777 
778  for(IT i = 1; i < numElements; ++i) {
779  WT t = this->elementCoords[j][i];
780  if(t > this->maxCoordinates[j]){
781  this->maxCoordinates[j] = t;
782  }
783 
784  if (t < minCoordinates[j]) {
785  this->minCoordinates[j] = t;
786  }
787  }
788  }
789 
790 
791  //assign initial cluster centers.
792  for (int j = 0; j < dim; ++j) {
793  int mod = (1 << (j+1));
794  for (int i = 0; i < numClusters - 1; ++i) {
795  WT c = 0;
796 
797  if ( (i % mod) < mod / 2) {
798  c = this->maxCoordinates[j];
799 // std::cout << "i:" << i << " j:" << j
800 // << " setting max:" << c << endl;
801  }
802  else {
803  c = this->minCoordinates[j];
804  }
805 
806  this->clusters[i].center[j] = c;
807  }
808  }
809 
810  //last cluster center is placed to middle.
811  for (int j = 0; j < dim; ++j) {
812  this->clusters[numClusters - 1].center[j] =
813  (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
814  }
815 
816 
817 /*
818  for (int i = 0; i < numClusters; ++i) {
819 // std::cout << endl << "cluster:" << i << endl << "\t";
820 
821  for (int j = 0; j < dim; ++j) {
822  std::cout << this->clusters[i].center[j] << " ";
823  }
824  }
825 */
826  }
827 
828  // Performs kmeans clustering of coordinates.
829  void kmeans() {
830  for (int it = 0; it < 10; ++it) {
831 // std::cout << "it:" << it << endl;
832 
833  for (IT j = 0; j < this->numClusters; ++j) {
834  this->clusters[j].clearHeap();
835  }
836 
837  for (IT i = 0; i < this->numElements; ++i) {
838 // std::cout << "i:" << i << " numEl:" << this->numElements << endl;
839 
840  for (IT j = 0; j < this->numClusters; ++j) {
841 // std::cout << "j:" << j
842 // << " numClusters:" << this->numClusters << endl;
843 
844  this->clusters[j].getDistance(i,this->elementCoords);
845  }
846  }
847 
848  bool moved = false;
849 
850  for (IT j = 0; j < this->numClusters; ++j) {
851  moved =
852  (this->clusters[j].getNewCenters(this->elementCoords) || moved );
853  }
854  if (!moved) {
855  break;
856  }
857  }
858  }
859 
860  // Finds the cluster in which the coordinates are the closest to each
861  // other.
862  void getMinDistanceCluster(IT *procPermutation) {
863 
864  WT minDistance = this->clusters[0].getDistanceToCenter();
865  IT minCluster = 0;
866 
867 // std::cout << "j:" << 0 << " minDistance:" << minDistance
868 // << " minTmpDistance:" << minDistance
869 // << " minCluster:" << minCluster << endl;
870 
871  for (IT j = 1; j < this->numClusters; ++j) {
872  WT minTmpDistance = this->clusters[j].getDistanceToCenter();
873 
874 // std::cout << "j:" << j << " minDistance:" << minDistance
875 // << " minTmpDistance:" << minTmpDistance
876 // << " minCluster:" << minCluster << endl;
877 
878  if (minTmpDistance < minDistance) {
879  minDistance = minTmpDistance;
880  minCluster = j;
881  }
882  }
883 
884 // std::cout << "minCluster:" << minCluster << endl;
885  this->clusters[minCluster].copyCoordinates(procPermutation);
886  }
887 };
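/* Illustrative usage sketch (mirrors getClosestSubset() further below;
   variable names are assumptions): choose the ntasks most tightly
   clustered processors out of nprocs candidates. (1 << dim) + 1
   centers are seeded at the bounding-box corners and midpoint, each
   cluster greedily keeps its ntasks nearest points, and the cluster
   with the smallest total distance wins:

     KMeansAlgorithm<part_t, pcoord_t> kma(dim, nprocs,
                                           procCoords, ntasks);
     kma.kmeans();                          // at most 10 iterations
     kma.getMinDistanceCluster(permutation);
     // permutation[0 .. ntasks-1]: the chosen processor indices
*/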
888 
889 
890 #define MINOF(a,b) (((a)<(b))?(a):(b))
891 
899 template <typename T>
900 void fillContinousArray(T *arr, size_t arrSize, T *val) {
901  if (val == NULL) {
902 
903 #ifdef HAVE_ZOLTAN2_OMP
904 #pragma omp parallel for
905 #endif
906  for (size_t i = 0; i < arrSize; ++i) {
907  arr[i] = i;
908  }
909 
910  }
911  else {
912  T v = *val;
913 #ifdef HAVE_ZOLTAN2_OMP
914 #pragma omp parallel for
915 #endif
916  for (size_t i = 0; i < arrSize; ++i) {
917 // std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
918  arr[i] = v;
919  }
920  }
921 }
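/* Illustrative sketch: passing NULL fills with the identity, passing a
   value broadcasts it:

     part_t zero = 0;
     fillContinousArray<part_t>(xadj, nparts + 1, &zero); // 0,0,0,...
     fillContinousArray<part_t>(perm, nprocs, NULL);      // 0,1,2,...
*/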
922 
926 template <typename part_t, typename pcoord_t, typename node_t>
927 class CommunicationModel{
928 
929 protected:
930  double commCost;
931 
932 public:
933 
934  // Number of processors and number of tasks
935  part_t no_procs;
936  part_t no_tasks;
937 
938 
939  CommunicationModel(): commCost(), no_procs(0), no_tasks(0) {}
940  CommunicationModel(part_t no_procs_, part_t no_tasks_):
941  commCost(),
942  no_procs(no_procs_),
943  no_tasks(no_tasks_) {}
944 
945  virtual ~CommunicationModel() {}
946 
947  part_t getNProcs() const{
948  return this->no_procs;
949  }
950 
951  part_t getNTasks() const{
952  return this->no_tasks;
953  }
954 
955  void calculateCommunicationCost(
956  part_t *task_to_proc,
957  part_t *task_communication_xadj,
958  part_t *task_communication_adj,
959  pcoord_t *task_communication_edge_weight) {
960 
961  double totalCost = 0;
962 
963  part_t commCount = 0;
964  for (part_t task = 0; task < this->no_tasks; ++task) {
965  int assigned_proc = task_to_proc[task];
966 
967  part_t task_adj_begin = task_communication_xadj[task];
968  part_t task_adj_end = task_communication_xadj[task + 1];
969 
970  commCount += task_adj_end - task_adj_begin;
971 
972  for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
973 
974  part_t neighborTask = task_communication_adj[task2];
975  int neighborProc = task_to_proc[neighborTask];
976  double distance = getProcDistance(assigned_proc, neighborProc);
977 
978  if (task_communication_edge_weight == NULL) {
979  totalCost += distance ;
980  }
981  else {
982  totalCost += distance * task_communication_edge_weight[task2];
983  }
984  }
985  }
986 
987  // commCount
988  this->commCost = totalCost;
989  }
990 
991  double getCommunicationCostMetric() {
992  return this->commCost;
993  }
994 
995  virtual double getProcDistance(int procId1, int procId2) const = 0;
996 
1006  virtual void getMapping(
1007  int myRank,
1008  const RCP<const Environment> &env,
1009  ArrayRCP <part_t> &proc_to_task_xadj,
1010  ArrayRCP <part_t> &proc_to_task_adj,
1011  ArrayRCP <part_t> &task_to_proc,
1012  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1013  ) const = 0;
1014 };
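/* Illustrative sketch of the cost metric: with two tasks mapped to two
   processors, one directed task edge 0 -> 1 of weight w, and
   getProcDistance(p0, p1) = 2 hops, calculateCommunicationCost()
   accumulates totalCost = 2 * w; when
   task_communication_edge_weight == NULL each edge counts its hop
   distance alone.
*/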
1015 
1016 
1020 template <typename pcoord_t, typename tcoord_t, typename part_t, typename node_t>
1021 class CoordinateCommunicationModel :
1022  public CommunicationModel<part_t, pcoord_t, node_t> {
1023 public:
1024  //private:
1025 
1026  // Dimension of the processors
1027  int proc_coord_dim;
1028  // Processor coordinates (allocated outside of the class)
1029  pcoord_t **proc_coords;
1030  // Dimension of the tasks coordinates.
1031  int task_coord_dim;
1032  // Task coordinates (allocated outside of the class)
1033  tcoord_t **task_coords;
1034 
1035  // TODO: Perhaps delete this and just reference the view size?
1036  // Need to check the handling of size -1 versus size 0
1037  int partArraySize;
1038 
1039  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_partNoArray;
1040 
1041  int *machine_extent;
1042  bool *machine_extent_wrap_around;
1043  const MachineRepresentation<pcoord_t, part_t> *machine;
1044 
1045  int num_ranks_per_node;
1046  bool divide_to_prime_first;
1047 
1048  //public:
1049  CoordinateCommunicationModel():
1050  CommunicationModel<part_t, pcoord_t, node_t>(),
1051  proc_coord_dim(0),
1052  proc_coords(0),
1053  task_coord_dim(0),
1054  task_coords(0),
1055  partArraySize(-1),
1056  machine_extent(NULL),
1057  machine_extent_wrap_around(NULL),
1058  machine(NULL),
1059  num_ranks_per_node(1),
1060  divide_to_prime_first(false){}
1061 
1062  virtual ~CoordinateCommunicationModel() {}
1063 
1074  CoordinateCommunicationModel(
1075  int pcoord_dim_,
1076  pcoord_t **pcoords_,
1077  int tcoord_dim_,
1078  tcoord_t **tcoords_,
1079  part_t no_procs_,
1080  part_t no_tasks_,
1081  int *machine_extent_,
1082  bool *machine_extent_wrap_around_,
1083  const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
1084  ):
1085  CommunicationModel<part_t, pcoord_t, node_t>(no_procs_, no_tasks_),
1086  proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
1087  task_coord_dim(tcoord_dim_), task_coords(tcoords_),
1088  partArraySize(-1),
1089  machine_extent(machine_extent_),
1090  machine_extent_wrap_around(machine_extent_wrap_around_),
1091  machine(machine_),
1092  num_ranks_per_node(1),
1093  divide_to_prime_first(false) {
1094  }
1095 
1096 
1097  void setPartArraySize(int psize) {
1098  this->partArraySize = psize;
1099  }
1100 
1101  void setPartArray(Kokkos::View<part_t *, Kokkos::HostSpace> pNo) {
1102  this->kokkos_partNoArray = pNo;
1103  }
1104 
1105  /*! \brief Function is called whenever nprocs > no_task.
1106  *  Function returns only the subset of processors that are
1107  *  closest to each other.
1108  *  \param proc_permutation holds the indices of the processors
1109  *  that are chosen.
1110  *  \param nprocs the number of processors.
1111  *  \param ntasks the number of tasks. */
1112  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const{
1113  //returns the subset of processors that are closest to each other;
1114  //the commented-out code below returned a random subset instead.
1115  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1116  KMeansAlgorithm<part_t, pcoord_t> kma(
1117  minCoordDim, nprocs,
1118  this->proc_coords, ntasks);
1119 
1120  kma.kmeans();
1121  kma.getMinDistanceCluster(proc_permutation);
1122 
1123  for(int i = ntasks; i < nprocs; ++i) {
1124  proc_permutation[i] = -1;
1125  }
1126  /*
1127  //fill array.
1128  fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
1129  '
1130  int _u_umpa_seed = 847449649;
1131  srand (time(NULL));
1132 
1133  int a = rand() % 1000 + 1;
1134  _u_umpa_seed -= a;
1135 
1136  //permute array randomly.
1137  update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
1138  */
1139  }
1140 
1141  // Temporary, necessary for random permutation.
1142  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
1143  {
1144  int a = 16807;
1145  int m = 2147483647;
1146  int q = 127773;
1147  int r = 2836;
1148  int lo, hi, test;
1149  double d;
1150 
1151  lo = _u_umpa_seed % q;
1152  hi = _u_umpa_seed / q;
1153  test = (a * lo) - (r * hi);
1154  if (test>0)
1155  _u_umpa_seed = test;
1156  else
1157  _u_umpa_seed = test + m;
1158 
1159  d = (double) ((double) _u_umpa_seed / (double) m);
1160 
1161  return (part_t) (d*(double)l);
1162  }
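/* Note: this is the Park-Miller "minimal standard" linear congruential
   generator (a = 16807, m = 2^31 - 1) using Schrage's factorization
   (q = m / a, r = m % a) to avoid 32-bit overflow, with the uniform
   draw scaled into [0, l). A hypothetical draw:

     int seed = 847449649;
     part_t pick = umpa_uRandom(100, seed); // in [0, 100); updates seed
*/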
1163 
1164  virtual double getProcDistance(int procId1, int procId2) const {
1165  pcoord_t distance = 0;
1166  if (machine == NULL) {
1167  for (int i = 0 ; i < this->proc_coord_dim; ++i) {
1168  double d =
1169  std::abs(proc_coords[i][procId1] - proc_coords[i][procId2]);
1170  if (machine_extent_wrap_around && machine_extent_wrap_around[i]) {
1171  if (machine_extent[i] - d < d) {
1172  d = machine_extent[i] - d;
1173  }
1174  }
1175  distance += d;
1176  }
1177  }
1178  else {
1179  this->machine->getHopCount(procId1, procId2, distance);
1180  }
1181 
1182  return distance;
1183  }
1184 
1185 
1186  // Temporary, does random permutation.
1187  void update_visit_order(part_t* visitOrder, part_t n,
1188  int &_u_umpa_seed, part_t rndm) {
1189  part_t *a = visitOrder;
1190 
1191  if (rndm) {
1192  part_t i, u, v, tmp;
1193 
1194  if (n <= 4)
1195  return;
1196 
1197 // srand ( time(NULL) );
1198 // _u_umpa_seed = _u_umpa_seed1 - (rand()%100);
1199 
1200  for (i = 0; i < n; i += 16) {
1201  u = umpa_uRandom(n-4, _u_umpa_seed);
1202  v = umpa_uRandom(n-4, _u_umpa_seed);
1203 
1204  // FIXME (mfh 30 Sep 2015) This requires including
1205  // Zoltan2_AlgMultiJagged.hpp.
1206 
1207  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1208  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
1209  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
1210  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
1211  }
1212  }
1213  else {
1214  part_t i, end = n / 4;
1215 
1216  for (i = 1; i < end; i++) {
1217  part_t j = umpa_uRandom(n - i, _u_umpa_seed);
1218  part_t t = a[j];
1219  a[j] = a[n - i];
1220  a[n - i] = t;
1221  }
1222  }
1223 // PermuteInPlace(visitOrder, n);
1224  }
1225 
1226 
1227 
1236  virtual void getMapping(
1237  int myRank,
1238  const RCP<const Environment> &env,
1239  ArrayRCP <part_t> &rcp_proc_to_task_xadj,
1240  ArrayRCP <part_t> &rcp_proc_to_task_adj,
1241  ArrayRCP <part_t> &rcp_task_to_proc,
1242  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1243  ) const {
1244 
1245  rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->no_procs + 1);
1246  rcp_proc_to_task_adj = ArrayRCP <part_t>(this->no_tasks);
1247  rcp_task_to_proc = ArrayRCP <part_t>(this->no_tasks);
1248 
1249  // Holds the pointer to the task array
1250  part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
1251 
1252  // Holds the indices of task wrt to proc_to_task_xadj array.
1253  part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
1254 
1255  // Holds the processors mapped to tasks.
1256  part_t *task_to_proc = rcp_task_to_proc.getRawPtr();
1257 
1258  part_t invalid = 0;
1259  fillContinousArray<part_t> (proc_to_task_xadj, this->no_procs + 1, &invalid);
1260 
1261  // Obtain the number of parts that should be divided.
1262  part_t num_parts = MINOF(this->no_procs, this->no_tasks);
1263 
1264  // Obtain the min coordinate dim.
1265  // We no longer use the min coord dim: if machine dimension > task_dim,
1266  // we end up with a long line.
1267 // part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1268 
1269  int recursion_depth = partArraySize;
1270 
1271 // if (partArraySize < minCoordDim)
1272 // recursion_depth = minCoordDim;
1273  if (partArraySize == -1) {
1274 
1275  if (divide_to_prime_first) {
1276  // It is difficult to estimate the number of steps in this case
1277  // as each branch will have different depth.
1278  // The worst case happens when all prime factors are 3s.
1279  // P = 3^n, n recursion depth will divide parts to 2x and x
1280  // and n recursion depth with divide 2x into x and x.
1281  // Set it to upperbound here.
1282  // We could calculate the exact value here as well, but the
1283  // partitioning algorithm skips further ones anyways.
1284  recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
1284  }
1285  else {
1286  recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
1287  }
1288  }
1289 
1290  // Number of permutations for tasks and processors
1291  int taskPerm = 1;
1292  int procPerm = 1;
1293 
1294  // Get number of different permutations for task dimension ordering
1295  if (this->task_coord_dim <= 8)
1296  taskPerm = z2Fact<int>(this->task_coord_dim);
1297  // Prevent overflow
1298  else
1299  taskPerm = z2Fact<int>(8);
1300 
1301  // Get number of different permutations for proc dimension ordering
1302  if (this->proc_coord_dim <= 8)
1303  procPerm = z2Fact<int>(this->proc_coord_dim);
1304  // Prevent overflow
1305  else
1306  procPerm = z2Fact<int>(8);
1307 
1308 
1309  // Total number of permutations (both task and proc permuted)
1310  int permutations = taskPerm * procPerm;
1311 
1312  // Add permutations where we divide the processors with longest
1313  // dimension but permute tasks.
1314  permutations += taskPerm;
1315 
1316  // Add permutations where we divide the tasks with longest
1317  // dimension but permute procs.
1318  permutations += procPerm;
1319 
1320  // Add permutation with both tasks and procs divided by longest
1321  // dimension
1322  permutations += 1;
1323  //add one also that partitions based on the longest dimension.
1324 
1325  // Holds the pointers to proc_adjList
1326  part_t *proc_xadj = new part_t[num_parts+1];
1327 
1328  // Holds the processors in parts according to the result of
1329  // partitioning algorithm.
1330  // The processors assigned to part x is at
1331  // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
1332  part_t *proc_adjList = new part_t[this->no_procs];
1333 
1334 
1335  part_t used_num_procs = this->no_procs;
1336  if (this->no_procs > this->no_tasks) {
1337  // Obtain the subset of the processors that are closest to each
1338  // other.
1339  this->getClosestSubset(proc_adjList, this->no_procs,
1340  this->no_tasks);
1341  used_num_procs = this->no_tasks;
1342  }
1343  else {
1344  fillContinousArray<part_t>(proc_adjList,this->no_procs, NULL);
1345  }
1346 
1347  // Index of the permutation
1348  int myPermutation = myRank % permutations;
1349  bool task_partition_along_longest_dim = false;
1350  bool proc_partition_along_longest_dim = false;
1351 
1352 
1353  int myProcPerm = 0;
1354  int myTaskPerm = 0;
1355 
1356  if (myPermutation == 0) {
1357  task_partition_along_longest_dim = true;
1358  proc_partition_along_longest_dim = true;
1359  }
1360  else {
1361  --myPermutation;
1362  if (myPermutation < taskPerm) {
1363  proc_partition_along_longest_dim = true;
1364  // Index of the task permutation
1365  myTaskPerm = myPermutation;
1366  }
1367  else {
1368  myPermutation -= taskPerm;
1369  if (myPermutation < procPerm) {
1370  task_partition_along_longest_dim = true;
1371  // Index of the task permutation
1372  myProcPerm = myPermutation;
1373  }
1374  else {
1375  myPermutation -= procPerm;
1376  // Index of the proc permutation
1377  myProcPerm = myPermutation % procPerm;
1378  // Index of the task permutation
1379  myTaskPerm = myPermutation / procPerm;
1380  }
1381  }
1382  }
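/* Worked decode (hand-checked for 3-D coordinates, where
   taskPerm = procPerm = 3! = 6 and permutations = 36 + 6 + 6 + 1 = 49):
   rank 0 splits both procs and tasks along their longest dimension;
   ranks 1..6 take task permutations 0..5 with procs split along the
   longest dimension; ranks 7..12 are the mirrored case; for ranks
   13..48, m = myRank - 13 gives myProcPerm = m % 6 and
   myTaskPerm = m / 6. Higher ranks wrap around via
   myRank % permutations.
*/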
1383 
1384 /*
1385  if (task_partition_along_longest_dim &&
1386  proc_partition_along_longest_dim) {
1387  std::cout <<"me:" << myRank << " task:longest proc:longest"
1388  << " numPerms:" << permutations << std::endl;
1389  }
1390  else if (proc_partition_along_longest_dim) {
1391  std::cout <<"me:" << myRank << " task:" << myTaskPerm
1392  << " proc:longest" << " numPerms:" << permutations << std::endl;
1393  }
1394  else if (task_partition_along_longest_dim) {
1395  std::cout <<"me:" << myRank << " task: longest" << " proc:"
1396  << myProcPerm << " numPerms:" << permutations << std::endl;
1397  }
1398  else {
1399  std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:"
1400  << myProcPerm << " numPerms:" << permutations << std::endl;
1401  }
1402 */
1403 
1404  int *permutation = new int[(this->proc_coord_dim > this->task_coord_dim)
1405  ? this->proc_coord_dim : this->task_coord_dim];
1406 
1407  // Get the permutation order from the proc permutation index.
1408  if (this->proc_coord_dim <= 8)
1409  ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
1410  else
1411  ithPermutation<int>(8, myProcPerm, permutation);
1412 
1413 /*
1414  // Reorder the coordinate dimensions.
1415  pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
1416  for (int i = 0; i < this->proc_coord_dim; ++i) {
1417  pcoords[i] = this->proc_coords[permutation[i]];
1418 // std::cout << permutation[i] << " ";
1419  }
1420 */
1421 
1422  int procdim = this->proc_coord_dim;
1423  pcoord_t **pcoords = this->proc_coords;
1424 
1425 /*
1426  int procdim = this->proc_coord_dim;
1427  procdim = 6;
1428  //reorder the coordinate dimensions.
1429  pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
1430  for (int i = 0; i < procdim; ++i) {
1431  pcoords[i] = new pcoord_t[used_num_procs] ;
1432 // this->proc_coords[permutation[i]];
1433  }
1434 
1435  for (int k = 0; k < used_num_procs ; k++) {
1436  pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
1437  pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8 ;
1438 
1439  pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
1440  pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
1441  pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
1442  pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;
1443 
1444  //if (this->proc_coords[0][k] == 40 &&
1445  // this->proc_coords[1][k] == 8 &&
1446  // this->proc_coords[2][k] == 48) {
1447  if (this->proc_coords[0][k] == 5 &&
1448  this->proc_coords[1][k] == 0 &&
1449  this->proc_coords[2][k] == 10) {
1450  std::cout << "pcoords[0][k]:" << pcoords[0][k]
1451  << "pcoords[1][k]:" << pcoords[1][k]
1452  << "pcoords[2][k]:" << pcoords[2][k]
1453  << "pcoords[3][k]:" << pcoords[3][k]
1454  << "pcoords[4][k]:" << pcoords[4][k]
1455  << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
1456  }
1457  else if (pcoords[0][k] == 64 &&
1458  pcoords[1][k] == 0 &&
1459  pcoords[2][k] == 160 &&
1460  pcoords[3][k] == 16 &&
1461  pcoords[4][k] == 0 &&
1462  pcoords[5][k] == 10) {
1463  std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
1464  << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
1465  << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
1466  << std::endl;
1467  }
1468  }
1469 */
1470 
1471 // if (partNoArray == NULL)
1472 // std::cout << "partNoArray is null" << std::endl;
1473 // std::cout << "recursion_depth:" << recursion_depth
1474 // << " partArraySize:" << partArraySize << std::endl;
1475 
1476  // Optimization for Dragonfly Networks, First Level of partitioning
1477  // is imbalanced to ensure procs are divided by first RCA
1478  // coord (a.k.a. group).
1479  part_t num_group_count = 1;
1480  part_t *group_count = NULL;
1481 
1482  if (machine != NULL)
1483  num_group_count = machine->getNumUniqueGroups();
1484 
1485  if (num_group_count > 1) {
1486  group_count = new part_t[num_group_count];
1487  memset(group_count, 0, sizeof(part_t) * num_group_count);
1488 
1489  machine->getGroupCount(group_count);
1490  }
1491 
1492  // Do the partitioning and renumber the parts.
1493  env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
1494  // Partitioning of Processors
1495  AlgMJ<pcoord_t, part_t, part_t, part_t, node_t> mj_partitioner;
1496 
1497  typedef typename node_t::device_type device_t;
1498  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1499  Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
1500  kokkos_pcoords("pcoords", this->no_procs, procdim);
1501  auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
1502  for(int i = 0; i < procdim; ++i) {
1503  for(int j = 0; j < this->no_procs; ++j) {
1504  host_kokkos_pcoords(j,i) = pcoords[i][j];
1505  }
1506  }
1507  Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);
1508 
1509  Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_pcoords(
1510  "initial_selected_coords_output_permutation_pcoords", this->no_procs);
1511  typename Kokkos::View<part_t*, device_t>::HostMirror
1512  host_initial_selected_coords_output_permutation_pcoords =
1513  Kokkos::create_mirror_view(initial_selected_coords_output_permutation_pcoords);
1514  for(int n = 0; n < this->no_procs; ++n) {
1515  host_initial_selected_coords_output_permutation_pcoords(n) =
1516  proc_adjList[n];
1517  }
1518  Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
1519  host_initial_selected_coords_output_permutation_pcoords);
1520 
1521  // Note num_group_count = 1 when group_count = NULL - perhaps could change
1522  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
1523  "kokkos_group_count", group_count ? num_group_count : 0);
1524  if(group_count) {
1525  for(int n = 0; n < num_group_count; ++n) {
1526  kokkos_group_count(n) = group_count[n];
1527  }
1528  }
1529 
1530  mj_partitioner.sequential_task_partitioning(
1531  env,
1532  this->no_procs,
1533  used_num_procs,
1534  num_parts,
1535  procdim,
1536  //minCoordDim,
1537  kokkos_pcoords,
1538  initial_selected_coords_output_permutation_pcoords,
1539  proc_xadj,
1540  recursion_depth,
1541  kokkos_partNoArray,
1542  proc_partition_along_longest_dim, // false
1543  this->num_ranks_per_node,
1544  this->divide_to_prime_first,
1545  num_group_count,
1546  kokkos_group_count);
1547  env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
1548 // comm_->barrier();
1549 // std::cout << "mj_partitioner.for procs over" << std::endl;
1550 // freeArray<pcoord_t *>(pcoords);
1551 
1552  Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
1553  initial_selected_coords_output_permutation_pcoords);
1554  for(int n = 0; n < this->no_procs; ++n) {
1555  proc_adjList[n] =
1556  host_initial_selected_coords_output_permutation_pcoords(n);
1557  }
1558 
1559  part_t *task_xadj = new part_t[num_parts + 1];
1560  part_t *task_adjList = new part_t[this->no_tasks];
1561 
1562  // Fill task_adjList st: task_adjList[i] <- i.
1563  fillContinousArray<part_t>(task_adjList,this->no_tasks, NULL);
1564 
1565  // Get the permutation order from the task permutation index.
1566  if (this->task_coord_dim <= 8)
1567  ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
1568  else
1569  ithPermutation<int>(8, myTaskPerm, permutation);
1570 
1571  // Reorder task coordinate dimensions.
1572  tcoord_t **tcoords = new tcoord_t*[this->task_coord_dim];
1573  for(int i = 0; i < this->task_coord_dim; ++i) {
1574  tcoords[i] = this->task_coords[permutation[i]];
1575  }
1576 
1577  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1578  Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
1579  kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
1580  auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
1581  for(int i = 0; i < this->task_coord_dim; ++i) {
1582  for(int j = 0; j < this->no_tasks; ++j) {
1583  host_kokkos_tcoords(j,i) = tcoords[i][j];
1584  }
1585  }
1586  Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);
1587 
1588  env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
1589 
1590  Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_tcoords(
1591  "initial_selected_coords_output_permutation_tcoords", this->no_tasks);
1592  typename Kokkos::View<part_t*, device_t>::HostMirror
1593  host_initial_selected_coords_output_permutation_tcoords =
1594  Kokkos::create_mirror_view(initial_selected_coords_output_permutation_tcoords);
1595  for(int n = 0; n < this->no_tasks; ++n) {
1596  host_initial_selected_coords_output_permutation_tcoords(n) =
1597  task_adjList[n];
1598  }
1599  Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
1600  host_initial_selected_coords_output_permutation_tcoords);
1601 
1602  //partitioning of tasks
1603  mj_partitioner.sequential_task_partitioning(
1604  env,
1605  this->no_tasks,
1606  this->no_tasks,
1607  num_parts,
1608  this->task_coord_dim,
1609  //minCoordDim,
1610  kokkos_tcoords,
1611  initial_selected_coords_output_permutation_tcoords,
1612  task_xadj,
1613  recursion_depth,
1614  kokkos_partNoArray,
1615  task_partition_along_longest_dim,
1616  this->num_ranks_per_node,
1617  this->divide_to_prime_first,
1618  num_group_count,
1619  kokkos_group_count);
1620  env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
1621 
1622  Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
1623  initial_selected_coords_output_permutation_tcoords);
1624  for(int n = 0; n < this->no_tasks; ++n) {
1625  task_adjList[n] =
1626  host_initial_selected_coords_output_permutation_tcoords(n);
1627  }
1628 
1629 // std::cout << "myrank:" << myRank << std::endl;
1630 // comm_->barrier();
1631 // std::cout << "mj_partitioner.sequential_task_partitioning over"
1632 // << std::endl;
1633 
1634  delete [] tcoords;
1635  delete [] permutation;
1636 
1637 
1638  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1639  for(part_t i = 0; i < num_parts; ++i) {
1640 
1641  part_t proc_index_begin = proc_xadj[i];
1642  part_t task_begin_index = task_xadj[i];
1643  part_t proc_index_end = proc_xadj[i + 1];
1644  part_t task_end_index = task_xadj[i + 1];
1645 
1646 
1647  if(proc_index_end - proc_index_begin != 1) {
1648  std::cerr << "Error at partitioning of processors" << std::endl;
1649  std::cerr << "PART:" << i << " is assigned to "
1650  << proc_index_end - proc_index_begin << " processors."
1651  << std::endl;
1652  std::terminate();
1653  }
1654  part_t assigned_proc = proc_adjList[proc_index_begin];
1655  proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1656  }
1657 
1658  //holds the pointer to the task array
1659  //convert proc_to_task_xadj to CSR index array
1660  part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
1661  part_t sum = 0;
1662  for(part_t i = 0; i < this->no_procs; ++i) {
1663  part_t tmp = proc_to_task_xadj[i];
1664  proc_to_task_xadj[i] = sum;
1665  sum += tmp;
1666  proc_to_task_xadj_work[i] = sum;
1667  }
1668  proc_to_task_xadj[this->no_procs] = sum;
1669 
1670  for(part_t i = 0; i < num_parts; ++i){
1671 
1672  part_t proc_index_begin = proc_xadj[i];
1673  part_t task_begin_index = task_xadj[i];
1674  part_t task_end_index = task_xadj[i + 1];
1675 
1676  part_t assigned_proc = proc_adjList[proc_index_begin];
1677 
1678  for (part_t j = task_begin_index; j < task_end_index; ++j) {
1679  part_t taskId = task_adjList[j];
1680 
1681  task_to_proc[taskId] = assigned_proc;
1682 
1683  proc_to_task_adj [--proc_to_task_xadj_work[assigned_proc]] = taskId;
1684  }
1685  }
1686 
1687 /*
1688  if (myPermutation == 0) {
1689  std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);
1690 
1691  for (part_t i = 0; i < num_parts; ++i) {
1692 
1693  part_t proc_index_begin = proc_xadj[i];
1694  part_t proc_index_end = proc_xadj[i + 1];
1695 
1696  if (proc_index_end - proc_index_begin != 1) {
1697  std::cerr << "Error at partitioning of processors" << std::endl;
1698  std::cerr << "PART:" << i << " is assigned to "
1699  << proc_index_end - proc_index_begin << " processors."
1700  << std::endl;
1701  exit(1);
1702  }
1703 
1704  part_t assigned_proc = proc_adjList[proc_index_begin];
1705  gnuPlotCode << "Rank:" << i << " "
1706  << this->proc_coords[0][assigned_proc] << " "
1707  << this->proc_coords[1][assigned_proc] << " "
1708  << this->proc_coords[2][assigned_proc] << " "
1709  << pcoords[0][assigned_proc] << " "
1710  << pcoords[1][assigned_proc] << " "
1711  << pcoords[2][assigned_proc] << " "
1712  << pcoords[3][assigned_proc] << std::endl;
1713  }
1714 
1715  gnuPlotCode << "Machine Extent:" << std::endl;
1716  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1717  for (part_t i = 0; i < num_parts; ++i) {
1718 
1719  part_t proc_index_begin = proc_xadj[i];
1720  part_t proc_index_end = proc_xadj[i + 1];
1721 
1722  if (proc_index_end - proc_index_begin != 1) {
1723  std::cerr << "Error at partitioning of processors" << std::endl;
1724  std::cerr << "PART:" << i << " is assigned to "
1725  << proc_index_end - proc_index_begin << " processors."
1726  << std::endl;
1727  exit(1);
1728  }
1729 
1730  part_t assigned_proc = proc_adjList[proc_index_begin];
1731  gnuPlotCode << "Rank:" << i << " "
1732  << this->proc_coords[0][assigned_proc] << " "
1733  << this->proc_coords[1][assigned_proc] << " "
1734  << this->proc_coords[2][assigned_proc] << std::endl;
1735  }
1736  gnuPlotCode.close();
1737  }
1738 */
1739 
1740  delete [] proc_to_task_xadj_work;
1741  delete [] task_xadj;
1742  delete [] task_adjList;
1743  delete [] proc_xadj;
1744  delete [] proc_adjList;
1745  }
1746 
1747 };
1748 
1749 template <typename Adapter, typename part_t>
1750 class CoordinateTaskMapper: public PartitionMapping<Adapter> {
1751 protected:
1752 
1753 #ifndef DOXYGEN_SHOULD_SKIP_THIS
1754 
1755  typedef typename Adapter::scalar_t pcoord_t;
1756  typedef typename Adapter::scalar_t tcoord_t;
1757  typedef typename Adapter::scalar_t scalar_t;
1758  typedef typename Adapter::lno_t lno_t;
1759 
1760 #if defined(KOKKOS_ENABLE_CUDA)
1761  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
1762  Kokkos::Cuda, Kokkos::CudaSpace>;
1763 #elif defined(KOKKOS_ENABLE_HIP)
1764  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
1765  Kokkos::HIP, Kokkos::HIPSpace>;
1766 #else
1767  using node_t = typename Adapter::node_t;
1768 #endif
1769 
1770 #endif
1771 
1772 // RCP<const Environment> env;
1773 
1774  // Holds the pointer to the task array
1775  ArrayRCP<part_t> proc_to_task_xadj;
1776 // = allocMemory<part_t> (this->no_procs + 1);
1777 
1778  // Holds the indices of tasks wrt to proc_to_task_xadj array.
1779  ArrayRCP<part_t> proc_to_task_adj;
1780 // = allocMemory<part_t>(this->no_tasks);
1781 
1782  // Holds the processors mapped to tasks.
1783  ArrayRCP<part_t> task_to_proc;
1784 // = allocMemory<part_t>(this->no_procs);
1785 
1786  // Holds the processors mapped to tasks.
1787  ArrayRCP<part_t> local_task_to_rank;
1788 // = allocMemory<part_t>(this->no_procs);
1789 
1794  ArrayRCP<part_t> task_communication_xadj;
1795  ArrayRCP<part_t> task_communication_adj;
1796  ArrayRCP<scalar_t> task_communication_edge_weight;
1797 
1798 
1802  void doMapping(int myRank,
1803  const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {
1804 
1805  if (this->proc_task_comm) {
1806  this->proc_task_comm->getMapping(
1807  myRank,
1808  this->env,
1809  // Holds the pointer to the task array
1810  this->proc_to_task_xadj,
1811  // Holds the indices of task wrt to proc_to_task_xadj array
1812  this->proc_to_task_adj,
1813  // Holds the processors mapped to tasks
1814  this->task_to_proc,
1815  comm_
1816  );
1817  }
1818  else {
1819  std::cerr << "communicationModel is not specified in the Mapper"
1820  << std::endl;
1821  exit(1);
1822  }
1823  }
1824 
1825 
1829  RCP<Comm<int> > create_subCommunicator() {
1830  int procDim = this->proc_task_comm->proc_coord_dim;
1831  int taskDim = this->proc_task_comm->task_coord_dim;
1832 
1833  // Number of permutations for tasks and processors
1834  int taskPerm = 1;
1835  int procPerm = 1;
1836 
1837  // Get number of different permutations for task dimension ordering
1838  if (taskDim <= 8)
1839  taskPerm = z2Fact<int>(taskDim);
1840  // Prevent overflow
1841  else
1842  taskPerm = z2Fact<int>(8);
1843 
1844  // Get number of different permutations for proc dimension ordering
1845  if (procDim <= 8)
1846  procPerm = z2Fact<int>(procDim);
1847  // Prevent overflow
1848  else
1849  procPerm = z2Fact<int>(8);
1850 
1851  // Total number of permutations
1852  int idealGroupSize = taskPerm * procPerm;
1853 
1854  // For the one that does longest dimension partitioning.
1855  idealGroupSize += taskPerm + procPerm + 1;
1856 
1857  int myRank = this->comm->getRank();
1858  int commSize = this->comm->getSize();
1859 
1860  int myGroupIndex = myRank / idealGroupSize;
1861 
1862  int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1863  if (prevGroupBegin < 0) prevGroupBegin = 0;
1864  int myGroupBegin = myGroupIndex * idealGroupSize;
1865  int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1866  int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1867 
1868  if (myGroupEnd > commSize) {
1869  myGroupBegin = prevGroupBegin;
1870  myGroupEnd = commSize;
1871  }
1872  if (nextGroupEnd > commSize) {
1873  myGroupEnd = commSize;
1874  }
1875  int myGroupSize = myGroupEnd - myGroupBegin;
1876 
1877  part_t *myGroup = new part_t[myGroupSize];
1878  for (int i = 0; i < myGroupSize; ++i) {
1879  myGroup[i] = myGroupBegin + i;
1880  }
1881 // std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
1882 // << " myGroupEnd:" << myGroupEnd << endl;
1883 
1884  ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1885 
1886  RCP<Comm<int> > subComm =
1887  this->comm->createSubcommunicator(myGroupView);
1888  delete [] myGroup;
1889  return subComm;
1890  }
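/* Illustrative sketch: idealGroupSize equals the permutation count
   used in getMapping() (49 for 3-D coordinates), so consecutive blocks
   of 49 ranks each explore the full permutation space independently;
   a trailing partial block is merged into the preceding group so the
   last group is never too small to cover all permutations.
*/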
1891 
1892 
1896  void getBestMapping() {
1897  //create the sub group.
1898  RCP<Comm<int> > subComm = this->create_subCommunicator();
1899  //calculate cost.
1900  double myCost = this->proc_task_comm->getCommunicationCostMetric();
1901 // std::cout << "me:" << this->comm->getRank() << " myCost:"
1902 // << myCost << std::endl;
1903  double localCost[2], globalCost[2];
1904 
1905  localCost[0] = myCost;
1906  localCost[1] = double(subComm->getRank());
1907 
1908  globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
1909  Teuchos::Zoltan2_ReduceBestMapping<int,double> reduceBest;
1910  reduceAll<int, double>(*subComm, reduceBest,
1911  2, localCost, globalCost);
1912 
1913  int sender = int(globalCost[1]);
1914 
1915 /*
1916  if ( this->comm->getRank() == 0) {
1917  std::cout << "me:" << localCost[1] <<
1918  " localcost:" << localCost[0]<<
1919  " bestcost:" << globalCost[0] <<
1920  " Sender:" << sender <<
1921  " procDim" << proc_task_comm->proc_coord_dim <<
1922  " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
1923  }
1924 */
1925 
1926 // std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
1927 // << " bestcost:" << globalCost[0] << endl;
1928 // std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
1929 // << endl;
1930  broadcast(*subComm, sender, this->ntasks,
1931  this->task_to_proc.getRawPtr());
1932  broadcast(*subComm, sender, this->nprocs,
1933  this->proc_to_task_xadj.getRawPtr());
1934  broadcast(*subComm, sender, this->ntasks,
1935  this->proc_to_task_adj.getRawPtr());
1936  }
1937 
1938  //write mapping to gnuPlot code to visualize.
1939  void writeMapping() {
1940  std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
1941 
1942  int mindim = MINOF(proc_task_comm->proc_coord_dim,
1943  proc_task_comm->task_coord_dim);
1944  std::string ss = "";
1945  for (part_t i = 0; i < this->nprocs; ++i) {
1946 
1947  std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
1948  if (i == 0) {
1949  gnuPlotCode << "plot \"" << procFile << "\"\n";
1950  }
1951  else {
1952  gnuPlotCode << "replot \"" << procFile << "\"\n";
1953  }
1954 
1955  std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1956 
1957  std::string gnuPlotArrow = "set arrow from ";
1958  for (int j = 0; j < mindim; ++j) {
1959  if (j == mindim - 1) {
1960  inpFile << proc_task_comm->proc_coords[j][i];
1961  gnuPlotArrow +=
1962  Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
1963 
1964  }
1965  else {
1966  inpFile << proc_task_comm->proc_coords[j][i] << " ";
1967  gnuPlotArrow +=
1968  Teuchos::toString<float>(proc_task_comm->
1969  proc_coords[j][i]) + ",";
1970  }
1971  }
1972  gnuPlotArrow += " to ";
1973 
1974  inpFile << std::endl;
1975  ArrayView<part_t> a = this->getAssignedTasksForProc(i);
1976 
1977  for (int k = 0; k < a.size(); ++k) {
1978  int j = a[k];
1979 // std::cout << "i:" << i << " j:"
1980  std::string gnuPlotArrow2 = gnuPlotArrow;
1981  for (int z = 0; z < mindim; ++z) {
1982  if (z == mindim - 1) {
1983 
1984 // std::cout << "z:" << z << " j:" << j << " "
1985 // << proc_task_comm->task_coords[z][j] << endl;
1986  inpFile << proc_task_comm->task_coords[z][j];
1987  gnuPlotArrow2 +=
1988  Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
1989  }
1990  else {
1991  inpFile << proc_task_comm->task_coords[z][j] << " ";
1992  gnuPlotArrow2 +=
1993  Teuchos::toString<float>(proc_task_comm->
1994  task_coords[z][j]) + ",";
1995  }
1996  }
1997  ss += gnuPlotArrow2 + "\n";
1998  inpFile << std::endl;
1999  }
2000  inpFile.close();
2001  }
2002  gnuPlotCode << ss;
2003  gnuPlotCode << "\nreplot\n pause -1 \n";
2004  gnuPlotCode.close();
2005  }
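 // The generated gnuPlot.plot script has roughly this shape
 // (illustrative output, not captured from a real run):
 //   plot "0_mapping.txt"
 //   replot "1_mapping.txt"
 //   set arrow from 0,0,0 to 1,2,0
 //   replot
 //   pause -1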
2006 
2007  // Write the mapping as gnuplot commands for visualization.
2008  void writeMapping2(int myRank) {
2009  std::string rankStr = Teuchos::toString<int>(myRank);
2010  std::string gnuPlots = "gnuPlot", extensionS = ".plot";
2011  std::string outF = gnuPlots + rankStr + extensionS;
2012  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
2013 
2014  CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
2015  *tmpproc_task_comm =
2016  static_cast <CoordinateCommunicationModel<
2017  pcoord_t, tcoord_t, part_t, node_t> * > (
2018  proc_task_comm);
2019 
2020 // int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
2021 // tmpproc_task_comm->task_coord_dim);
2022  int mindim = tmpproc_task_comm->proc_coord_dim;
2023  if (mindim != 3) {
2024  std::cerr << "Mapping write is only supported for 3 dimensions" << std::endl;
2025  return;
2026  }
2027  std::string ss = "";
2028  std::string procs = "";
2029 
2030  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2031  for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
2032  ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
2033  if (a.size() == 0) {
2034  continue;
2035  }
2036 
2037  std::string gnuPlotArrow = "set arrow from ";
2038  for (int j = 0; j < mindim; ++j) {
2039  if (j == mindim - 1) {
2040  gnuPlotArrow +=
2041  Teuchos::toString<float>(tmpproc_task_comm->
2042  proc_coords[j][origin_rank]);
2043  procs +=
2044  Teuchos::toString<float>(tmpproc_task_comm->
2045  proc_coords[j][origin_rank]);
2046 
2047  }
2048  else {
2049  gnuPlotArrow +=
2050  Teuchos::toString<float>(tmpproc_task_comm->
2051  proc_coords[j][origin_rank]) + ",";
2052  procs +=
2053  Teuchos::toString<float>(tmpproc_task_comm->
2054  proc_coords[j][origin_rank])+ " ";
2055  }
2056  }
2057  procs += "\n";
2058 
2059  gnuPlotArrow += " to ";
2060 
2061 
2062  for (int k = 0; k < a.size(); ++k) {
2063  int origin_task = a[k];
2064 
2065  for (int nind = task_communication_xadj[origin_task];
2066  nind < task_communication_xadj[origin_task + 1]; ++nind) {
2067  int neighbor_task = task_communication_adj[nind];
2068 
2069  bool differentnode = false;
2070 
2071  int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2072 
2073  for (int j = 0; j < mindim; ++j) {
2074  if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2075  int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2076  differentnode = true; break;
2077  }
2078  }
2079  std::tuple<int,int,int, int, int, int> foo(
2080  int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2081  int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2082  int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2083  int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2084  int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2085  int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2086 
2087 
2088  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2089  my_arrows.insert(foo);
2090 
2091  std::string gnuPlotArrow2 = "";
2092  for (int j = 0; j < mindim; ++j) {
2093  if (j == mindim - 1) {
2094  gnuPlotArrow2 +=
2095  Teuchos::toString<float>(tmpproc_task_comm->
2096  proc_coords[j][neighbor_rank]);
2097  }
2098  else {
2099  gnuPlotArrow2 +=
2100  Teuchos::toString<float>(tmpproc_task_comm->
2101  proc_coords[j][neighbor_rank]) + ",";
2102  }
2103  }
2104  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2105  }
2106  }
2107  }
2108  }
2109 
2110  std::ofstream procFile("procPlot.plot", std::ofstream::out);
2111  procFile << procs << "\n";
2112  procFile.close();
2113 
2114  //gnuPlotCode << ss;
2115  if (mindim == 2) {
2116  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2117  } else {
2118  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2119  }
2120 
2121  gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2122  gnuPlotCode.close();
2123  }
2124 
2125 
2126 // KDD Need to provide access to algorithm for getPartBoxes
2127 #ifdef gnuPlot
2128  void writeGnuPlot(
2129  const Teuchos::Comm<int> *comm_,
2130  const Teuchos::RCP<const Zoltan2::PartitioningSolution<Adapter> > soln_,
2131  int coordDim,
2132  tcoord_t **partCenters) {
2133 
2134  std::string file = "gggnuPlot";
2135  std::string exten = ".plot";
2136  std::ofstream mm("2d.txt");
2137  file += Teuchos::toString<int>(comm_->getRank()) + exten;
2138  std::ofstream ff(file.c_str());
2139 // ff.seekg(0, ff.end);
2140  std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2141  outPartBoxes =
2143  getPartBoxesView();
2144 
2145  for (part_t i = 0; i < this->ntasks;++i) {
2146  outPartBoxes[i].writeGnuPlot(ff, mm);
2147  }
2148  if (coordDim == 2) {
2149  ff << "plot \"2d.txt\"" << std::endl;
2150 // ff << "\n pause -1" << endl;
2151  }
2152  else {
2153  ff << "splot \"2d.txt\"" << std::endl;
2154 // ff << "\n pause -1" << endl;
2155  }
2156  mm.close();
2157 
2158  ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2159  << std::endl;
2160 
2161  for (part_t i = 0; i < this->ntasks; ++i) {
2162  part_t pb = task_communication_xadj[i];
2163  part_t pe = task_communication_xadj[i + 1];
2164 
2165  for (part_t p = pb; p < pe; ++p) {
2166  part_t n = task_communication_adj[p];
2167 
2168 // std::cout << "i:" << i << " n:" << n << endl;
2169  std::string arrowline = "set arrow from ";
2170  for (int j = 0; j < coordDim - 1; ++j) {
2171  arrowline +=
2172  Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2173  }
2174  arrowline +=
2175  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2176  " to ";
2177 
2178  for (int j = 0; j < coordDim - 1; ++j) {
2179  arrowline +=
2180  Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2181  }
2182  arrowline +=
2183  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2184  " as 5\n";
2185 
2186 // std::cout << "arrow:" << arrowline << endl;
2187  ff << arrowline;
2188  }
2189  }
2190 
2191  ff << "replot\n pause -1" << std::endl;
2192  ff.close();
2193  }
2194 #endif // gnuPlot
2195 
2196 public:
2197 
2198  void getProcTask(part_t* &proc_to_task_xadj_,
2199  part_t* &proc_to_task_adj_) {
2200  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2201  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2202  }
2203 
2204  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2205 
2206  // Mapping was already computed in the constructor; we need to store it
2207  // in the solution.
2208  mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2209 
2210  // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2211  // KDDKDD is not needed once we use MappingSolution to answer queries
2212  // KDDKDD instead of this algorithm.
2213  // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2214  // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2215  }
2216 
2217 
2218  virtual ~CoordinateTaskMapper() {
2219  //freeArray<part_t>(proc_to_task_xadj);
2220  //freeArray<part_t>(proc_to_task_adj);
2221  //freeArray<part_t>(task_to_proc);
2222  if(this->isOwnerofModel) {
2223  delete this->proc_task_comm;
2224  }
2225  }
2226 
2227  void create_local_task_to_rank(
2228  const lno_t num_local_coords,
2229  const part_t *local_coord_parts,
2230  const ArrayRCP<part_t> task_to_proc_) {
2231  local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2232 
2233  for (lno_t i = 0; i < num_local_coords; ++i) {
2234  part_t local_coord_part = local_coord_parts[i];
2235  part_t rank_index = task_to_proc_[local_coord_part];
2236  local_task_to_rank[i] = rank_index;
2237  }
2238  }
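 // Tiny example (illustrative): if the local coordinates belong to parts
 // {2, 0, 2} and task_to_proc = {5, 7, 1, ...}, then local_task_to_rank
 // becomes {1, 5, 1}: each local element is tagged with the rank that
 // owns its part.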
2239 
2240 
2241 
2253  /*! \brief Constructor. The mapping is performed here, using the
2254  given partitioning solution to compute the communication metric. */
2255  CoordinateTaskMapper(
2256  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2257  const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2258  machine_,
2259  const Teuchos::RCP <const Adapter> input_adapter_,
2260  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2261  soln_,
2262  const Teuchos::RCP <const Environment> envConst,
2263  bool is_input_adapter_distributed = true,
2264  int num_ranks_per_node = 1,
2265  bool divide_to_prime_first = false,
2266  bool reduce_best_mapping = true):
2267  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2268  soln_, envConst),
2269  proc_to_task_xadj(0),
2270  proc_to_task_adj(0),
2271  task_to_proc(0),
2272  isOwnerofModel(true),
2273  proc_task_comm(0),
2274  task_communication_xadj(0),
2275  task_communication_adj(0),
2276  task_communication_edge_weight(0) {
2277 
2278  using namespace Teuchos;
2279  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2280 
2281  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2282  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2283 
2284  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2285  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2286  if (!is_input_adapter_distributed) {
2287  ia_comm = Teuchos::createSerialComm<int>();
2288  }
2289 
2290  RCP<const Environment> envConst_ = envConst;
2291 
2292  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2293  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2294  input_adapter_.getRawPtr()), false));
2295 
2296  modelFlag_t coordFlags_, graphFlags_;
2297 
2298  //create coordinate model
2299  //since this is coordinate task mapper,
2300  //the adapter has to have the coordinates
2301  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2302  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2303 
2304  //if the adapter has also graph model, we will use graph model
2305  //to calculate the cost mapping.
2306  BaseAdapterType inputType_ = input_adapter_->adapterType();
2307  if (inputType_ == MatrixAdapterType ||
2308  inputType_ == GraphAdapterType ||
2309  inputType_ == MeshAdapterType)
2310  {
2311  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2312  baseInputAdapter_, envConst_, ia_comm,
2313  graphFlags_));
2314  }
2315 
2316  if (!machine_->hasMachineCoordinates()) {
2317  throw std::runtime_error("Existing machine does not provide "
2318  "coordinates for coordinate task mapping");
2319  }
2320 
2321  //if mapping type is 0 then it is coordinate mapping
2322  int procDim = machine_->getMachineDim();
2323  this->nprocs = machine_->getNumRanks();
2324 
2325  //get processor coordinates.
2326  pcoord_t **procCoordinates = NULL;
2327  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2328  throw std::runtime_error("Existing machine does not implement "
2329  "getAllMachineCoordinatesView");
2330  }
2331 
2332  //get the machine extent.
2333  //if we have the machine extent and the machine has wrap-around
2334  //links, we would like to shift the coordinates so that the
2335  //largest gap falls on the wrap-around link.
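 // Shift example (illustrative): with extent 8 along a dimension and
 // allocated coordinates {0, 1, 6, 7}, the largest gap lies between 1
 // and 6, so coordinates below the shift border 6 are increased by the
 // extent, giving {8, 9, 6, 7}; the allocation is then contiguous
 // across the wrap-around link. See shiftMachineCoordinates below.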
2336  std::vector<int> machine_extent_vec(procDim);
2337  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2338  int *machine_extent = &(machine_extent_vec[0]);
2339  bool *machine_extent_wrap_around = new bool[procDim];
2340  for (int i = 0; i < procDim; ++i)
2341  machine_extent_wrap_around[i] = false;
2342 
2343  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2344 
2345  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2346  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2347  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2348  // MD: Yes, I ADDED BELOW:
2349  if (machine_->getMachineExtent(machine_extent) &&
2350  haveWrapArounds) {
2351 
2352  procCoordinates =
2353  this->shiftMachineCoordinates(
2354  procDim,
2355  machine_extent,
2356  machine_extent_wrap_around,
2357  this->nprocs,
2358  procCoordinates);
2359  }
2360 
2361  //get the tasks information, such as coordinate dimension,
2362  //number of parts.
2363  int coordDim = coordinateModel_->getCoordinateDim();
2364 
2365 // int coordDim = machine_->getMachineDim();
2366 
2367  this->ntasks = soln_->getActualGlobalNumberOfParts();
2368  if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2369  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2370  }
2371  this->solution_parts = soln_->getPartListView();
2372 
2373  //we need to calculate the center of parts.
2374  tcoord_t **partCenters = new tcoord_t*[coordDim];
2375  for (int i = 0; i < coordDim; ++i) {
2376  partCenters[i] = new tcoord_t[this->ntasks];
2377  }
2378 
2379  typedef typename Adapter::scalar_t t_scalar_t;
2380 
2381 
2382  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2383 
2384  //get centers for the parts.
2385  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2386  envConst.getRawPtr(),
2387  ia_comm.getRawPtr(),
2388  coordinateModel_.getRawPtr(),
2389  this->solution_parts,
2390 // soln_->getPartListView();
2391 // this->soln.getRawPtr(),
2392  coordDim,
2393  ntasks,
2394  partCenters);
2395 
2396  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2397 
2398  //create the part graph
2399  if (graph_model_.getRawPtr() != NULL) {
2400  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2401  envConst.getRawPtr(),
2402  ia_comm.getRawPtr(),
2403  graph_model_.getRawPtr(),
2404  this->ntasks,
2405  this->solution_parts,
2406 // soln_->getPartListView(),
2407 // this->soln.getRawPtr(),
2408  task_communication_xadj,
2409  task_communication_adj,
2410  task_communication_edge_weight
2411  );
2412  }
2413 
2414  //create coordinate communication model.
2415  this->proc_task_comm = new CoordinateCommunicationModel<
2416  pcoord_t, tcoord_t, part_t, node_t>(
2417  procDim,
2418  procCoordinates,
2419  coordDim,
2420  partCenters,
2421  this->nprocs,
2422  this->ntasks,
2423  machine_extent,
2424  machine_extent_wrap_around,
2425  machine_.getRawPtr());
2426 
2427  int myRank = comm_->getRank();
2428  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2429  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2430 
2431  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2432  this->doMapping(myRank, comm_);
2433  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2434 
2435  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2436 
2437 /*
2438  soln_->getCommunicationGraph(task_communication_xadj,
2439  task_communication_adj);
2440 */
2441 
2442  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2443  #ifdef gnuPlot1
2444  if (comm_->getRank() == 0) {
2445 
2446  part_t taskCommCount = task_communication_xadj.size() - 1;
2447  std::cout << " TotalComm:"
2448  << task_communication_xadj[taskCommCount] << std::endl;
2449  part_t maxN = task_communication_xadj[0];
2450  for (part_t i = 1; i <= taskCommCount; ++i) {
2451  part_t nc =
2452  task_communication_xadj[i] - task_communication_xadj[i - 1];
2453  if (maxN < nc)
2454  maxN = nc;
2455  }
2456  std::cout << " maxNeighbor:" << maxN << std::endl;
2457  }
2458 
2459  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2460  #endif
2461 
2462  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2463 
2464  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2465  task_communication_adj.getRawPtr()) {
2466  this->proc_task_comm->calculateCommunicationCost(
2467  task_to_proc.getRawPtr(),
2468  task_communication_xadj.getRawPtr(),
2469  task_communication_adj.getRawPtr(),
2470  task_communication_edge_weight.getRawPtr()
2471  );
2472  }
2473 
2474 // std::cout << "me: " << comm_->getRank() << " cost:"
2475 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2476 
2477  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2478 
2479  //processors are divided into groups of size procDim! * coordDim!
2480  //each processor in the group obtains a mapping with a different
2481  //rotation, and the best one is broadcast to all processors.
2482  this->getBestMapping();
2483  this->create_local_task_to_rank(
2484  coordinateModel_->getLocalNumCoordinates(),
2485  this->solution_parts,
2486  this->task_to_proc);
2487 /*
2488  {
2489  if (task_communication_xadj.getRawPtr() &&
2490  task_communication_adj.getRawPtr())
2491  this->proc_task_comm->calculateCommunicationCost(
2492  task_to_proc.getRawPtr(),
2493  task_communication_xadj.getRawPtr(),
2494  task_communication_adj.getRawPtr(),
2495  task_communication_edge_weight.getRawPtr()
2496  );
2497  std::cout << "me: " << comm_->getRank() << " cost:"
2498  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2499  }
2500 */
2501 
2502 
2503  #ifdef gnuPlot
2504  this->writeMapping2(comm_->getRank());
2505  #endif
2506 
2507  delete [] machine_extent_wrap_around;
2508 
2509  if (machine_->getMachineExtent(machine_extent) &&
2510  haveWrapArounds) {
2511  for (int i = 0; i < procDim; ++i) {
2512  delete [] procCoordinates[i];
2513  }
2514  delete [] procCoordinates;
2515  }
2516 
2517  for (int i = 0; i < coordDim; ++i) {
2518  delete [] partCenters[i];
2519  }
2520  delete [] partCenters;
2521 
2522 
2523  }
2524 
2525 
2540  /*! \brief Constructor. Instead of a Solution object, this overload
2541  takes the number of parts and the result parts array directly. */
2542  CoordinateTaskMapper(
2543  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2544  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2545  machine_,
2546  const Teuchos::RCP <const Adapter> input_adapter_,
2547  const part_t num_parts_,
2548  const part_t *result_parts,
2549  const Teuchos::RCP <const Environment> envConst,
2550  bool is_input_adapter_distributed = true,
2551  int num_ranks_per_node = 1,
2552  bool divide_to_prime_first = false,
2553  bool reduce_best_mapping = true):
2554  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2555  num_parts_, result_parts, envConst),
2556  proc_to_task_xadj(0),
2557  proc_to_task_adj(0),
2558  task_to_proc(0),
2559  isOwnerofModel(true),
2560  proc_task_comm(0),
2561  task_communication_xadj(0),
2562  task_communication_adj(0),
2563  task_communication_edge_weight(0) {
2564 
2565  using namespace Teuchos;
2566  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2567 
2568  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2569  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2570 
2571  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2572  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2573  if (!is_input_adapter_distributed) {
2574  ia_comm = Teuchos::createSerialComm<int>();
2575  }
2576  RCP<const Environment> envConst_ = envConst;
2577 
2578  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2579  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2580  input_adapter_.getRawPtr()), false));
2581 
2582  modelFlag_t coordFlags_, graphFlags_;
2583 
2584  //create coordinate model
2585  //since this is coordinate task mapper,
2586  //the adapter has to have the coordinates
2587  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2588  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2589 
2590  //if the adapter has also graph model, we will use graph model
2591  //to calculate the cost mapping.
2592  BaseAdapterType inputType_ = input_adapter_->adapterType();
2593  if (inputType_ == MatrixAdapterType ||
2594  inputType_ == GraphAdapterType ||
2595  inputType_ == MeshAdapterType)
2596  {
2597  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2598  baseInputAdapter_, envConst_, ia_comm,
2599  graphFlags_));
2600  }
2601 
2602  if (!machine_->hasMachineCoordinates()) {
2603  throw std::runtime_error("Existing machine does not provide "
2604  "coordinates for coordinate task mapping.");
2605  }
2606 
2607  //if mapping type is 0 then it is coordinate mapping
2608  int procDim = machine_->getMachineDim();
2609  this->nprocs = machine_->getNumRanks();
2610 
2611  //get processor coordinates.
2612  pcoord_t **procCoordinates = NULL;
2613  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2614  throw std::runtime_error("Existing machine does not implement "
2615  "getAllMachineCoordinatesView");
2616  }
2617 
2618  //get the machine extent.
2619  //if we have the machine extent and the machine has wrap-around
2620  //links, we would like to shift the coordinates so that the
2621  //largest gap falls on the wrap-around link.
2623  std::vector<int> machine_extent_vec(procDim);
2624 // std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2625  int *machine_extent = &(machine_extent_vec[0]);
2626  bool *machine_extent_wrap_around = new bool[procDim];
2627  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2628 
2629  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2630  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2631  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2632  // MD: Yes, I ADDED BELOW:
2633  if (machine_->getMachineExtent(machine_extent) &&
2634  haveWrapArounds) {
2635  procCoordinates =
2636  this->shiftMachineCoordinates(
2637  procDim,
2638  machine_extent,
2639  machine_extent_wrap_around,
2640  this->nprocs,
2641  procCoordinates);
2642  }
2643 
2644  //get the tasks information, such as coordinate dimension,
2645  //number of parts.
2646  int coordDim = coordinateModel_->getCoordinateDim();
2647 
2648 // int coordDim = machine_->getMachineDim();
2649 
2650 
2651  this->ntasks = num_parts_;
2652  this->solution_parts = result_parts;
2653 
2654  //we need to calculate the center of parts.
2655  tcoord_t **partCenters = new tcoord_t*[coordDim];
2656  for (int i = 0; i < coordDim; ++i) {
2657  partCenters[i] = new tcoord_t[this->ntasks];
2658  }
2659 
2660  typedef typename Adapter::scalar_t t_scalar_t;
2661 
2662 
2663  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2664 
2665  //get centers for the parts.
2666  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2667  envConst.getRawPtr(),
2668  ia_comm.getRawPtr(),
2669  coordinateModel_.getRawPtr(),
2670  this->solution_parts,
2671 // soln_->getPartListView();
2672 // this->soln.getRawPtr(),
2673  coordDim,
2674  ntasks,
2675  partCenters);
2676 
2677  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2678 
2679  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2680  //create the part graph
2681  if (graph_model_.getRawPtr() != NULL) {
2682  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2683  envConst.getRawPtr(),
2684  ia_comm.getRawPtr(),
2685  graph_model_.getRawPtr(),
2686  this->ntasks,
2687  this->solution_parts,
2688 // soln_->getPartListView(),
2689 // this->soln.getRawPtr(),
2690  task_communication_xadj,
2691  task_communication_adj,
2692  task_communication_edge_weight
2693  );
2694  }
2695  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2696 
2697  envConst->timerStart(MACRO_TIMERS,
2698  "CoordinateCommunicationModel Create");
2699  //create coordinate communication model.
2700  this->proc_task_comm = new CoordinateCommunicationModel<
2701  pcoord_t, tcoord_t, part_t, node_t>(
2702  procDim,
2703  procCoordinates,
2704  coordDim,
2705  partCenters,
2706  this->nprocs,
2707  this->ntasks,
2708  machine_extent,
2709  machine_extent_wrap_around,
2710  machine_.getRawPtr());
2711 
2712  envConst->timerStop(MACRO_TIMERS,
2713  "CoordinateCommunicationModel Create");
2714 
2715 
2716  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2717  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2718 
2719  int myRank = comm_->getRank();
2720 
2721 
2722  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2723  this->doMapping(myRank, comm_);
2724  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2725 
2726 
2727  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2728 
2729 /*
2730  soln_->getCommunicationGraph(task_communication_xadj,
2731  task_communication_adj);
2732 */
2733 
2734  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2735  #ifdef gnuPlot1
2736  if (comm_->getRank() == 0) {
2737 
2738  part_t taskCommCount = task_communication_xadj.size() - 1;
2739  std::cout << " TotalComm:"
2740  << task_communication_xadj[taskCommCount] << std::endl;
2741  part_t maxN = task_communication_xadj[0];
2742  for (part_t i = 1; i <= taskCommCount; ++i) {
2743  part_t nc =
2744  task_communication_xadj[i] - task_communication_xadj[i - 1];
2745  if (maxN < nc)
2746  maxN = nc;
2747  }
2748  std::cout << " maxNeighbor:" << maxN << std::endl;
2749  }
2750 
2751  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2752  #endif
2753 
2754  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2755 
2756  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2757  task_communication_adj.getRawPtr()) {
2758  this->proc_task_comm->calculateCommunicationCost(
2759  task_to_proc.getRawPtr(),
2760  task_communication_xadj.getRawPtr(),
2761  task_communication_adj.getRawPtr(),
2762  task_communication_edge_weight.getRawPtr()
2763  );
2764  }
2765 
2766 // std::cout << "me: " << comm_->getRank() << " cost:"
2767 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2768 
2769  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2770 
2771  //processors are divided into groups of size procDim! * coordDim!
2772  //each processor in the group obtains a mapping with a different rotation,
2773  //and the best one is broadcast to all processors.
2774  this->getBestMapping();
2775 
2776  this->create_local_task_to_rank(
2777  coordinateModel_->getLocalNumCoordinates(),
2778  this->solution_parts,
2779  this->task_to_proc);
2780 /*
2781  {
2782  if (task_communication_xadj.getRawPtr() &&
2783  task_communication_adj.getRawPtr())
2784  this->proc_task_comm->calculateCommunicationCost(
2785  task_to_proc.getRawPtr(),
2786  task_communication_xadj.getRawPtr(),
2787  task_communication_adj.getRawPtr(),
2788  task_communication_edge_weight.getRawPtr()
2789  );
2790  std::cout << "me: " << comm_->getRank() << " cost:"
2791  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2792  }
2793 */
2794 
2795 
2796 
2797  #ifdef gnuPlot
2798  this->writeMapping2(comm_->getRank());
2799  #endif
2800 
2801  delete [] machine_extent_wrap_around;
2802 
2803  if (machine_->getMachineExtent(machine_extent) &&
2804  haveWrapArounds) {
2805  for (int i = 0; i < procDim; ++i) {
2806  delete [] procCoordinates[i];
2807  }
2808  delete [] procCoordinates;
2809  }
2810 
2811  for (int i = 0; i < coordDim; ++i) {
2812  delete [] partCenters[i];
2813  }
2814  delete [] partCenters;
2815  }
2816 
2880  /*! \brief Constructor. Performs the mapping directly from raw machine
2881  and task coordinate arrays, without an input adapter or solution. */
2882  CoordinateTaskMapper(
2883  const Environment *env_const_,
2884  const Teuchos::Comm<int> *problemComm,
2885  int proc_dim,
2886  int num_processors,
2887  pcoord_t **machine_coords,
2888  int task_dim,
2889  part_t num_tasks,
2890  tcoord_t **task_coords,
2891  ArrayRCP<part_t>task_comm_xadj,
2892  ArrayRCP<part_t>task_comm_adj,
2893  pcoord_t *task_communication_edge_weight_,
2894  int recursion_depth,
2895  Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
2896  const part_t *machine_dimensions,
2897  int num_ranks_per_node = 1,
2898  bool divide_to_prime_first = false,
2899  bool reduce_best_mapping = true):
2900  PartitionMapping<Adapter>(
2901  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2902  Teuchos::rcpFromRef<const Environment>(*env_const_)),
2903  proc_to_task_xadj(0),
2904  proc_to_task_adj(0),
2905  task_to_proc(0),
2906  isOwnerofModel(true),
2907  proc_task_comm(0),
2908  task_communication_xadj(task_comm_xadj),
2909  task_communication_adj(task_comm_adj) {
2910 
2911  //if mapping type is 0 then it is coordinate mapping
2912  pcoord_t ** virtual_machine_coordinates = machine_coords;
2913  bool *wrap_arounds = new bool [proc_dim];
2914  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2915 
2916  if (machine_dimensions) {
2917  virtual_machine_coordinates =
2918  this->shiftMachineCoordinates(
2919  proc_dim,
2920  machine_dimensions,
2921  wrap_arounds,
2922  num_processors,
2923  machine_coords);
2924  }
2925 
2926  this->nprocs = num_processors;
2927 
2928  int coordDim = task_dim;
2929  this->ntasks = num_tasks;
2930 
2931  //alloc memory for part centers.
2932  tcoord_t **partCenters = task_coords;
2933 
2934  //create coordinate communication model.
2935  this->proc_task_comm =
2937  proc_dim,
2938  virtual_machine_coordinates,
2939  coordDim,
2940  partCenters,
2941  this->nprocs,
2942  this->ntasks, NULL, NULL
2943  );
2944 
2945  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2946  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2947 
2948  this->proc_task_comm->setPartArraySize(recursion_depth);
2949  this->proc_task_comm->setPartArray(part_no_array);
2950 
2951  int myRank = problemComm->getRank();
2952 
2953  this->doMapping(myRank, this->comm);
2954 #ifdef gnuPlot
2955  this->writeMapping2(myRank);
2956 #endif
2957 
2958  // MDM added this edge case - for example, if NX = NY = NZ = 1 the
2959  // communication arrays are empty, so skip cost calculation and reduction.
2960  if (reduce_best_mapping && task_communication_xadj.size() &&
2961  task_communication_adj.size()) {
2962  this->proc_task_comm->calculateCommunicationCost(
2963  task_to_proc.getRawPtr(),
2964  task_communication_xadj.getRawPtr(),
2965  task_communication_adj.getRawPtr(),
2966  task_communication_edge_weight_
2967  );
2968 
2969 
2970  this->getBestMapping();
2971 
2972 /*
2973  if (myRank == 0) {
2974  this->proc_task_comm->calculateCommunicationCost(
2975  task_to_proc.getRawPtr(),
2976  task_communication_xadj.getRawPtr(),
2977  task_communication_adj.getRawPtr(),
2978  task_communication_edge_weight_
2979  );
2980  cout << "me: " << problemComm->getRank() << " cost:"
2981  << this->proc_task_comm->getCommunicationCostMetric() << endl;
2982  }
2983 */
2984 
2985  }
2986 
2987  delete [] wrap_arounds;
2988 
2989  if (machine_dimensions) {
2990  for (int i = 0; i < proc_dim; ++i) {
2991  delete [] virtual_machine_coordinates[i];
2992  }
2993  delete [] virtual_machine_coordinates;
2994  }
2995 #ifdef gnuPlot
2996  if (problemComm->getRank() == 0)
2997  this->writeMapping2(-1);
2998 #endif
2999  }
3000 
3001 
3002  /*
3003  double getCommunicationCostMetric() {
3004  return this->proc_task_comm->getCommCost();
3005  }
3006  */
3007 
3008  /*! \brief Returns the number of parts to be assigned to this process.
3009  */
3010  virtual size_t getLocalNumberOfParts() const {
3011  return 0;
3012  }
3013 
3022  /*! \brief Using the machine dimensions provided, create virtual machine
3023  coordinates by assigning the largest gap to the wrap-around link. */
3024  pcoord_t **shiftMachineCoordinates(
3025  int machine_dim,
3026  const part_t *machine_dimensions,
3027  bool *machine_extent_wrap_around,
3028  part_t numProcs,
3029  pcoord_t **mCoords) {
3030 
3031  pcoord_t **result_machine_coords = NULL;
3032  result_machine_coords = new pcoord_t*[machine_dim];
3033 
3034  for (int i = 0; i < machine_dim; ++i) {
3035  result_machine_coords[i] = new pcoord_t [numProcs];
3036  }
3037 
3038  for (int i = 0; i < machine_dim; ++i) {
3039  part_t numMachinesAlongDim = machine_dimensions[i];
3040 
3041  part_t *machineCounts = new part_t[numMachinesAlongDim];
3042  memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3043 
3044  int *filledCoordinates = new int[numMachinesAlongDim];
3045 
3046  pcoord_t *coords = mCoords[i];
3047 
3048  for (part_t j = 0; j < numProcs; ++j) {
3049  part_t mc = (part_t) coords[j];
3050  ++machineCounts[mc];
3051  }
3052 
3053  part_t filledCoordinateCount = 0;
3054  for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3055  if (machineCounts[j] > 0) {
3056  filledCoordinates[filledCoordinateCount++] = j;
3057  }
3058  }
3059 
3060  part_t firstProcCoord = filledCoordinates[0];
3061  part_t firstProcCount = machineCounts[firstProcCoord];
3062 
3063  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3064  part_t lastProcCount = machineCounts[lastProcCoord];
3065 
3066  part_t firstLastGap =
3067  numMachinesAlongDim - lastProcCoord + firstProcCoord;
3068  part_t firstLastGapProc = lastProcCount + firstProcCount;
3069 
3070  part_t leftSideProcCoord = firstProcCoord;
3071  part_t leftSideProcCount = firstProcCount;
3072  part_t biggestGap = 0;
3073  part_t biggestGapProc = numProcs;
3074 
3075  part_t shiftBorderCoordinate = -1;
3076  for (part_t j = 1; j < filledCoordinateCount; ++j) {
3077  part_t rightSideProcCoord= filledCoordinates[j];
3078  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3079 
3080  part_t gap = rightSideProcCoord - leftSideProcCoord;
3081  part_t gapProc = rightSideProcCount + leftSideProcCount;
3082 
3083  // Pick the largest gap in this dimension. Break ties by choosing
3084  // the gap with fewer processes on either side. An easy extension
3085  // would be to weight the gap by the number of processes.
3087  if (gap > biggestGap ||
3088  (gap == biggestGap && biggestGapProc > gapProc)) {
3089  shiftBorderCoordinate = rightSideProcCoord;
3090  biggestGapProc = gapProc;
3091  biggestGap = gap;
3092  }
3093  leftSideProcCoord = rightSideProcCoord;
3094  leftSideProcCount = rightSideProcCount;
3095  }
3096 
3097 
3098  if (!(biggestGap > firstLastGap ||
3099  (biggestGap == firstLastGap &&
3100  biggestGapProc < firstLastGapProc))) {
3101  shiftBorderCoordinate = -1;
3102  }
3103 
3104  for (part_t j = 0; j < numProcs; ++j) {
3105 
3106  if (machine_extent_wrap_around[i] &&
3107  coords[j] < shiftBorderCoordinate) {
3108  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3109 
3110  }
3111  else {
3112  result_machine_coords[i][j] = coords[j];
3113  }
3114  }
3115  delete [] machineCounts;
3116  delete [] filledCoordinates;
3117  }
3118 
3119  return result_machine_coords;
3120 
3121  }
3122 
3128  /*! \brief Returns the assigned processor for the given task.
3129  */
3130  virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3131  part_t *&procs) const {
3132  numProcs = 1;
3133  procs = this->task_to_proc.getRawPtr() + taskId;
3134  }
3135 
3138  /*! \brief Returns the assigned processor id for the given task.
3139  */
3140  part_t getAssignedProcForTask(part_t taskId) {
3141  return this->task_to_proc[taskId];
3142  }
3143 
3148  /*! \brief Returns the tasks assigned to the given processor.
3149  */
3151  virtual void getPartsForProc(int procId, part_t &numParts,
3152  part_t *&parts) const {
3153 
3154  part_t task_begin = this->proc_to_task_xadj[procId];
3155  part_t taskend = this->proc_to_task_xadj[procId + 1];
3156 
3157  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3158  numParts = taskend - task_begin;
3159  }
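 // Typical query usage (hypothetical sketch; 'mapper' is a constructed
 // CoordinateTaskMapper and 'taskId'/'myRank' are valid indices):
 //
 //   part_t numParts; part_t *parts;
 //   mapper.getPartsForProc(myRank, numParts, parts);
 //   part_t owner = mapper.getAssignedProcForTask(taskId);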
3160 
3161  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3162  part_t task_begin = this->proc_to_task_xadj[procId];
3163  part_t taskend = this->proc_to_task_xadj[procId + 1];
3164 
3165 /*
3166  std::cout << "part_t:" << procId << " taskCount:"
3167  << taskend - task_begin << std::endl;
3168 
3169  for (part_t i = task_begin; i < taskend; ++i) {
3170  std::cout << "part_t:" << procId << " task:"
3171  << proc_to_task_adj[i] << endl;
3172  }
3173 */
3174  if (taskend - task_begin > 0) {
3175  ArrayView <part_t> assignedParts(
3176  this->proc_to_task_adj.getRawPtr() + task_begin,
3177  taskend - task_begin);
3178 
3179  return assignedParts;
3180  }
3181  else {
3182  ArrayView <part_t> assignedParts;
3183 
3184  return assignedParts;
3185  }
3186  }
3187 
3188 };
3189 
3268 template <typename part_t, typename pcoord_t, typename tcoord_t>
3269  void coordinateTaskMapperInterface(
3270  RCP<const Teuchos::Comm<int> > problemComm,
3271  int proc_dim,
3272  int num_processors,
3273  pcoord_t **machine_coords,
3274  int task_dim,
3275  part_t num_tasks,
3276  tcoord_t **task_coords,
3277  part_t *task_comm_xadj,
3278  part_t *task_comm_adj,
3279  // float-like, same size with task_communication_adj_ weight of the
3280  // corresponding edge.
3281  pcoord_t *task_communication_edge_weight_,
3282  part_t *proc_to_task_xadj, /*output*/
3283  part_t *proc_to_task_adj, /*output*/
3284  int recursion_depth,
3285  Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
3286  const part_t *machine_dimensions,
3287  int num_ranks_per_node = 1,
3288  bool divide_to_prime_first = false) {
3289 
3290  const Environment *envConst_ = new Environment(problemComm);
3291 
3292  // mfh 03 Mar 2015: It's OK to omit the Node template
3293  // parameter in Tpetra, if you're just going to use the
3294  // default Node.
3295  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3296 
3297  Teuchos::ArrayRCP<part_t> task_communication_xadj(
3298  task_comm_xadj, 0, num_tasks + 1, false);
3299 
3300  Teuchos::ArrayRCP<part_t> task_communication_adj;
3301  if (task_comm_xadj) {
3302  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3303  task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3304  task_communication_adj = tmp_task_communication_adj;
3305  }
3306 
3307 
3308  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
3309  new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
3310  envConst_,
3311  problemComm.getRawPtr(),
3312  proc_dim,
3313  num_processors,
3314  machine_coords,
3315 // machine_coords_,
3316 
3317  task_dim,
3318  num_tasks,
3319  task_coords,
3320 
3323  task_communication_edge_weight_,
3324  recursion_depth,
3325  part_no_array,
3326  machine_dimensions,
3327  num_ranks_per_node,
3328  divide_to_prime_first);
3329 
3330 
3331  part_t* proc_to_task_xadj_;
3332  part_t* proc_to_task_adj_;
3333 
3334  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3335 
3336  for (part_t i = 0; i <= num_processors; ++i) {
3337  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3338  }
3339 
3340  for (part_t i = 0; i < num_tasks; ++i) {
3341  proc_to_task_adj[i] = proc_to_task_adj_[i];
3342  }
3343 
3344  delete ctm;
3345  delete envConst_;
3346 }
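// Minimal usage sketch for the interface function (hypothetical sizes;
// all arrays are caller-owned and coordinates are stored dimension-major):
//
//   const int dim = 3, nProcs = 4, nTasks = 4;
//   // machine_coords[dim][nProcs] and task_coords[dim][nTasks] filled
//   // by the caller; task_comm_xadj/task_comm_adj give the task graph
//   // in CRS form, as built above.
//   std::vector<part_t> p2t_xadj(nProcs + 1), p2t_adj(nTasks);
//   Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
//       problemComm, dim, nProcs, machine_coords,
//       dim, nTasks, task_coords,
//       task_comm_xadj, task_comm_adj,
//       NULL,                                         // no edge weights
//       p2t_xadj.data(), p2t_adj.data(),              // outputs
//       8,                                            // recursion_depth
//       Kokkos::View<part_t *, Kokkos::HostSpace>(),  // empty part_no_array
//       NULL);                                        // no machine_dimensions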
3347 
3348 template <typename proc_coord_t, typename v_lno_t>
3349 inline void visualize_mapping(int myRank,
3350  const int machine_coord_dim,
3351  const int num_ranks,
3352  proc_coord_t **machine_coords,
3353  const v_lno_t num_tasks,
3354  const v_lno_t *task_communication_xadj,
3355  const v_lno_t *task_communication_adj,
3356  const int *task_to_rank) {
3357 
3358  std::string rankStr = Teuchos::toString<int>(myRank);
3359  std::string gnuPlots = "gnuPlot", extensionS = ".plot";
3360  std::string outF = gnuPlots + rankStr + extensionS;
3361  std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3362 
3363  if (machine_coord_dim != 3) {
3364  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
3365  return;
3366  }
3367  std::string ss = "";
3368  std::string procs = "";
3369 
3370  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3371 
3372  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3373  int origin_rank = task_to_rank[origin_task];
3374  std::string gnuPlotArrow = "set arrow from ";
3375 
3376  for (int j = 0; j < machine_coord_dim; ++j) {
3377  if (j == machine_coord_dim - 1) {
3378  gnuPlotArrow +=
3379  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3380  procs +=
3381  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3382 
3383  }
3384  else {
3385  gnuPlotArrow +=
3386  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3387  + ",";
3388  procs +=
3389  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3390  + " ";
3391  }
3392  }
3393  procs += "\n";
3394 
3395  gnuPlotArrow += " to ";
3396 
3397 
3398  for (int nind = task_communication_xadj[origin_task];
3399  nind < task_communication_xadj[origin_task + 1]; ++nind) {
3400 
3401  int neighbor_task = task_communication_adj[nind];
3402 
3403  bool differentnode = false;
3404  int neighbor_rank = task_to_rank[neighbor_task];
3405 
3406  for (int j = 0; j < machine_coord_dim; ++j) {
3407  if (int(machine_coords[j][origin_rank]) !=
3408  int(machine_coords[j][neighbor_rank])) {
3409  differentnode = true; break;
3410  }
3411  }
3412 
3413  std::tuple<int,int,int, int, int, int> foo(
3414  (int)(machine_coords[0][origin_rank]),
3415  (int)(machine_coords[1][origin_rank]),
3416  (int)(machine_coords[2][origin_rank]),
3417  (int)(machine_coords[0][neighbor_rank]),
3418  (int)(machine_coords[1][neighbor_rank]),
3419  (int)(machine_coords[2][neighbor_rank]));
3420 
3421  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3422  my_arrows.insert(foo);
3423 
3424  std::string gnuPlotArrow2 = "";
3425  for (int j = 0; j < machine_coord_dim; ++j) {
3426  if (j == machine_coord_dim - 1) {
3427  gnuPlotArrow2 +=
3428  Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3429  }
3430  else {
3431  gnuPlotArrow2 +=
3432  Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3433  + ",";
3434  }
3435  }
3436  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3437  }
3438  }
3439  }
3440 
3441  std::ofstream procFile("procPlot.plot", std::ofstream::out);
3442  procFile << procs << "\n";
3443  procFile.close();
3444 
3445  //gnuPlotCode << ss;
3446  if (machine_coord_dim == 2) {
3447  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3448  }
3449  else {
3450  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3451  }
3452 
3453  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3454  gnuPlotCode.close();
3455 }
3456 
3457 } // namespace Zoltan2
3458 
3459 #endif