Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_TaskMapping.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Zoltan2: A package of combinatorial algorithms for scientific computing
4 //
5 // Copyright 2012 NTESS and the Zoltan2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
11 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
12 
13 #include <fstream>
14 #include <ctime>
15 #include <vector>
16 #include <set>
17 #include <tuple>
18 
19 #include "Zoltan2_Standards.hpp"
21 #include "Teuchos_ArrayViewDecl.hpp"
24 #include "Teuchos_ReductionOp.hpp"
25 
27 
28 #include "Zoltan2_GraphModel.hpp"
29 
30 #include <Zoltan2_TPLTraits.hpp>
31 
32 #include "Teuchos_Comm.hpp"
33 #ifdef HAVE_ZOLTAN2_MPI
34 #include "Teuchos_DefaultMpiComm.hpp"
35 #endif // HAVE_ZOLTAN2_MPI
36 #include <Teuchos_DefaultSerialComm.hpp>
37 
38 //#define gnuPlot
41 
42 namespace Teuchos{
43 
47 template <typename Ordinal, typename T>
48 class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
49 {
50 
51 private:
52  T epsilon;
53 
54 public:
57  Zoltan2_ReduceBestMapping() : epsilon(std::numeric_limits<T>::epsilon()) {}
58 
61  void reduce(const Ordinal count,
62  const T inBuffer[],
63  T inoutBuffer[]) const {
64 
65  for (Ordinal i = 0; i < count; i++) {
66  if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
67  inoutBuffer[0] = inBuffer[0];
68  inoutBuffer[1] = inBuffer[1];
69  } else if(inBuffer[0] - inoutBuffer[0] < epsilon &&
70  inBuffer[1] - inoutBuffer[1] < epsilon) {
71  inoutBuffer[0] = inBuffer[0];
72  inoutBuffer[1] = inBuffer[1];
73  }
74  }
75  }
76 };
77 
78 } // namespace Teuchos
79 
80 
81 namespace Zoltan2{
82 
83 template <typename it>
84 inline it z2Fact(it x) {
85  return (x == 1 ? x : x * z2Fact<it>(x - 1));
86 }
87 
88 template <typename gno_t, typename part_t>
90 public:
93 };
94 
95 //returns the ith permutation indices.
96 template <typename IT>
97 void ithPermutation(const IT n, IT i, IT *perm)
98 {
99  IT j, k = 0;
100  IT *fact = new IT[n];
101 
102 
103  // compute factorial numbers
104  fact[k] = 1;
105  while (++k < n)
106  fact[k] = fact[k - 1] * k;
107 
108  // compute factorial code
109  for (k = 0; k < n; ++k)
110  {
111  perm[k] = i / fact[n - 1 - k];
112  i = i % fact[n - 1 - k];
113  }
114 
115  // readjust values to obtain the permutation
116  // start from the end and check if preceding values are lower
117  for (k = n - 1; k > 0; --k)
118  for (j = k - 1; j >= 0; --j)
119  if (perm[j] <= perm[k])
120  perm[k]++;
121 
122  delete [] fact;
123 }
124 
125 template <typename part_t>
127  part_t *&task_comm_xadj,
128  part_t *&task_comm_adj,
129  std::vector<int> grid_dims) {
130  int dim = grid_dims.size();
131  int neighborCount = 2 * dim;
132  task_comm_xadj = new part_t[taskCount + 1];
133  task_comm_adj = new part_t[taskCount * neighborCount];
134 
135  part_t neighBorIndex = 0;
136  task_comm_xadj[0] = 0;
137  for (part_t i = 0; i < taskCount; ++i) {
138  part_t prevDimMul = 1;
139  for (int j = 0; j < dim; ++j) {
140  part_t lNeighbor = i - prevDimMul;
141  part_t rNeighbor = i + prevDimMul;
142  prevDimMul *= grid_dims[j];
143  if (lNeighbor >= 0 &&
144  lNeighbor/ prevDimMul == i / prevDimMul &&
145  lNeighbor < taskCount) {
146  task_comm_adj[neighBorIndex++] = lNeighbor;
147  }
148  if (rNeighbor >= 0 &&
149  rNeighbor/ prevDimMul == i / prevDimMul &&
150  rNeighbor < taskCount) {
151  task_comm_adj[neighBorIndex++] = rNeighbor;
152  }
153  }
154  task_comm_xadj[i + 1] = neighBorIndex;
155  }
156 
157 }
158 //returns the center of the parts.
159 template <typename Adapter, typename scalar_t, typename part_t>
161  const Environment *envConst,
162  const Teuchos::Comm<int> *comm,
164  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
165  const part_t *parts,
166  int coordDim,
167  part_t ntasks,
168  scalar_t **partCenters) {
169 
170  typedef typename Adapter::lno_t lno_t;
171  typedef typename Adapter::gno_t gno_t;
172 
173  typedef StridedData<lno_t, scalar_t> input_t;
174  ArrayView<const gno_t> gnos;
175  ArrayView<input_t> xyz;
176  ArrayView<input_t> wgts;
177  coords->getCoordinates(gnos, xyz, wgts);
178 
179  //local and global num coordinates.
180  lno_t numLocalCoords = coords->getLocalNumCoordinates();
181  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();
182 
183  //local number of points in each part.
184  gno_t *point_counts = new gno_t[ntasks];
185  memset(point_counts, 0, sizeof(gno_t) * ntasks);
186 
187  //global number of points in each part.
188  gno_t *global_point_counts = new gno_t[ntasks];
189 
190  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];
191 
192  ArrayRCP<ArrayRCP<const scalar_t>> ar = arcp(new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
193  for (int dim=0; dim < coordDim; dim++){
194  xyz[dim].getInputArray(ar[dim]);
195  //multiJagged coordinate values assignment
196  multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
197  memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
198  }
199 
200  //get parts with parallel gnos.
201  //const part_t *parts = soln_->getPartListView();
202 
203  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");
204 
205  for (lno_t i=0; i < numLocalCoords; i++) {
206  part_t p = parts[i];
207  //add up all coordinates in each part.
208  for(int j = 0; j < coordDim; ++j) {
209  scalar_t c = multiJagged_coordinates[j][i];
210  partCenters[j][p] += c;
211  }
212  ++point_counts[p];
213  }
214 
215  //get global number of points in each part.
216  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
217  ntasks, point_counts, global_point_counts);
218 
219  for(int j = 0; j < coordDim; ++j) {
220  for (part_t i=0; i < ntasks; ++i) {
221  if (global_point_counts[i] > 0)
222  partCenters[j][i] /= global_point_counts[i];
223  }
224  }
225 
226  scalar_t *tmpCoords = new scalar_t[ntasks];
227  for(int j = 0; j < coordDim; ++j) {
228  reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
229  ntasks, partCenters[j], tmpCoords);
230 
231  scalar_t *tmp = partCenters[j];
232  partCenters[j] = tmpCoords;
233  tmpCoords = tmp;
234  }
235 
236  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");
237 
238  delete [] point_counts;
239  delete [] global_point_counts;
240 
241  delete [] tmpCoords;
242  delete [] multiJagged_coordinates;
243 }
244 
245 //returns the coarsend part graph.
246 template <typename Adapter, typename scalar_t, typename part_t>
248  const Environment *envConst,
249  const Teuchos::Comm<int> *comm,
251  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
252  part_t np,
253  const part_t *parts,
254  ArrayRCP<part_t> &g_part_xadj,
255  ArrayRCP<part_t> &g_part_adj,
256  ArrayRCP<scalar_t> &g_part_ew) {
257 
258  typedef typename Adapter::lno_t t_lno_t;
259  typedef typename Adapter::gno_t t_gno_t;
260  typedef typename Adapter::scalar_t t_scalar_t;
261  typedef typename Adapter::offset_t t_offset_t;
262  typedef typename Zoltan2::GraphModel<
263  typename Adapter::base_adapter_t>::input_t t_input_t;
264 
265  //int numRanks = comm->getSize();
266  //int myRank = comm->getRank();
267 
268  //get parts with parallel gnos.
269  /*
270  const part_t *parts = soln_->getPartListView();
271 
272  part_t np = soln_->getActualGlobalNumberOfParts();
273  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
274  np = soln_->getTargetGlobalNumberOfParts();
275  }
276  */
277 
278 
279  t_lno_t localNumVertices = graph->getLocalNumVertices();
280  t_lno_t localNumEdges = graph->getLocalNumEdges();
281 
282  //get the vertex global ids, and weights
283  ArrayView<const t_gno_t> Ids;
284  ArrayView<t_input_t> v_wghts;
285  graph->getVertexList(Ids, v_wghts);
286 
287  //get the edge ids, and weights
288  ArrayView<const t_gno_t> edgeIds;
289  ArrayView<const t_offset_t> offsets;
290  ArrayView<t_input_t> e_wgts;
291  graph->getEdgeList(edgeIds, offsets, e_wgts);
292 
293  std::vector<t_scalar_t> edge_weights;
294  int numWeightPerEdge = graph->getNumWeightsPerEdge();
295 
296  if (numWeightPerEdge > 0) {
297  edge_weights = std::vector<t_scalar_t>(localNumEdges);
298  for (t_lno_t i = 0; i < localNumEdges; ++i) {
299  edge_weights[i] = e_wgts[0][i];
300  }
301  }
302 
303  //create a zoltan dictionary to get the parts of the vertices
304  //at the other end of edges
305  std::vector<part_t> e_parts(localNumEdges);
306 #ifdef HAVE_ZOLTAN2_MPI
307  if (comm->getSize() > 1) {
308  const bool bUseLocalIDs = false; // Local IDs not needed
310  int debug_level = 0;
311  const RCP<const Comm<int> > rcp_comm(comm,false);
312  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
313  directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
315  directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
316  NULL, NULL, false);
317  } else
318 #endif
319  {
320 
321  /*
322  std::cout << "localNumVertices:" << localNumVertices
323  << " np:" << np
324  << " globalNumVertices:" << graph->getGlobalNumVertices()
325  << " localNumEdges:" << localNumEdges << std::endl;
326  */
327 
328  for (t_lno_t i = 0; i < localNumEdges; ++i) {
329  t_gno_t ei = edgeIds[i];
330  part_t p = parts[ei];
331  e_parts[i] = p;
332  }
333 
334  //get the vertices in each part in my part.
335  std::vector<t_lno_t> part_begins(np, -1);
336  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
337 
338  //cluster vertices according to their parts.
339  //create local part graph.
340  for (t_lno_t i = 0; i < localNumVertices; ++i) {
341  part_t ap = parts[i];
342  part_nexts[i] = part_begins[ap];
343  part_begins[ap] = i;
344  }
345 
346 
347  g_part_xadj = ArrayRCP<part_t>(np + 1);
348  g_part_adj = ArrayRCP<part_t>(localNumEdges);
349  g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
350  part_t nindex = 0;
351  g_part_xadj[0] = 0;
352  std::vector<part_t> part_neighbors(np);
353  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
354  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
355 
356  //coarsen for all vertices in my part in order with parts.
357  for (t_lno_t i = 0; i < np; ++i) {
358  part_t num_neighbor_parts = 0;
359  t_lno_t v = part_begins[i];
360  //get part i, and first vertex in this part v.
361  while (v != -1) {
362  //now get the neightbors of v.
363  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
364  //get the part of the second vertex.
365  part_t ep = e_parts[j];
366 
367  t_scalar_t ew = 1;
368  if (numWeightPerEdge > 0) {
369  ew = edge_weights[j];
370  }
371 
372 // std::cout << "part:" << i << " v:" << v
373 // << " part2:" << ep << " v2:" << edgeIds[j]
374 // << " w:" << ew << std::endl;
375 
376  if (part_neighbor_weights[ep] < 0.00001) {
377  part_neighbors[num_neighbor_parts++] = ep;
378  }
379 
380  part_neighbor_weights[ep] += ew;
381  }
382 
383  v = part_nexts[v];
384  }
385 
386 
387  //now get the part list.
388  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
389  part_t neighbor_part = part_neighbors[j];
390  g_part_adj[nindex] = neighbor_part;
391  g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
392  part_neighbor_weights[neighbor_part] = 0;
393  }
394  g_part_xadj[i + 1] = nindex;
395  }
396  }
397 
398 #ifdef HAVE_ZOLTAN2_MPI
399  if(comm->getSize() > 1) { // Otherwise it's already handled above and done
400 
401  // struct for directory data - note more extensive comments in
402  // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
403  // because this is in progress. Similar concept there.
404  struct part_info {
405  part_info() : weight(0) {
406  }
407 
408  const part_info & operator+=(const part_info & src) {
409  this->weight += src.weight;
410  return *this;
411  }
412 
413  bool operator>(const part_info & src) {
414  return (destination_part > src.destination_part);
415  }
416 
417  bool operator==(const part_info & src) {
418  return (destination_part == src.destination_part);
419  }
420 
421  part_t destination_part;
422  scalar_t weight;
423  };
424 
425  bool bUseLocalIDs = false;
426  const int debug_level = 0;
428  directory_t;
429  const RCP<const Comm<int> > rcp_comm(comm,false);
430  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
431  std::vector<part_t> part_data;
432  std::vector<std::vector<part_info>> user_data;
433 
434  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
435  {
436  //get the vertices in each part in my part.
437  std::vector<t_lno_t> part_begins(np, -1);
438  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
439 
440  //cluster vertices according to their parts.
441  //create local part graph.
442  for (t_lno_t i = 0; i < localNumVertices; ++i) {
443  part_t ap = parts[i];
444  part_nexts[i] = part_begins[ap];
445  part_begins[ap] = i;
446  }
447 
448  std::vector<part_t> part_neighbors(np);
449  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
450  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
451 
452  //coarsen for all vertices in my part in order with parts.
453  for (t_lno_t i = 0; i < np; ++i) {
454  part_t num_neighbor_parts = 0;
455  t_lno_t v = part_begins[i];
456 
457  //get part i, and first vertex in this part v.
458  while (v != -1) {
459  //now get the neightbors of v.
460  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
461  //get the part of the second vertex.
462  part_t ep = e_parts[j];
463 
464  t_scalar_t ew = 1;
465  if (numWeightPerEdge > 0) {
466  ew = edge_weights[j];
467  }
468 
469  //add it to my local part neighbors for part i.
470  if (part_neighbor_weights[ep] < 0.00001) {
471  part_neighbors[num_neighbor_parts++] = ep;
472  }
473 
474  part_neighbor_weights[ep] += ew;
475  }
476 
477  v = part_nexts[v];
478  }
479 
480  //now get the part list.
481  for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
482  part_t neighbor_part = part_neighbors[j];
483  part_neighbor_weights_ordered[j] =
484  part_neighbor_weights[neighbor_part];
485  part_neighbor_weights[neighbor_part] = 0;
486  }
487 
488  //insert it to tpetra crsmatrix.
489  if (num_neighbor_parts > 0) {
490  part_data.push_back(i); // TODO: optimize to avoid push_back
491  std::vector<part_info> new_user_data(num_neighbor_parts);
492  for(int q = 0; q < num_neighbor_parts; ++q) {
493  part_info & info = new_user_data[q];
494  info.weight = part_neighbor_weights_ordered[q];
495  info.destination_part = part_neighbors[q];
496  }
497  // TODO: optimize to avoid push_back
498  user_data.push_back(new_user_data);
499  }
500  }
501  }
502  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
503 
504  std::vector<part_t> part_indices(np);
505 
506  for (part_t i = 0; i < np; ++i) part_indices[i] = i;
507 
508  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
509  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
510  NULL, directory_t::Update_Mode::AggregateAdd);
511  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");
512 
513  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
514  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
515  directory.find(find_user_data.size(), &part_indices[0], NULL,
516  &find_user_data[0], NULL, NULL, false);
517  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");
518 
519  // Now reconstruct the output data from the directory find data
520  // This code was designed to reproduce the exact format of the original
521  // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
522  // further changes I wanted to verify if this formatting should be
523  // preserved or potentially changed further.
524 
525  // first thing is get the total number of elements
526  int get_total_length = 0;
527  for (size_t n = 0; n < find_user_data.size(); ++n) {
528  get_total_length += find_user_data[n].size();
529  }
530 
531  // setup data space
532  g_part_xadj = ArrayRCP<part_t>(np + 1);
533  g_part_adj = ArrayRCP<part_t>(get_total_length);
534  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);
535 
536  // loop through again and fill to match the original formatting
537  int track_insert_index = 0;
538  for(size_t n = 0; n < find_user_data.size(); ++n) {
539  g_part_xadj[n] = track_insert_index;
540  const std::vector<part_info> & user_data_vector = find_user_data[n];
541  for(size_t q = 0; q < user_data_vector.size(); ++q) {
542  const part_info & info = user_data_vector[q];
543  g_part_adj[track_insert_index] = info.destination_part;
544  g_part_ew[track_insert_index] = info.weight;
545  ++track_insert_index;
546  }
547  }
548  g_part_xadj[np] = get_total_length; // complete the series
549  }
550 #endif // HAVE_ZOLTAN2_MPI
551 }
552 
553 
556 template <class IT, class WT>
558 
559  IT heapSize;
560  IT *indices;
561  WT *values;
562  WT epsilon;
563 
564 
565 public:
567  delete [] this->indices;
568  delete [] this->values;
569  }
570 
571  void setHeapsize(IT heapsize_){
572  this->heapSize = heapsize_;
573  this->indices = new IT[heapsize_];
574  this->values = new WT[heapsize_];
575  this->epsilon = std::numeric_limits<WT>::epsilon();
576  }
577 
578  void addPoint(IT index, WT distance) {
579  WT maxVal = this->values[0];
580  //add only the distance is smaller than the maximum distance.
581 // std::cout << "indeX:" << index
582 // << " distance:" << distance
583 // << " maxVal:" << maxVal << endl;
584  if (distance >= maxVal)
585  return;
586  else {
587  this->values[0] = distance;
588  this->indices[0] = index;
589  this->push_down(0);
590  }
591  }
592 
593  //heap push down operation
594  void push_down(IT index_on_heap) {
595  IT child_index1 = 2 * index_on_heap + 1;
596  IT child_index2 = 2 * index_on_heap + 2;
597 
598  IT biggerIndex = -1;
599  if(child_index1 < this->heapSize && child_index2 < this->heapSize) {
600 
601  if (this->values[child_index1] < this->values[child_index2]) {
602  biggerIndex = child_index2;
603  }
604  else {
605  biggerIndex = child_index1;
606  }
607  }
608  else if(child_index1 < this->heapSize) {
609  biggerIndex = child_index1;
610 
611  }
612  else if(child_index2 < this->heapSize) {
613  biggerIndex = child_index2;
614  }
615  if (biggerIndex >= 0 &&
616  this->values[biggerIndex] > this->values[index_on_heap]) {
617  WT tmpVal = this->values[biggerIndex];
618  this->values[biggerIndex] = this->values[index_on_heap];
619  this->values[index_on_heap] = tmpVal;
620 
621  IT tmpIndex = this->indices[biggerIndex];
622  this->indices[biggerIndex] = this->indices[index_on_heap];
623  this->indices[index_on_heap] = tmpIndex;
624  this->push_down(biggerIndex);
625  }
626  }
627 
628  void initValues() {
629  WT MAXVAL = std::numeric_limits<WT>::max();
630 
631  for(IT i = 0; i < this->heapSize; ++i) {
632  this->values[i] = MAXVAL;
633  this->indices[i] = -1;
634  }
635  }
636 
637  //returns the total distance to center in the cluster.
639 
640  WT nc = 0;
641  for(IT j = 0; j < this->heapSize; ++j) {
642  nc += this->values[j];
643 
644 // std::cout << "index:" << this->indices[j]
645 // << " distance:" << this->values[j] << endl;
646  }
647  return nc;
648  }
649 
650  //returns the new center of the cluster.
651  bool getNewCenters(WT *center, WT **coords, int dimension) {
652  bool moved = false;
653 
654  for(int i = 0; i < dimension; ++i) {
655  WT nc = 0;
656 
657  for(IT j = 0; j < this->heapSize; ++j) {
658  IT k = this->indices[j];
659 // std::cout << "i:" << i
660 // << " dim:" << dimension
661 // << " k:" << k
662 // << " heapSize:" << heapSize << endl; nc += coords[i][k];
663  nc += coords[i][k];
664  }
665 
666  nc /= this->heapSize;
667  moved = (std::abs(center[i] - nc) > this->epsilon || moved );
668  center[i] = nc;
669  }
670  return moved;
671  }
672 
673  void copyCoordinates(IT *permutation) {
674  for (IT i = 0; i < this->heapSize; ++i) {
675  permutation[i] = this->indices[i];
676  }
677  }
678 };
679 
682 template <class IT, class WT>
684 
685  int dimension;
686  KmeansHeap<IT,WT> closestPoints;
687 
688 public:
689 
690  WT *center;
691 
693  delete [] center;
694  }
695 
696  void setParams(int dimension_, int heapsize) {
697  this->dimension = dimension_;
698  this->center = new WT[dimension_];
699  this->closestPoints.setHeapsize(heapsize);
700  }
701 
702  void clearHeap(){
703  this->closestPoints.initValues();
704  }
705 
706  bool getNewCenters( WT **coords) {
707  return this->closestPoints.getNewCenters(center, coords, dimension);
708  }
709 
710  //returns the distance of the coordinate to the center.
711  //also adds it to the heap.
712  WT getDistance(IT index, WT **elementCoords) {
713  WT distance = 0;
714 
715  for (int i = 0; i < this->dimension; ++i) {
716  WT d = (center[i] - elementCoords[i][index]);
717  distance += d * d;
718  }
719  distance = pow(distance, WT(1.0 / this->dimension));
720  closestPoints.addPoint(index, distance);
721 
722  return distance;
723  }
724 
726  return closestPoints.getTotalDistance();
727  }
728 
729  void copyCoordinates(IT *permutation) {
730  closestPoints.copyCoordinates(permutation);
731  }
732 };
733 
740 template <class IT, class WT>
742 
743  int dim;
744  IT numElements;
745  WT **elementCoords;
746  IT numClusters;
747  IT required_elements;
748  KMeansCluster<IT,WT> *clusters;
749  WT *maxCoordinates;
750  WT *minCoordinates;
751 public:
752 
754  delete [] clusters;
755  delete [] maxCoordinates;
756  delete [] minCoordinates;
757  }
758 
762  int dim_ ,
763  IT numElements_,
764  WT **elementCoords_,
765  IT required_elements_):
766  dim(dim_),
767  numElements(numElements_),
768  elementCoords(elementCoords_),
769  numClusters((1 << dim_) + 1),
770  required_elements(required_elements_) {
771  this->clusters = new KMeansCluster<IT,WT>[this->numClusters];
772 
773  //set dimension and the number of required elements for all clusters.
774  for (int i = 0; i < numClusters; ++i) {
775  this->clusters[i].setParams(this->dim, this->required_elements);
776  }
777 
778  this->maxCoordinates = new WT[this->dim];
779  this->minCoordinates = new WT[this->dim];
780 
781  //obtain the min and max coordinates for each dimension.
782  for (int j = 0; j < dim; ++j) {
783  this->minCoordinates[j] = this->elementCoords[j][0];
784  this->maxCoordinates[j] = this->elementCoords[j][0];
785 
786  for(IT i = 1; i < numElements; ++i) {
787  WT t = this->elementCoords[j][i];
788  if(t > this->maxCoordinates[j]){
789  this->maxCoordinates[j] = t;
790  }
791 
792  if (t < minCoordinates[j]) {
793  this->minCoordinates[j] = t;
794  }
795  }
796  }
797 
798 
799  //assign initial cluster centers.
800  for (int j = 0; j < dim; ++j) {
801  int mod = (1 << (j+1));
802  for (int i = 0; i < numClusters - 1; ++i) {
803  WT c = 0;
804 
805  if ( (i % mod) < mod / 2) {
806  c = this->maxCoordinates[j];
807 // std::cout << "i:" << i << " j:" << j
808 // << " setting max:" << c << endl;
809  }
810  else {
811  c = this->minCoordinates[j];
812  }
813 
814  this->clusters[i].center[j] = c;
815  }
816  }
817 
818  //last cluster center is placed to middle.
819  for (int j = 0; j < dim; ++j) {
820  this->clusters[numClusters - 1].center[j] =
821  (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
822  }
823 
824 
825 /*
826  for (int i = 0; i < numClusters; ++i) {
827 // std::cout << endl << "cluster:" << i << endl << "\t";
828 
829  for (int j = 0; j < dim; ++j) {
830  std::cout << this->clusters[i].center[j] << " ";
831  }
832  }
833 */
834  }
835 
836  // Performs kmeans clustering of coordinates.
837  void kmeans() {
838  for (int it = 0; it < 10; ++it) {
839 // std::cout << "it:" << it << endl;
840 
841  for (IT j = 0; j < this->numClusters; ++j) {
842  this->clusters[j].clearHeap();
843  }
844 
845  for (IT i = 0; i < this->numElements; ++i) {
846 // std::cout << "i:" << i << " numEl:" << this->numElements << endl;
847 
848  for (IT j = 0; j < this->numClusters; ++j) {
849 // std::cout << "j:" << j
850 // << " numClusters:" << this->numClusters << endl;
851 
852  this->clusters[j].getDistance(i,this->elementCoords);
853  }
854  }
855 
856  bool moved = false;
857 
858  for (IT j = 0; j < this->numClusters; ++j) {
859  moved =
860  (this->clusters[j].getNewCenters(this->elementCoords) || moved );
861  }
862  if (!moved) {
863  break;
864  }
865  }
866  }
867 
868  // Finds the cluster in which the coordinates are the closest to each
869  // other.
870  void getMinDistanceCluster(IT *procPermutation) {
871 
872  WT minDistance = this->clusters[0].getDistanceToCenter();
873  IT minCluster = 0;
874 
875 // std::cout << "j:" << 0 << " minDistance:" << minDistance
876 // << " minTmpDistance:" << minDistance
877 // << " minCluster:" << minCluster << endl;
878 
879  for (IT j = 1; j < this->numClusters; ++j) {
880  WT minTmpDistance = this->clusters[j].getDistanceToCenter();
881 
882 // std::cout << "j:" << j << " minDistance:" << minDistance
883 // << " minTmpDistance:" << minTmpDistance
884 // << " minCluster:" << minCluster << endl;
885 
886  if (minTmpDistance < minDistance) {
887  minDistance = minTmpDistance;
888  minCluster = j;
889  }
890  }
891 
892 // std::cout << "minCluster:" << minCluster << endl;
893  this->clusters[minCluster].copyCoordinates(procPermutation);
894  }
895 };
896 
897 
898 #define MINOF(a,b) (((a)<(b))?(a):(b))
899 
907 template <typename T>
908 void fillContinousArray(T *arr, size_t arrSize, T *val) {
909  if (val == NULL) {
910 
911 #ifdef HAVE_ZOLTAN2_OMP
912 #pragma omp parallel for
913 #endif
914  for (size_t i = 0; i < arrSize; ++i) {
915  arr[i] = i;
916  }
917 
918  }
919  else {
920  T v = *val;
921 #ifdef HAVE_ZOLTAN2_OMP
922 #pragma omp parallel for
923 #endif
924  for (size_t i = 0; i < arrSize; ++i) {
925 // std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
926  arr[i] = v;
927  }
928  }
929 }
930 
934 template <typename part_t, typename pcoord_t, typename node_t>
936 
937 protected:
938  double commCost;
939 
940 public:
941 
942  // Number of processors and number of tasks
945 
946 
948  CommunicationModel(part_t no_procs_, part_t no_tasks_):
949  commCost(),
950  no_procs(no_procs_),
951  no_tasks(no_tasks_) {}
952 
953  virtual ~CommunicationModel() {}
954 
955  part_t getNProcs() const{
956  return this->no_procs;
957  }
958 
960  return this->no_tasks;
961  }
962 
964  part_t *task_to_proc,
965  part_t *task_communication_xadj,
966  part_t *task_communication_adj,
967  pcoord_t *task_communication_edge_weight) {
968 
969  double totalCost = 0;
970 
971  part_t commCount = 0;
972  for (part_t task = 0; task < this->no_tasks; ++task) {
973  int assigned_proc = task_to_proc[task];
974 
975  part_t task_adj_begin = task_communication_xadj[task];
976  part_t task_adj_end = task_communication_xadj[task + 1];
977 
978  commCount += task_adj_end - task_adj_begin;
979 
980  for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {
981 
982  part_t neighborTask = task_communication_adj[task2];
983  int neighborProc = task_to_proc[neighborTask];
984  double distance = getProcDistance(assigned_proc, neighborProc);
985 
986  if (task_communication_edge_weight == NULL) {
987  totalCost += distance ;
988  }
989  else {
990  totalCost += distance * task_communication_edge_weight[task2];
991  }
992  }
993  }
994 
995  // commCount
996  this->commCost = totalCost;
997  }
998 
1000  return this->commCost;
1001  }
1002 
1003  virtual double getProcDistance(int procId1, int procId2) const = 0;
1004 
1014  virtual void getMapping(
1015  int myRank,
1016  const RCP<const Environment> &env,
1017  ArrayRCP <part_t> &proc_to_task_xadj,
1018  ArrayRCP <part_t> &proc_to_task_adj,
1019  ArrayRCP <part_t> &task_to_proc,
1020  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1021  ) const = 0;
1022 };
1023 
1024 
1028 template <typename pcoord_t, typename tcoord_t, typename part_t, typename node_t>
1030  public CommunicationModel<part_t, pcoord_t, node_t> {
1031 public:
1032  //private:
1033 
1034  // Dimension of the processors
1036  // Processor coordinates (allocated outside of the class)
1037  pcoord_t **proc_coords;
1038  // Dimension of the tasks coordinates.
1040  // Task coordinates (allocated outside of the class)
1041  tcoord_t **task_coords;
1042 
1043  // TODO: Perhaps delete this and just reference the view size?
1044  // Need to check the handling of size -1 versus size 0
1046 
1047  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_partNoArray;
1048 
1052 
1055 
1056  //public:
1058  CommunicationModel<part_t, pcoord_t, node_t>(),
1059  proc_coord_dim(0),
1060  proc_coords(0),
1061  task_coord_dim(0),
1062  task_coords(0),
1063  partArraySize(-1),
1064  machine_extent(NULL),
1066  machine(NULL),
1067  num_ranks_per_node(1),
1068  divide_to_prime_first(false){}
1069 
1071 
1083  int pcoord_dim_,
1084  pcoord_t **pcoords_,
1085  int tcoord_dim_,
1086  tcoord_t **tcoords_,
1087  part_t no_procs_,
1088  part_t no_tasks_,
1089  int *machine_extent_,
1090  bool *machine_extent_wrap_around_,
1091  const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
1092  ):
1093  CommunicationModel<part_t, pcoord_t, node_t>(no_procs_, no_tasks_),
1094  proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
1095  task_coord_dim(tcoord_dim_), task_coords(tcoords_),
1096  partArraySize(-1),
1097  machine_extent(machine_extent_),
1098  machine_extent_wrap_around(machine_extent_wrap_around_),
1099  machine(machine_),
1100  num_ranks_per_node(1),
1101  divide_to_prime_first(false) {
1102  }
1103 
1104 
1105  void setPartArraySize(int psize) {
1106  this->partArraySize = psize;
1107  }
1108 
1109  void setPartArray(Kokkos::View<part_t *, Kokkos::HostSpace> pNo) {
1110  this->kokkos_partNoArray = pNo;
1111  }
1112 
1120  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const{
1121  //currently returns a random subset.
1122 
1123  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1125  minCoordDim, nprocs,
1126  this->proc_coords, ntasks);
1127 
1128  kma.kmeans();
1129  kma.getMinDistanceCluster(proc_permutation);
1130 
1131  for(int i = ntasks; i < nprocs; ++i) {
1132  proc_permutation[i] = -1;
1133  }
1134  /*
1135  //fill array.
1136  fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
1137  '
1138  int _u_umpa_seed = 847449649;
1139  srand (time(NULL));
1140 
1141  int a = rand() % 1000 + 1;
1142  _u_umpa_seed -= a;
1143 
1144  //permute array randomly.
1145  update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
1146  */
1147  }
1148 
1149  // Temporary, necessary for random permutation.
1150  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
1151  {
1152  int a = 16807;
1153  int m = 2147483647;
1154  int q = 127773;
1155  int r = 2836;
1156  int lo, hi, test;
1157  double d;
1158 
1159  lo = _u_umpa_seed % q;
1160  hi = _u_umpa_seed / q;
1161  test = (a * lo) - (r * hi);
1162  if (test>0)
1163  _u_umpa_seed = test;
1164  else
1165  _u_umpa_seed = test + m;
1166 
1167  d = (double) ((double) _u_umpa_seed / (double) m);
1168 
1169  return (part_t) (d*(double)l);
1170  }
1171 
1172  virtual double getProcDistance(int procId1, int procId2) const {
1173  pcoord_t distance = 0;
1174  if (machine == NULL) {
1175  for (int i = 0 ; i < this->proc_coord_dim; ++i) {
1176  double d =
1177  std::abs(proc_coords[i][procId1] - proc_coords[i][procId2]);
1179  if (machine_extent[i] - d < d) {
1180  d = machine_extent[i] - d;
1181  }
1182  }
1183  distance += d;
1184  }
1185  }
1186  else {
1187  this->machine->getHopCount(procId1, procId2, distance);
1188  }
1189 
1190  return distance;
1191  }
1192 
1193 
1194  // Temporary, does random permutation.
1195  void update_visit_order(part_t* visitOrder, part_t n,
1196  int &_u_umpa_seed, part_t rndm) {
1197  part_t *a = visitOrder;
1198 
1199  if (rndm) {
1200  part_t i, u, v, tmp;
1201 
1202  if (n <= 4)
1203  return;
1204 
1205 // srand ( time(NULL) );
1206 // _u_umpa_seed = _u_umpa_seed1 - (rand()%100);
1207 
1208  for (i = 0; i < n; i += 16) {
1209  u = umpa_uRandom(n-4, _u_umpa_seed);
1210  v = umpa_uRandom(n-4, _u_umpa_seed);
1211 
1212  // FIXME (mfh 30 Sep 2015) This requires including
1213  // Zoltan2_AlgMultiJagged.hpp.
1214 
1215  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1216  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
1217  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
1218  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
1219  }
1220  }
1221  else {
1222  part_t i, end = n / 4;
1223 
1224  for (i = 1; i < end; i++) {
1225  part_t j = umpa_uRandom(n - i, _u_umpa_seed);
1226  part_t t = a[j];
1227  a[j] = a[n - i];
1228  a[n - i] = t;
1229  }
1230  }
1231 // PermuteInPlace(visitOrder, n);
1232  }
1233 
1234 
1235 
1244  virtual void getMapping(
1245  int myRank,
1246  const RCP<const Environment> &env,
1247  ArrayRCP <part_t> &rcp_proc_to_task_xadj,
1248  ArrayRCP <part_t> &rcp_proc_to_task_adj,
1249  ArrayRCP <part_t> &rcp_task_to_proc,
1250  const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1251  ) const {
1252 
1253  rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->no_procs + 1);
1254  rcp_proc_to_task_adj = ArrayRCP <part_t>(this->no_tasks);
1255  rcp_task_to_proc = ArrayRCP <part_t>(this->no_tasks);
1256 
1257  // Holds the pointer to the task array
1258  part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();
1259 
1260  // Holds the indices of task wrt to proc_to_task_xadj array.
1261  part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();
1262 
1263  // Holds the processors mapped to tasks.
1264  part_t *task_to_proc = rcp_task_to_proc.getRawPtr();
1265 
1266  part_t invalid = 0;
1267  fillContinousArray<part_t> (proc_to_task_xadj, this->no_procs + 1, &invalid);
1268 
1269  // Obtain the number of parts that should be divided.
1270  part_t num_parts = MINOF(this->no_procs, this->no_tasks);
1271 
1272  // Obtain the min coordinate dim.
1273  // No more want to do min coord dim. If machine dimension > task_dim,
1274  // we end up with a long line.
1275 // part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1276 
1277  int recursion_depth = partArraySize;
1278 
1279 // if (partArraySize < minCoordDim)
1280 // recursion_depth = minCoordDim;
1281  if (partArraySize == -1) {
1282 
1283  if (divide_to_prime_first) {
1284  // It is difficult to estimate the number of steps in this case
1285  // as each branch will have different depth.
1286  // The worst case happens when all prime factors are 3s.
1287  // P = 3^n, n recursion depth will divide parts to 2x and x
1288  // and n recursion depth with divide 2x into x and x.
1289  // Set it to upperbound here.
1290  // We could calculate the exact value here as well, but the
1291  // partitioning algorithm skips further ones anyways. recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
1292  }
1293  else {
1294  recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
1295  }
1296  }
1297 
1298  // Number of permutations for tasks and processors
1299  int taskPerm = 1;
1300  int procPerm = 1;
1301 
1302  // Get number of different permutations for task dimension ordering
1303  if (this->task_coord_dim <= 8)
1304  taskPerm = z2Fact<int>(this->task_coord_dim);
1305  // Prevent overflow
1306  else
1307  taskPerm = z2Fact<int>(8);
1308 
1309  // Get number of different permutations for proc dimension ordering
1310  if (this->proc_coord_dim <= 8)
1311  procPerm = z2Fact<int>(this->proc_coord_dim);
1312  // Prevent overflow
1313  else
1314  procPerm = z2Fact<int>(8);
1315 
1316 
1317  // Total number of permutations (both task and proc permuted)
1318  int permutations = taskPerm * procPerm;
1319 
1320  // Add permutations where we divide the processors with longest
1321  // dimension but permute tasks.
1322  permutations += taskPerm;
1323 
1324  // Add permutations where we divide the tasks with longest
1325  // dimension but permute procs.
1326  permutations += procPerm;
1327 
1328  // Add permutation with both tasks and procs divided by longest
1329  // dimension
1330  permutations += 1;
1331  //add one also that partitions based the longest dimension.
1332 
1333  // Holds the pointers to proc_adjList
1334  part_t *proc_xadj = new part_t[num_parts+1];
1335 
1336  // Holds the processors in parts according to the result of
1337  // partitioning algorithm.
1338  // The processors assigned to part x is at
1339  // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
1340  part_t *proc_adjList = new part_t[this->no_procs];
1341 
1342 
1343  part_t used_num_procs = this->no_procs;
1344  if (this->no_procs > this->no_tasks) {
1345  // Obtain the subset of the processors that are closest to each
1346  // other.
1347  this->getClosestSubset(proc_adjList, this->no_procs,
1348  this->no_tasks);
1349  used_num_procs = this->no_tasks;
1350  }
1351  else {
1352  fillContinousArray<part_t>(proc_adjList,this->no_procs, NULL);
1353  }
1354 
1355  // Index of the permutation
1356  int myPermutation = myRank % permutations;
1357  bool task_partition_along_longest_dim = false;
1358  bool proc_partition_along_longest_dim = false;
1359 
1360 
1361  int myProcPerm = 0;
1362  int myTaskPerm = 0;
1363 
1364  if (myPermutation == 0) {
1365  task_partition_along_longest_dim = true;
1366  proc_partition_along_longest_dim = true;
1367  }
1368  else {
1369  --myPermutation;
1370  if (myPermutation < taskPerm) {
1371  proc_partition_along_longest_dim = true;
1372  // Index of the task permutation
1373  myTaskPerm = myPermutation;
1374  }
1375  else {
1376  myPermutation -= taskPerm;
1377  if (myPermutation < procPerm) {
1378  task_partition_along_longest_dim = true;
1379  // Index of the task permutation
1380  myProcPerm = myPermutation;
1381  }
1382  else {
1383  myPermutation -= procPerm;
1384  // Index of the proc permutation
1385  myProcPerm = myPermutation % procPerm;
1386  // Index of the task permutation
1387  myTaskPerm = myPermutation / procPerm;
1388  }
1389  }
1390  }
1391 
1392 /*
1393  if (task_partition_along_longest_dim &&
1394  proc_partition_along_longest_dim) {
1395  std::cout <<"me:" << myRank << " task:longest proc:longest"
1396  << " numPerms:" << permutations << std::endl;
1397  }
1398  else if (proc_partition_along_longest_dim) {
1399  std::cout <<"me:" << myRank << " task:" << myTaskPerm
1400  << " proc:longest" << " numPerms:" << permutations << std::endl;
1401  }
1402  else if (task_partition_along_longest_dim) {
1403  std::cout <<"me:" << myRank << " task: longest" << " proc:"
1404  << myProcPerm << " numPerms:" << permutations << std::endl;
1405  }
1406  else {
1407  std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:"
1408  << myProcPerm << " numPerms:" << permutations << std::endl;
1409  }
1410 */
1411 
1412  int *permutation = new int[(this->proc_coord_dim > this->task_coord_dim)
1413  ? this->proc_coord_dim : this->task_coord_dim];
1414 
1415  // Get the permutation order from the proc permutation index.
1416  if (this->proc_coord_dim <= 8)
1417  ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
1418  else
1419  ithPermutation<int>(8, myProcPerm, permutation);
1420 
1421 /*
1422  // Reorder the coordinate dimensions.
1423  pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
1424  for (int i = 0; i < this->proc_coord_dim; ++i) {
1425  pcoords[i] = this->proc_coords[permutation[i]];
1426 // std::cout << permutation[i] << " ";
1427  }
1428 */
1429 
1430  int procdim = this->proc_coord_dim;
1431  pcoord_t **pcoords = this->proc_coords;
1432 
1433 /*
1434  int procdim = this->proc_coord_dim;
1435  procdim = 6;
1436  //reorder the coordinate dimensions.
1437  pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
1438  for (int i = 0; i < procdim; ++i) {
1439  pcoords[i] = new pcoord_t[used_num_procs] ;
1440 // this->proc_coords[permutation[i]];
1441  }
1442 
1443  for (int k = 0; k < used_num_procs ; k++) {
1444  pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
1445  pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8 ;
1446 
1447  pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
1448  pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
1449  pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
1450  pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;
1451 
1452  //if (this->proc_coords[0][k] == 40 &&
1453  // this->proc_coords[1][k] == 8 &&
1454  // this->proc_coords[2][k] == 48) {
1455  if (this->proc_coords[0][k] == 5 &&
1456  this->proc_coords[1][k] == 0 &&
1457  this->proc_coords[2][k] == 10) {
1458  std::cout << "pcoords[0][k]:" << pcoords[0][k]
1459  << "pcoords[1][k]:" << pcoords[1][k]
1460  << "pcoords[2][k]:" << pcoords[2][k]
1461  << "pcoords[3][k]:" << pcoords[3][k]
1462  << "pcoords[4][k]:" << pcoords[4][k]
1463  << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
1464  }
1465  else if (pcoords[0][k] == 64 &&
1466  pcoords[1][k] == 0 &&
1467  pcoords[2][k] == 160 &&
1468  pcoords[3][k] == 16 &&
1469  pcoords[4][k] == 0 &&
1470  pcoords[5][k] == 10) {
1471  std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
1472  << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
1473  << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
1474  << std::endl;
1475  }
1476  }
1477 */
1478 
1479 // if (partNoArray == NULL)
1480 // std::cout << "partNoArray is null" << std::endl;
1481 // std::cout << "recursion_depth:" << recursion_depth
1482 // << " partArraySize:" << partArraySize << std::endl;
1483 
1484  // Optimization for Dragonfly Networks, First Level of partitioning
1485  // is imbalanced to ensure procs are divided by first RCA
1486  // coord (a.k.a. group).
1487  part_t num_group_count = 1;
1488  part_t *group_count = NULL;
1489 
1490  if (machine != NULL)
1491  num_group_count = machine->getNumUniqueGroups();
1492 
1493  if (num_group_count > 1) {
1494  group_count = new part_t[num_group_count];
1495  memset(group_count, 0, sizeof(part_t) * num_group_count);
1496 
1497  machine->getGroupCount(group_count);
1498  }
1499 
1500  // Do the partitioning and renumber the parts.
1501  env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
1502  // Partitioning of Processors
1504 
1505  typedef typename node_t::device_type device_t;
1506  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1507  Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
1508  kokkos_pcoords("pcoords", this->no_procs, procdim);
1509  auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
1510  for(int i = 0; i < procdim; ++i) {
1511  for(int j = 0; j < this->no_procs; ++j) {
1512  host_kokkos_pcoords(j,i) = pcoords[i][j];
1513  }
1514  }
1515  Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);
1516 
1517  Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_pcoords(
1518  "initial_selected_coords_output_permutation_pcoords", this->no_procs);
1519  typename Kokkos::View<part_t*, device_t>::HostMirror
1520  host_initial_selected_coords_output_permutation_pcoords =
1521  Kokkos::create_mirror_view(initial_selected_coords_output_permutation_pcoords);
1522  for(int n = 0; n < this->no_procs; ++n) {
1523  host_initial_selected_coords_output_permutation_pcoords(n) =
1524  proc_adjList[n];
1525  }
1526  Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
1527  host_initial_selected_coords_output_permutation_pcoords);
1528 
1529  // Note num_group_count = 1 when group_count = NULL - perhaps could change
1530  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
1531  "kokkos_group_count", group_count ? num_group_count : 0);
1532  if(group_count) {
1533  for(int n = 0; n < num_group_count; ++n) {
1534  kokkos_group_count(n) = group_count[n];
1535  }
1536  }
1537 
1538  mj_partitioner.sequential_task_partitioning(
1539  env,
1540  this->no_procs,
1541  used_num_procs,
1542  num_parts,
1543  procdim,
1544  //minCoordDim,
1545  kokkos_pcoords,
1546  initial_selected_coords_output_permutation_pcoords,
1547  proc_xadj,
1548  recursion_depth,
1550  proc_partition_along_longest_dim, // false
1553  num_group_count,
1554  kokkos_group_count);
1555  env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
1556 // comm_->barrier();
1557 // std::cout << "mj_partitioner.for procs over" << std::endl;
1558 // freeArray<pcoord_t *>(pcoords);
1559 
1560  Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
1561  initial_selected_coords_output_permutation_pcoords);
1562  for(int n = 0; n < this->no_procs; ++n) {
1563  proc_adjList[n] =
1564  host_initial_selected_coords_output_permutation_pcoords(n);
1565  }
1566 
1567  part_t *task_xadj = new part_t[num_parts + 1];
1568  part_t *task_adjList = new part_t[this->no_tasks];
1569 
1570  // Fill task_adjList st: task_adjList[i] <- i.
1571  fillContinousArray<part_t>(task_adjList,this->no_tasks, NULL);
1572 
1573  // Get the permutation order from the task permutation index.
1574  if (this->task_coord_dim <= 8)
1575  ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
1576  else
1577  ithPermutation<int>(8, myTaskPerm, permutation);
1578 
1579  // Reorder task coordinate dimensions.
1580  tcoord_t **tcoords = new tcoord_t*[this->task_coord_dim];
1581  for(int i = 0; i < this->task_coord_dim; ++i) {
1582  tcoords[i] = this->task_coords[permutation[i]];
1583  }
1584 
1585  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1586  Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
1587  kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
1588  auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
1589  for(int i = 0; i < this->task_coord_dim; ++i) {
1590  for(int j = 0; j < this->no_tasks; ++j) {
1591  host_kokkos_tcoords(j,i) = tcoords[i][j];
1592  }
1593  }
1594  Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);
1595 
1596  env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
1597 
1598  Kokkos::View<part_t*, device_t> initial_selected_coords_output_permutation_tcoords(
1599  "initial_selected_coords_output_permutation_tcoords", this->no_tasks);
1600  typename Kokkos::View<part_t*, device_t>::HostMirror
1601  host_initial_selected_coords_output_permutation_tcoords =
1602  Kokkos::create_mirror_view(initial_selected_coords_output_permutation_tcoords);
1603  for(int n = 0; n < this->no_tasks; ++n) {
1604  host_initial_selected_coords_output_permutation_tcoords(n) =
1605  task_adjList[n];
1606  }
1607  Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
1608  host_initial_selected_coords_output_permutation_tcoords);
1609 
1610  //partitioning of tasks
1611  mj_partitioner.sequential_task_partitioning(
1612  env,
1613  this->no_tasks,
1614  this->no_tasks,
1615  num_parts,
1616  this->task_coord_dim,
1617  //minCoordDim,
1618  kokkos_tcoords,
1619  initial_selected_coords_output_permutation_tcoords,
1620  task_xadj,
1621  recursion_depth,
1623  task_partition_along_longest_dim,
1626  num_group_count,
1627  kokkos_group_count);
1628  env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
1629 
1630  Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
1631  initial_selected_coords_output_permutation_tcoords);
1632  for(int n = 0; n < this->no_tasks; ++n) {
1633  task_adjList[n] =
1634  host_initial_selected_coords_output_permutation_tcoords(n);
1635  }
1636 
1637 // std::cout << "myrank:" << myRank << std::endl;
1638 // comm_->barrier();
1639 // std::cout << "mj_partitioner.sequential_task_partitioning over"
1640 // << std::endl;
1641 
1642  delete [] tcoords;
1643  delete [] permutation;
1644 
1645 
1646  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1647  for(part_t i = 0; i < num_parts; ++i) {
1648 
1649  part_t proc_index_begin = proc_xadj[i];
1650  part_t task_begin_index = task_xadj[i];
1651  part_t proc_index_end = proc_xadj[i + 1];
1652  part_t task_end_index = task_xadj[i + 1];
1653 
1654 
1655  if(proc_index_end - proc_index_begin != 1) {
1656  std::cerr << "Error at partitioning of processors" << std::endl;
1657  std::cerr << "PART:" << i << " is assigned to "
1658  << proc_index_end - proc_index_begin << " processors."
1659  << std::endl;
1660  std::terminate();
1661  }
1662  part_t assigned_proc = proc_adjList[proc_index_begin];
1663  proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1664  }
1665 
1666  //holds the pointer to the task array
1667  //convert proc_to_task_xadj to CSR index array
1668  part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
1669  part_t sum = 0;
1670  for(part_t i = 0; i < this->no_procs; ++i) {
1671  part_t tmp = proc_to_task_xadj[i];
1672  proc_to_task_xadj[i] = sum;
1673  sum += tmp;
1674  proc_to_task_xadj_work[i] = sum;
1675  }
1676  proc_to_task_xadj[this->no_procs] = sum;
1677 
1678  for(part_t i = 0; i < num_parts; ++i){
1679 
1680  part_t proc_index_begin = proc_xadj[i];
1681  part_t task_begin_index = task_xadj[i];
1682  part_t task_end_index = task_xadj[i + 1];
1683 
1684  part_t assigned_proc = proc_adjList[proc_index_begin];
1685 
1686  for (part_t j = task_begin_index; j < task_end_index; ++j) {
1687  part_t taskId = task_adjList[j];
1688 
1689  task_to_proc[taskId] = assigned_proc;
1690 
1691  proc_to_task_adj [--proc_to_task_xadj_work[assigned_proc]] = taskId;
1692  }
1693  }
1694 
1695 /*
1696  if (myPermutation == 0) {
1697  std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);
1698 
1699  for (part_t i = 0; i < num_parts; ++i) {
1700 
1701  part_t proc_index_begin = proc_xadj[i];
1702  part_t proc_index_end = proc_xadj[i + 1];
1703 
1704  if (proc_index_end - proc_index_begin != 1) {
1705  std::cerr << "Error at partitioning of processors" << std::endl;
1706  std::cerr << "PART:" << i << " is assigned to "
1707  << proc_index_end - proc_index_begin << " processors."
1708  << std::endl;
1709  exit(1);
1710  }
1711 
1712  part_t assigned_proc = proc_adjList[proc_index_begin];
1713  gnuPlotCode << "Rank:" << i << " "
1714  << this->proc_coords[0][assigned_proc] << " "
1715  << this->proc_coords[1][assigned_proc] << " "
1716  << this->proc_coords[2][assigned_proc] << " "
1717  << pcoords[0][assigned_proc] << " "
1718  << pcoords[1][assigned_proc] << " "
1719  << pcoords[2][assigned_proc] << " "
1720  << pcoords[3][assigned_proc] << std::endl;
1721  }
1722 
1723  gnuPlotCode << "Machine Extent:" << std::endl;
1724  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1725  for (part_t i = 0; i < num_parts; ++i) {
1726 
1727  part_t proc_index_begin = proc_xadj[i];
1728  part_t proc_index_end = proc_xadj[i + 1];
1729 
1730  if (proc_index_end - proc_index_begin != 1) {
1731  std::cerr << "Error at partitioning of processors" << std::endl;
1732  std::cerr << "PART:" << i << " is assigned to "
1733  << proc_index_end - proc_index_begin << " processors."
1734  << std::endl;
1735  exit(1);
1736  }
1737 
1738  part_t assigned_proc = proc_adjList[proc_index_begin];
1739  gnuPlotCode << "Rank:" << i << " "
1740  << this->proc_coords[0][assigned_proc] << " "
1741  << this->proc_coords[1][assigned_proc] << " "
1742  << this->proc_coords[2][assigned_proc] << std::endl;
1743  }
1744  gnuPlotCode.close();
1745  }
1746 */
1747 
1748  delete [] proc_to_task_xadj_work;
1749  delete [] task_xadj;
1750  delete [] task_adjList;
1751  delete [] proc_xadj;
1752  delete [] proc_adjList;
1753  }
1754 
1755 };
1756 
1757 template <typename Adapter, typename part_t>
1759 protected:
1760 
1761 #ifndef DOXYGEN_SHOULD_SKIP_THIS
1762 
1763  typedef typename Adapter::scalar_t pcoord_t;
1764  typedef typename Adapter::scalar_t tcoord_t;
1765  typedef typename Adapter::scalar_t scalar_t;
1766  typedef typename Adapter::lno_t lno_t;
1767 
1768 #if defined(KOKKOS_ENABLE_CUDA)
1769  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
1770  Kokkos::Cuda, Kokkos::CudaSpace>;
1771 #elif defined(KOKKOS_ENABLE_HIP)
1772  using node_t = Tpetra::KokkosCompat::KokkosDeviceWrapperNode<
1773  Kokkos::HIP, Kokkos::HIPSpace>;
1774 #else
1775  using node_t = typename Adapter::node_t;
1776 #endif
1777 
1778 #endif
1779 
1780 // RCP<const Environment> env;
1781 
1782  // Holds the pointer to the task array
1783  ArrayRCP<part_t> proc_to_task_xadj;
1784 // = allocMemory<part_t> (this->no_procs + 1);
1785 
1786  // Holds the indices of tasks wrt to proc_to_task_xadj array.
1787  ArrayRCP<part_t> proc_to_task_adj;
1788 // = allocMemory<part_t>(this->no_tasks);
1789 
1790  // Holds the processors mapped to tasks.
1791  ArrayRCP<part_t> task_to_proc;
1792 // = allocMemory<part_t>(this->no_procs);
1793 
1794  // Holds the processors mapped to tasks.
1795  ArrayRCP<part_t> local_task_to_rank;
1796 // = allocMemory<part_t>(this->no_procs);
1797 
1802  ArrayRCP<part_t> task_communication_xadj;
1803  ArrayRCP<part_t> task_communication_adj;
1805 
1806 
1810  void doMapping(int myRank,
1811  const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {
1812 
1813  if (this->proc_task_comm) {
1814  this->proc_task_comm->getMapping(
1815  myRank,
1816  this->env,
1817  // Holds the pointer to the task array
1818  this->proc_to_task_xadj,
1819  // Holds the indices of task wrt to proc_to_task_xadj array
1820  this->proc_to_task_adj,
1821  // Holds the processors mapped to tasks
1822  this->task_to_proc,
1823  comm_
1824  );
1825  }
1826  else {
1827  std::cerr << "communicationModel is not specified in the Mapper"
1828  << std::endl;
1829  exit(1);
1830  }
1831  }
1832 
1833 
1837  RCP<Comm<int> > create_subCommunicator() {
1838  int procDim = this->proc_task_comm->proc_coord_dim;
1839  int taskDim = this->proc_task_comm->task_coord_dim;
1840 
1841  // Number of permutations for tasks and processors
1842  int taskPerm = 1;
1843  int procPerm = 1;
1844 
1845  // Get number of different permutations for task dimension ordering
1846  if (taskDim <= 8)
1847  taskPerm = z2Fact<int>(taskDim);
1848  // Prevent overflow
1849  else
1850  taskPerm = z2Fact<int>(8);
1851 
1852  // Get number of different permutations for proc dimension ordering
1853  if (procDim <= 8)
1854  procPerm = z2Fact<int>(procDim);
1855  // Prevent overflow
1856  else
1857  procPerm = z2Fact<int>(8);
1858 
1859  // Total number of permutations
1860  int idealGroupSize = taskPerm * procPerm;
1861 
1862  // For the one that does longest dimension partitioning.
1863  idealGroupSize += taskPerm + procPerm + 1;
1864 
1865  int myRank = this->comm->getRank();
1866  int commSize = this->comm->getSize();
1867 
1868  int myGroupIndex = myRank / idealGroupSize;
1869 
1870  int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1871  if (prevGroupBegin < 0) prevGroupBegin = 0;
1872  int myGroupBegin = myGroupIndex * idealGroupSize;
1873  int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1874  int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1875 
1876  if (myGroupEnd > commSize) {
1877  myGroupBegin = prevGroupBegin;
1878  myGroupEnd = commSize;
1879  }
1880  if (nextGroupEnd > commSize) {
1881  myGroupEnd = commSize;
1882  }
1883  int myGroupSize = myGroupEnd - myGroupBegin;
1884 
1885  part_t *myGroup = new part_t[myGroupSize];
1886  for (int i = 0; i < myGroupSize; ++i) {
1887  myGroup[i] = myGroupBegin + i;
1888  }
1889 // std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
1890 // << " myGroupEnd:" << myGroupEnd << endl;
1891 
1892  ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1893 
1894  RCP<Comm<int> > subComm =
1895  this->comm->createSubcommunicator(myGroupView);
1896  delete [] myGroup;
1897  return subComm;
1898  }
1899 
1900 
1905  //create the sub group.
1906  RCP<Comm<int> > subComm = this->create_subCommunicator();
1907  //calculate cost.
1908  double myCost = this->proc_task_comm->getCommunicationCostMetric();
1909 // std::cout << "me:" << this->comm->getRank() << " myCost:"
1910 // << myCost << std::endl;
1911  double localCost[2], globalCost[2];
1912 
1913  localCost[0] = myCost;
1914  localCost[1] = double(subComm->getRank());
1915 
1916  globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
1918  reduceAll<int, double>(*subComm, reduceBest,
1919  2, localCost, globalCost);
1920 
1921  int sender = int(globalCost[1]);
1922 
1923 /*
1924  if ( this->comm->getRank() == 0) {
1925  std::cout << "me:" << localCost[1] <<
1926  " localcost:" << localCost[0]<<
1927  " bestcost:" << globalCost[0] <<
1928  " Sender:" << sender <<
1929  " procDim" << proc_task_comm->proc_coord_dim <<
1930  " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
1931  }
1932 */
1933 
1934 // std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
1935 // << " bestcost:" << globalCost[0] << endl;
1936 // std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
1937 // << endl;
1938  broadcast(*subComm, sender, this->ntasks,
1939  this->task_to_proc.getRawPtr());
1940  broadcast(*subComm, sender, this->nprocs,
1941  this->proc_to_task_xadj.getRawPtr());
1942  broadcast(*subComm, sender, this->ntasks,
1943  this->proc_to_task_adj.getRawPtr());
1944  }
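What getBestMapping does is a classic min-loc reduction followed by a broadcast from the winner. Below is a hedged sketch using only stock Teuchos collectives (the method itself uses the custom Zoltan2_ReduceBestMapping operator, which folds the rank tie-break into the reduction):

#include <Teuchos_CommHelpers.hpp>

// Sketch: elect the minimum-cost rank and broadcast its mapping array.
void pickAndShareBest(const Teuchos::Comm<int> &comm,
                      double myCost, int *mapping, int mappingLen) {
  double bestCost = 0;
  Teuchos::reduceAll(comm, Teuchos::REDUCE_MIN, 1, &myCost, &bestCost);

  // Smallest rank whose cost equals the minimum wins. Exact comparison
  // is safe here: bestCost is bitwise one of the reduced inputs.
  int myVote = (myCost == bestCost) ? comm.getRank() : comm.getSize();
  int winner = 0;
  Teuchos::reduceAll(comm, Teuchos::REDUCE_MIN, 1, &myVote, &winner);

  Teuchos::broadcast(comm, winner, mappingLen, mapping);
}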
1945 
 1946  // Write the mapping as gnuplot commands for visualization.
1947  void writeMapping() {
1948  std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
1949 
1950  int mindim = MINOF(proc_task_comm->proc_coord_dim,
1951  proc_task_comm->task_coord_dim);
1952  std::string ss = "";
1953  for (part_t i = 0; i < this->nprocs; ++i) {
1954 
1955  std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
1956  if (i == 0) {
1957  gnuPlotCode << "plot \"" << procFile << "\"\n";
1958  }
1959  else {
1960  gnuPlotCode << "replot \"" << procFile << "\"\n";
1961  }
1962 
1963  std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1964 
1965  std::string gnuPlotArrow = "set arrow from ";
1966  for (int j = 0; j < mindim; ++j) {
1967  if (j == mindim - 1) {
1968  inpFile << proc_task_comm->proc_coords[j][i];
1969  gnuPlotArrow +=
1970  Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
1971 
1972  }
1973  else {
1974  inpFile << proc_task_comm->proc_coords[j][i] << " ";
1975  gnuPlotArrow +=
1976  Teuchos::toString<float>(proc_task_comm->
1977  proc_coords[j][i]) + ",";
1978  }
1979  }
1980  gnuPlotArrow += " to ";
1981 
1982  inpFile << std::endl;
1983  ArrayView<part_t> a = this->getAssignedTasksForProc(i);
1984 
1985  for (int k = 0; k < a.size(); ++k) {
1986  int j = a[k];
1987 // std::cout << "i:" << i << " j:"
1988  std::string gnuPlotArrow2 = gnuPlotArrow;
1989  for (int z = 0; z < mindim; ++z) {
1990  if (z == mindim - 1) {
1991 
1992 // std::cout << "z:" << z << " j:" << j << " "
1993 // << proc_task_comm->task_coords[z][j] << endl;
1994  inpFile << proc_task_comm->task_coords[z][j];
1995  gnuPlotArrow2 +=
1996  Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
1997  }
1998  else {
1999  inpFile << proc_task_comm->task_coords[z][j] << " ";
2000  gnuPlotArrow2 +=
2001  Teuchos::toString<float>(proc_task_comm->
2002  task_coords[z][j]) + ",";
2003  }
2004  }
2005  ss += gnuPlotArrow2 + "\n";
2006  inpFile << std::endl;
2007  }
2008  inpFile.close();
2009  }
2010  gnuPlotCode << ss;
2011  gnuPlotCode << "\nreplot\n pause -1 \n";
2012  gnuPlotCode.close();
2013  }
2014 
 2015  // Write the mapping as gnuplot commands for visualization.
2016  void writeMapping2(int myRank) {
2017  std::string rankStr = Teuchos::toString<int>(myRank);
 2018  std::string gnuPlots = "gnuPlot", extensionS = ".plot";
 2019  std::string outF = gnuPlots + rankStr + extensionS;
2020  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
2021 
 2022  CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
2023  *tmpproc_task_comm =
2024  static_cast <CoordinateCommunicationModel<
2025  pcoord_t, tcoord_t, part_t, node_t> * > (
2026  proc_task_comm);
2027 
2028 // int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
2029 // tmpproc_task_comm->task_coord_dim);
2030  int mindim = tmpproc_task_comm->proc_coord_dim;
2031  if (mindim != 3) {
 2032  std::cerr << "Mapping write is only supported for 3 dimensions" << std::endl;
2033  return;
2034  }
2035  std::string ss = "";
2036  std::string procs = "";
2037 
2038  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2039  for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
2040  ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
2041  if (a.size() == 0) {
2042  continue;
2043  }
2044 
2045  std::string gnuPlotArrow = "set arrow from ";
2046  for (int j = 0; j < mindim; ++j) {
2047  if (j == mindim - 1) {
2048  gnuPlotArrow +=
2049  Teuchos::toString<float>(tmpproc_task_comm->
2050  proc_coords[j][origin_rank]);
2051  procs +=
2052  Teuchos::toString<float>(tmpproc_task_comm->
2053  proc_coords[j][origin_rank]);
2054 
2055  }
2056  else {
2057  gnuPlotArrow +=
2058  Teuchos::toString<float>(tmpproc_task_comm->
2059  proc_coords[j][origin_rank]) + ",";
2060  procs +=
2061  Teuchos::toString<float>(tmpproc_task_comm->
2062  proc_coords[j][origin_rank])+ " ";
2063  }
2064  }
2065  procs += "\n";
2066 
2067  gnuPlotArrow += " to ";
2068 
2069 
2070  for (int k = 0; k < a.size(); ++k) {
2071  int origin_task = a[k];
2072 
2073  for (int nind = task_communication_xadj[origin_task];
2074  nind < task_communication_xadj[origin_task + 1]; ++nind) {
2075  int neighbor_task = task_communication_adj[nind];
2076 
2077  bool differentnode = false;
2078 
2079  int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2080 
2081  for (int j = 0; j < mindim; ++j) {
2082  if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2083  int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2084  differentnode = true; break;
2085  }
2086  }
2087  std::tuple<int,int,int, int, int, int> foo(
2088  int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2089  int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2090  int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2091  int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2092  int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2093  int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2094 
2095 
2096  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2097  my_arrows.insert(foo);
2098 
2099  std::string gnuPlotArrow2 = "";
2100  for (int j = 0; j < mindim; ++j) {
2101  if (j == mindim - 1) {
2102  gnuPlotArrow2 +=
2103  Teuchos::toString<float>(tmpproc_task_comm->
2104  proc_coords[j][neighbor_rank]);
2105  }
2106  else {
2107  gnuPlotArrow2 +=
2108  Teuchos::toString<float>(tmpproc_task_comm->
2109  proc_coords[j][neighbor_rank]) + ",";
2110  }
2111  }
2112  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2113  }
2114  }
2115  }
2116  }
2117 
2118  std::ofstream procFile("procPlot.plot", std::ofstream::out);
2119  procFile << procs << "\n";
2120  procFile.close();
2121 
2122  //gnuPlotCode << ss;
2123  if (mindim == 2) {
2124  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2125  } else {
2126  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2127  }
2128 
2129  gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2130  gnuPlotCode.close();
2131  }
2132 
2133 
2134 // KDD Need to provide access to algorithm for getPartBoxes
2135 #ifdef gnuPlot
2136  void writeGnuPlot(
2137  const Teuchos::Comm<int> *comm_,
 2138  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> > soln_,
2139  int coordDim,
2140  tcoord_t **partCenters) {
2141 
2142  std::string file = "gggnuPlot";
2143  std::string exten = ".plot";
2144  std::ofstream mm("2d.txt");
2145  file += Teuchos::toString<int>(comm_->getRank()) + exten;
2146  std::ofstream ff(file.c_str());
2147 // ff.seekg(0, ff.end);
2148  std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2149  outPartBoxes =
 2150  soln_->
2151  getPartBoxesView();
2152 
2153  for (part_t i = 0; i < this->ntasks;++i) {
2154  outPartBoxes[i].writeGnuPlot(ff, mm);
2155  }
2156  if (coordDim == 2) {
2157  ff << "plot \"2d.txt\"" << std::endl;
2158 // ff << "\n pause -1" << endl;
2159  }
2160  else {
2161  ff << "splot \"2d.txt\"" << std::endl;
2162 // ff << "\n pause -1" << endl;
2163  }
2164  mm.close();
2165 
2166  ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2167  << std::endl;
2168 
2169  for (part_t i = 0; i < this->ntasks; ++i) {
 2170  part_t pb = task_communication_xadj[i];
2171  part_t pe = task_communication_xadj[i + 1];
2172 
2173  for (part_t p = pb; p < pe; ++p) {
 2174  part_t n = task_communication_adj[p];
2175 
2176 // std::cout << "i:" << i << " n:" << n << endl;
2177  std::string arrowline = "set arrow from ";
2178  for (int j = 0; j < coordDim - 1; ++j) {
2179  arrowline +=
2180  Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2181  }
2182  arrowline +=
2183  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2184  " to ";
2185 
2186  for (int j = 0; j < coordDim - 1; ++j) {
2187  arrowline +=
2188  Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2189  }
2190  arrowline +=
2191  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2192  " as 5\n";
2193 
2194 // std::cout << "arrow:" << arrowline << endl;
2195  ff << arrowline;
2196  }
2197  }
2198 
2199  ff << "replot\n pause -1" << std::endl;
2200  ff.close();
2201  }
2202 #endif // gnuPlot
2203 
2204 public:
2205 
2206  void getProcTask(part_t* &proc_to_task_xadj_,
2207  part_t* &proc_to_task_adj_) {
2208  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2209  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2210  }
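A hypothetical caller-side sketch (the helper name and arguments are illustrative): the two arrays returned by getProcTask form a CSR structure, so listing each rank's tasks is a standard offset walk.

#include <iostream>

// Sketch: print every (rank, task) pair from the CSR mapping arrays.
template <typename mapper_t, typename part_t>
void printProcTask(mapper_t &mapper, part_t nRanks) {
  part_t *xadj = NULL, *adj = NULL;
  mapper.getProcTask(xadj, adj);     // borrows the mapper's arrays
  for (part_t p = 0; p < nRanks; ++p)
    for (part_t t = xadj[p]; t < xadj[p + 1]; ++t)
      std::cout << "rank " << p << " <- task " << adj[t] << "\n";
}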
2211 
2212  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2213 
2214  // Mapping was already computed in the constructor; we need to store it
2215  // in the solution.
2216  mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2217 
2218  // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2219  // KDDKDD is not needed once we use MappingSolution to answer queries
2220  // KDDKDD instead of this algorithm.
2221  // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2222  // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2223  }
2224 
2225 
 2226  virtual ~CoordinateTaskMapper() {
2227  //freeArray<part_t>(proc_to_task_xadj);
2228  //freeArray<part_t>(proc_to_task_adj);
2229  //freeArray<part_t>(task_to_proc);
2230  if(this->isOwnerofModel) {
2231  delete this->proc_task_comm;
2232  }
2233  }
2234 
 2235  void create_local_task_to_rank(
2236  const lno_t num_local_coords,
2237  const part_t *local_coord_parts,
2238  const ArrayRCP<part_t> task_to_proc_) {
2239  local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2240 
2241  for (lno_t i = 0; i < num_local_coords; ++i) {
2242  part_t local_coord_part = local_coord_parts[i];
2243  part_t rank_index = task_to_proc_[local_coord_part];
2244  local_task_to_rank[i] = rank_index;
2245  }
2246  }
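 // Worked example (hypothetical values): with local_coord_parts = {2, 0, 2}
 // and task_to_proc_ = {5, 7, 1}, local_task_to_rank becomes {1, 5, 1},
 // since coordinate i lands on rank task_to_proc_[local_coord_parts[i]].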
2247 
2248 
2249 
 2250  /*! \brief Constructor. When this constructor is called, in order to
 2251   * calculate the communication metric, the part communication graph is
 2252   * derived from the adapter's graph model when one is available.
 2253   */
 2263  CoordinateTaskMapper(
2264  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2265  const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2266  machine_,
2267  const Teuchos::RCP <const Adapter> input_adapter_,
2268  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2269  soln_,
2270  const Teuchos::RCP <const Environment> envConst,
2271  bool is_input_adapter_distributed = true,
2272  int num_ranks_per_node = 1,
2273  bool divide_to_prime_first = false,
2274  bool reduce_best_mapping = true):
2275  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2276  soln_, envConst),
2277  proc_to_task_xadj(0),
2278  proc_to_task_adj(0),
2279  task_to_proc(0),
2280  isOwnerofModel(true),
2281  proc_task_comm(0),
 2282  task_communication_xadj(0),
 2283  task_communication_adj(0),
 2284  task_communication_edge_weight(0) {
2285 
2286  using namespace Teuchos;
2287  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2288 
2289  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2290  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2291 
2292  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2293  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2294  if (!is_input_adapter_distributed) {
2295  ia_comm = Teuchos::createSerialComm<int>();
2296  }
2297 
2298  RCP<const Environment> envConst_ = envConst;
2299 
2300  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2301  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2302  input_adapter_.getRawPtr()), false));
2303 
2304  modelFlag_t coordFlags_, graphFlags_;
2305 
2306  //create coordinate model
2307  //since this is coordinate task mapper,
2308  //the adapter has to have the coordinates
2309  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2310  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2311 
2312  //if the adapter has also graph model, we will use graph model
2313  //to calculate the cost mapping.
2314  BaseAdapterType inputType_ = input_adapter_->adapterType();
2315  if (inputType_ == MatrixAdapterType ||
2316  inputType_ == GraphAdapterType ||
2317  inputType_ == MeshAdapterType)
2318  {
2319  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2320  baseInputAdapter_, envConst_, ia_comm,
2321  graphFlags_));
2322  }
2323 
2324  if (!machine_->hasMachineCoordinates()) {
2325  throw std::runtime_error("Existing machine does not provide "
2326  "coordinates for coordinate task mapping");
2327  }
2328 
2329  //if mapping type is 0 then it is coordinate mapping
2330  int procDim = machine_->getMachineDim();
2331  this->nprocs = machine_->getNumRanks();
2332 
2333  //get processor coordinates.
2334  pcoord_t **procCoordinates = NULL;
2335  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2336  throw std::runtime_error("Existing machine does not implement "
2337  "getAllMachineCoordinatesView");
2338  }
2339 
 2340  //get the machine extent.
 2341  //if the machine has an extent and wrap-around links, we would like
 2342  //to shift the coordinates so that the largest gap becomes the
 2343  //wrap-around.
2344  std::vector<int> machine_extent_vec(procDim);
2345  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2346  int *machine_extent = &(machine_extent_vec[0]);
2347  bool *machine_extent_wrap_around = new bool[procDim];
2348  for (int i = 0; i < procDim; ++i)
2349  machine_extent_wrap_around[i] = false;
2350 
2351  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2352 
2353  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2354  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2355  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2356  // MD: Yes, I ADDED BELOW:
2357  if (machine_->getMachineExtent(machine_extent) &&
2358  haveWrapArounds) {
2359 
2360  procCoordinates =
 2361  this->shiftMachineCoordinates(
2362  procDim,
2363  machine_extent,
2364  machine_extent_wrap_around,
2365  this->nprocs,
2366  procCoordinates);
2367  }
2368 
2369  //get the tasks information, such as coordinate dimension,
2370  //number of parts.
2371  int coordDim = coordinateModel_->getCoordinateDim();
2372 
2373 // int coordDim = machine_->getMachineDim();
2374 
2375  this->ntasks = soln_->getActualGlobalNumberOfParts();
2376  if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2377  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2378  }
2379  this->solution_parts = soln_->getPartListView();
2380 
2381  //we need to calculate the center of parts.
2382  tcoord_t **partCenters = new tcoord_t*[coordDim];
2383  for (int i = 0; i < coordDim; ++i) {
2384  partCenters[i] = new tcoord_t[this->ntasks];
2385  }
2386 
2387  typedef typename Adapter::scalar_t t_scalar_t;
2388 
2389 
2390  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2391 
2392  //get centers for the parts.
2393  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2394  envConst.getRawPtr(),
2395  ia_comm.getRawPtr(),
2396  coordinateModel_.getRawPtr(),
2397  this->solution_parts,
2398 // soln_->getPartListView();
2399 // this->soln.getRawPtr(),
2400  coordDim,
2401  ntasks,
2402  partCenters);
2403 
2404  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2405 
2406  //create the part graph
2407  if (graph_model_.getRawPtr() != NULL) {
2408  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2409  envConst.getRawPtr(),
2410  ia_comm.getRawPtr(),
2411  graph_model_.getRawPtr(),
2412  this->ntasks,
2413  this->solution_parts,
2414 // soln_->getPartListView(),
2415 // this->soln.getRawPtr(),
2419  );
2420  }
2421 
2422  //create coordinate communication model.
 2423  this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2424  pcoord_t, tcoord_t, part_t, node_t>(
2425  procDim,
2426  procCoordinates,
2427  coordDim,
2428  partCenters,
2429  this->nprocs,
2430  this->ntasks,
2431  machine_extent,
2432  machine_extent_wrap_around,
2433  machine_.getRawPtr());
2434 
2435  int myRank = comm_->getRank();
2436  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2437  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2438 
2439  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2440  this->doMapping(myRank, comm_);
2441  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2442 
2443  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2444 
2445 /*
2446  soln_->getCommunicationGraph(task_communication_xadj,
2447  task_communication_adj);
2448 */
2449 
2450  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2451  #ifdef gnuPlot1
2452  if (comm_->getRank() == 0) {
2453 
2454  part_t taskCommCount = task_communication_xadj.size();
2455  std::cout << " TotalComm:"
2456  << task_communication_xadj[taskCommCount] << std::endl;
2457  part_t maxN = task_communication_xadj[0];
2458  for (part_t i = 1; i <= taskCommCount; ++i) {
2459  part_t nc =
 2460  task_communication_xadj[i] - task_communication_xadj[i - 1];
2461  if (maxN < nc)
2462  maxN = nc;
2463  }
2464  std::cout << " maxNeighbor:" << maxN << std::endl;
2465  }
2466 
2467  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2468  #endif
2469 
2470  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2471 
2472  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2473  task_communication_adj.getRawPtr()) {
2474  this->proc_task_comm->calculateCommunicationCost(
2475  task_to_proc.getRawPtr(),
2476  task_communication_xadj.getRawPtr(),
2477  task_communication_adj.getRawPtr(),
2478  task_communication_edge_weight.getRawPtr()
2479  );
2480  }
2481 
2482 // std::cout << "me: " << comm_->getRank() << " cost:"
2483 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2484 
2485  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2486 
 2487  //processors are divided into groups of size procDim! * coordDim!
 2488  //each processor in the group obtains a mapping with a different
 2489  //rotation and the best one is broadcast to all processors.
2490  this->getBestMapping();
 2491  this->create_local_task_to_rank(
2492  coordinateModel_->getLocalNumCoordinates(),
2493  this->solution_parts,
2494  this->task_to_proc);
2495 /*
2496  {
2497  if (task_communication_xadj.getRawPtr() &&
2498  task_communication_adj.getRawPtr())
2499  this->proc_task_comm->calculateCommunicationCost(
2500  task_to_proc.getRawPtr(),
2501  task_communication_xadj.getRawPtr(),
2502  task_communication_adj.getRawPtr(),
2503  task_communication_edge_weight.getRawPtr()
2504  );
2505  std::cout << "me: " << comm_->getRank() << " cost:"
2506  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2507  }
2508 */
2509 
2510 
2511  #ifdef gnuPlot
2512  this->writeMapping2(comm_->getRank());
2513  #endif
2514 
2515  delete [] machine_extent_wrap_around;
2516 
2517  if (machine_->getMachineExtent(machine_extent) &&
2518  haveWrapArounds) {
2519  for (int i = 0; i < procDim; ++i) {
2520  delete [] procCoordinates[i];
2521  }
2522  delete [] procCoordinates;
2523  }
2524 
2525  for (int i = 0; i < coordDim; ++i) {
2526  delete [] partCenters[i];
2527  }
2528  delete [] partCenters;
2529 
2530 
2531  }
2532 
2533 
 2534  /*! \brief Constructor. Instead of a Solution we have two parameters:
 2535   * the number of parts and the resulting part array.
 2536   */
 2550  CoordinateTaskMapper(
2551  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2552  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2553  machine_,
2554  const Teuchos::RCP <const Adapter> input_adapter_,
2555  const part_t num_parts_,
2556  const part_t *result_parts,
2557  const Teuchos::RCP <const Environment> envConst,
2558  bool is_input_adapter_distributed = true,
2559  int num_ranks_per_node = 1,
2560  bool divide_to_prime_first = false,
2561  bool reduce_best_mapping = true):
2562  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2563  num_parts_, result_parts, envConst),
2564  proc_to_task_xadj(0),
2565  proc_to_task_adj(0),
2566  task_to_proc(0),
2567  isOwnerofModel(true),
2568  proc_task_comm(0),
 2569  task_communication_xadj(0),
 2570  task_communication_adj(0),
 2571  task_communication_edge_weight(0) {
2572 
2573  using namespace Teuchos;
2574  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2575 
2576  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2577  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2578 
2579  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2580  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2581  if (!is_input_adapter_distributed) {
2582  ia_comm = Teuchos::createSerialComm<int>();
2583  }
2584  RCP<const Environment> envConst_ = envConst;
2585 
2586  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2587  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2588  input_adapter_.getRawPtr()), false));
2589 
2590  modelFlag_t coordFlags_, graphFlags_;
2591 
2592  //create coordinate model
2593  //since this is coordinate task mapper,
2594  //the adapter has to have the coordinates
2595  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2596  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2597 
2598  //if the adapter has also graph model, we will use graph model
2599  //to calculate the cost mapping.
2600  BaseAdapterType inputType_ = input_adapter_->adapterType();
2601  if (inputType_ == MatrixAdapterType ||
2602  inputType_ == GraphAdapterType ||
2603  inputType_ == MeshAdapterType)
2604  {
2605  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2606  baseInputAdapter_, envConst_, ia_comm,
2607  graphFlags_));
2608  }
2609 
2610  if (!machine_->hasMachineCoordinates()) {
2611  throw std::runtime_error("Existing machine does not provide "
2612  "coordinates for coordinate task mapping.");
2613  }
2614 
2615  //if mapping type is 0 then it is coordinate mapping
2616  int procDim = machine_->getMachineDim();
2617  this->nprocs = machine_->getNumRanks();
2618 
2619  //get processor coordinates.
2620  pcoord_t **procCoordinates = NULL;
2621  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2622  throw std::runtime_error("Existing machine does not implement "
2623  "getAllMachineCoordinatesView");
2624  }
2625 
 2626  //get the machine extent.
 2627  //if the machine has an extent and wrap-around links, we would like
 2628  //to shift the coordinates so that the largest gap becomes the
 2629  //wrap-around.
 2630 
2631  std::vector<int> machine_extent_vec(procDim);
2632 // std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2633  int *machine_extent = &(machine_extent_vec[0]);
2634  bool *machine_extent_wrap_around = new bool[procDim];
2635  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2636 
2637  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2638  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2639  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2640  // MD: Yes, I ADDED BELOW:
2641  if (machine_->getMachineExtent(machine_extent) &&
2642  haveWrapArounds) {
2643  procCoordinates =
 2644  this->shiftMachineCoordinates(
2645  procDim,
2646  machine_extent,
2647  machine_extent_wrap_around,
2648  this->nprocs,
2649  procCoordinates);
2650  }
2651 
2652  //get the tasks information, such as coordinate dimension,
2653  //number of parts.
2654  int coordDim = coordinateModel_->getCoordinateDim();
2655 
2656 // int coordDim = machine_->getMachineDim();
2657 
2658 
2659  this->ntasks = num_parts_;
2660  this->solution_parts = result_parts;
2661 
2662  //we need to calculate the center of parts.
2663  tcoord_t **partCenters = new tcoord_t*[coordDim];
2664  for (int i = 0; i < coordDim; ++i) {
2665  partCenters[i] = new tcoord_t[this->ntasks];
2666  }
2667 
2668  typedef typename Adapter::scalar_t t_scalar_t;
2669 
2670 
2671  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2672 
2673  //get centers for the parts.
2674  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2675  envConst.getRawPtr(),
2676  ia_comm.getRawPtr(),
2677  coordinateModel_.getRawPtr(),
2678  this->solution_parts,
2679 // soln_->getPartListView();
2680 // this->soln.getRawPtr(),
2681  coordDim,
2682  ntasks,
2683  partCenters);
2684 
2685  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2686 
2687  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2688  //create the part graph
2689  if (graph_model_.getRawPtr() != NULL) {
2690  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2691  envConst.getRawPtr(),
2692  ia_comm.getRawPtr(),
2693  graph_model_.getRawPtr(),
2694  this->ntasks,
2695  this->solution_parts,
2696 // soln_->getPartListView(),
2697 // this->soln.getRawPtr(),
 2698  task_communication_xadj,
 2699  task_communication_adj,
 2700  task_communication_edge_weight
2701  );
2702  }
2703  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2704 
2705  envConst->timerStart(MACRO_TIMERS,
2706  "CoordinateCommunicationModel Create");
2707  //create coordinate communication model.
 2708  this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2709  pcoord_t, tcoord_t, part_t, node_t>(
2710  procDim,
2711  procCoordinates,
2712  coordDim,
2713  partCenters,
2714  this->nprocs,
2715  this->ntasks,
2716  machine_extent,
2717  machine_extent_wrap_around,
2718  machine_.getRawPtr());
2719 
2720  envConst->timerStop(MACRO_TIMERS,
2721  "CoordinateCommunicationModel Create");
2722 
2723 
2724  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2725  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2726 
2727  int myRank = comm_->getRank();
2728 
2729 
2730  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2731  this->doMapping(myRank, comm_);
2732  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2733 
2734 
2735  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2736 
2737 /*
2738  soln_->getCommunicationGraph(task_communication_xadj,
2739  task_communication_adj);
2740 */
2741 
2742  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2743  #ifdef gnuPlot1
2744  if (comm_->getRank() == 0) {
2745 
2746  part_t taskCommCount = task_communication_xadj.size();
2747  std::cout << " TotalComm:"
2748  << task_communication_xadj[taskCommCount] << std::endl;
2749  part_t maxN = task_communication_xadj[0];
2750  for (part_t i = 1; i <= taskCommCount; ++i) {
2751  part_t nc =
 2752  task_communication_xadj[i] - task_communication_xadj[i - 1];
2753  if (maxN < nc)
2754  maxN = nc;
2755  }
2756  std::cout << " maxNeighbor:" << maxN << std::endl;
2757  }
2758 
2759  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2760  #endif
2761 
2762  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2763 
2764  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2765  task_communication_adj.getRawPtr()) {
2766  this->proc_task_comm->calculateCommunicationCost(
2767  task_to_proc.getRawPtr(),
2768  task_communication_xadj.getRawPtr(),
2769  task_communication_adj.getRawPtr(),
2770  task_communication_edge_weight.getRawPtr()
2771  );
2772  }
2773 
2774 // std::cout << "me: " << comm_->getRank() << " cost:"
2775 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2776 
2777  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2778 
 2779  //processors are divided into groups of size procDim! * coordDim!
 2780  //each processor in the group obtains a mapping with a different rotation
 2781  //and the best one is broadcast to all processors.
2782  this->getBestMapping();
2783 
 2784  this->create_local_task_to_rank(
2785  coordinateModel_->getLocalNumCoordinates(),
2786  this->solution_parts,
2787  this->task_to_proc);
2788 /*
2789  {
2790  if (task_communication_xadj.getRawPtr() &&
2791  task_communication_adj.getRawPtr())
2792  this->proc_task_comm->calculateCommunicationCost(
2793  task_to_proc.getRawPtr(),
2794  task_communication_xadj.getRawPtr(),
2795  task_communication_adj.getRawPtr(),
2796  task_communication_edge_weight.getRawPtr()
2797  );
2798  std::cout << "me: " << comm_->getRank() << " cost:"
2799  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2800  }
2801 */
2802 
2803 
2804 
2805  #ifdef gnuPlot
2806  this->writeMapping2(comm_->getRank());
2807  #endif
2808 
2809  delete [] machine_extent_wrap_around;
2810 
2811  if (machine_->getMachineExtent(machine_extent) &&
2812  haveWrapArounds) {
2813  for (int i = 0; i < procDim; ++i) {
2814  delete [] procCoordinates[i];
2815  }
2816  delete [] procCoordinates;
2817  }
2818 
2819  for (int i = 0; i < coordDim; ++i) {
2820  delete [] partCenters[i];
2821  }
2822  delete [] partCenters;
2823  }
2824 
 2825  /*! \brief Constructor. The mapping constructor which will also
 2826   * perform the mapping operation. The result mapping can be obtained by
 2827   * --getAssignedProcForTask: returns the assigned processor id for the
 2828   * given task,
 2829   * --getPartsForProc: returns the assigned tasks with the number of
 2830   * tasks.
 2831   */
 2890  CoordinateTaskMapper(
2891  const Environment *env_const_,
2892  const Teuchos::Comm<int> *problemComm,
2893  int proc_dim,
2894  int num_processors,
2895  pcoord_t **machine_coords,
2896  int task_dim,
2897  part_t num_tasks,
2898  tcoord_t **task_coords,
2899  ArrayRCP<part_t>task_comm_xadj,
2900  ArrayRCP<part_t>task_comm_adj,
2901  pcoord_t *task_communication_edge_weight_,
2902  int recursion_depth,
2903  Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
2904  const part_t *machine_dimensions,
2905  int num_ranks_per_node = 1,
2906  bool divide_to_prime_first = false,
2907  bool reduce_best_mapping = true):
2908  PartitionMapping<Adapter>(
2909  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2910  Teuchos::rcpFromRef<const Environment>(*env_const_)),
2911  proc_to_task_xadj(0),
2912  proc_to_task_adj(0),
2913  task_to_proc(0),
2914  isOwnerofModel(true),
2915  proc_task_comm(0),
2916  task_communication_xadj(task_comm_xadj),
2917  task_communication_adj(task_comm_adj) {
2918 
2919  //if mapping type is 0 then it is coordinate mapping
2920  pcoord_t ** virtual_machine_coordinates = machine_coords;
2921  bool *wrap_arounds = new bool [proc_dim];
2922  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2923 
2924  if (machine_dimensions) {
2925  virtual_machine_coordinates =
2927  proc_dim,
2928  machine_dimensions,
2929  wrap_arounds,
2930  num_processors,
2931  machine_coords);
2932  }
2933 
2934  this->nprocs = num_processors;
2935 
2936  int coordDim = task_dim;
2937  this->ntasks = num_tasks;
2938 
2939  //alloc memory for part centers.
2940  tcoord_t **partCenters = task_coords;
2941 
2942  //create coordinate communication model.
2943  this->proc_task_comm =
2945  proc_dim,
2946  virtual_machine_coordinates,
2947  coordDim,
2948  partCenters,
2949  this->nprocs,
2950  this->ntasks, NULL, NULL
2951  );
2952 
2953  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2954  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2955 
2956  this->proc_task_comm->setPartArraySize(recursion_depth);
2957  this->proc_task_comm->setPartArray(part_no_array);
2958 
2959  int myRank = problemComm->getRank();
2960 
2961  this->doMapping(myRank, this->comm);
2962 #ifdef gnuPlot
2963  this->writeMapping2(myRank);
2964 #endif
2965 
 2966  // MDM added this guard for the edge case NX = 1, NY = 1, NZ = 1 (empty
 2967  // communication arrays), which passed on develop and still passes here.
2968  if (reduce_best_mapping && task_communication_xadj.size() &&
2969  task_communication_adj.size()) {
2970  this->proc_task_comm->calculateCommunicationCost(
2971  task_to_proc.getRawPtr(),
2972  task_communication_xadj.getRawPtr(),
2973  task_communication_adj.getRawPtr(),
2974  task_communication_edge_weight_
2975  );
2976 
2977 
2978  this->getBestMapping();
2979 
2980 /*
2981  if (myRank == 0) {
2982  this->proc_task_comm->calculateCommunicationCost(
2983  task_to_proc.getRawPtr(),
2984  task_communication_xadj.getRawPtr(),
2985  task_communication_adj.getRawPtr(),
2986  task_communication_edge_weight_
2987  );
2988  cout << "me: " << problemComm->getRank() << " cost:"
2989  << this->proc_task_comm->getCommunicationCostMetric() << endl;
2990  }
2991 */
2992 
2993  }
2994 
2995  delete [] wrap_arounds;
2996 
2997  if (machine_dimensions) {
2998  for (int i = 0; i < proc_dim; ++i) {
2999  delete [] virtual_machine_coordinates[i];
3000  }
3001  delete [] virtual_machine_coordinates;
3002  }
3003 #ifdef gnuPlot
3004  if (problemComm->getRank() == 0)
3005  this->writeMapping2(-1);
3006 #endif
3007  }
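A hedged usage sketch for the raw-array constructor above; every input is a hypothetical placeholder, the adapter is left generic, and recursion depth 8 plus the 3-D dimensions are arbitrary illustrative choices.

#include <Teuchos_ArrayRCP.hpp>
#include <Kokkos_Core.hpp>

// Sketch: build a mapper from raw machine/task arrays and query task 0.
template <typename adapter_t, typename part_t>
part_t mapAndQuery(const Zoltan2::Environment *env,
                   const Teuchos::Comm<int> *comm,
                   typename adapter_t::scalar_t **machineCoords, int nRanks,
                   typename adapter_t::scalar_t **taskCenters, part_t nTasks,
                   Teuchos::ArrayRCP<part_t> commXadj,
                   Teuchos::ArrayRCP<part_t> commAdj) {
  Zoltan2::CoordinateTaskMapper<adapter_t, part_t> ctm(
      env, comm,
      3, nRanks, machineCoords,   // proc_dim, #ranks, machine coordinates
      3, nTasks, taskCenters,     // task_dim, #tasks, task centers
      commXadj, commAdj,
      NULL,                       // uniform task-graph edge weights
      8,                          // recursion_depth (assumed)
      Kokkos::View<part_t *, Kokkos::HostSpace>("partNoArray", 0),
      NULL);                      // no machine_dimensions: no shifting
  return ctm.getAssignedProcForTask(0);   // rank owning task 0
}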
3008 
3009 
3010  /*
3011  double getCommunicationCostMetric() {
3012  return this->proc_task_comm->getCommCost();
3013  }
3014  */
3015 
 3016  /*! \brief Returns the number of parts to be assigned to this process.
 3017   */
 3018  virtual size_t getLocalNumberOfParts() const {
3019  return 0;
3020  }
3021 
 3022  /*! \brief Using the machine dimensions provided, create virtual
 3023   * machine coordinates by assigning the largest gap to be the
 3024   * wrap-around link.
 3025   */
 3032  pcoord_t **shiftMachineCoordinates(
3033  int machine_dim,
3034  const part_t *machine_dimensions,
3035  bool *machine_extent_wrap_around,
3036  part_t numProcs,
3037  pcoord_t **mCoords) {
3038 
3039  pcoord_t **result_machine_coords = NULL;
3040  result_machine_coords = new pcoord_t*[machine_dim];
3041 
3042  for (int i = 0; i < machine_dim; ++i) {
3043  result_machine_coords[i] = new pcoord_t [numProcs];
3044  }
3045 
3046  for (int i = 0; i < machine_dim; ++i) {
3047  part_t numMachinesAlongDim = machine_dimensions[i];
3048 
3049  part_t *machineCounts = new part_t[numMachinesAlongDim];
3050  memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3051 
3052  int *filledCoordinates = new int[numMachinesAlongDim];
3053 
3054  pcoord_t *coords = mCoords[i];
3055 
3056  for (part_t j = 0; j < numProcs; ++j) {
3057  part_t mc = (part_t) coords[j];
3058  ++machineCounts[mc];
3059  }
3060 
3061  part_t filledCoordinateCount = 0;
3062  for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3063  if (machineCounts[j] > 0) {
3064  filledCoordinates[filledCoordinateCount++] = j;
3065  }
3066  }
3067 
3068  part_t firstProcCoord = filledCoordinates[0];
3069  part_t firstProcCount = machineCounts[firstProcCoord];
3070 
3071  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3072  part_t lastProcCount = machineCounts[lastProcCoord];
3073 
3074  part_t firstLastGap =
3075  numMachinesAlongDim - lastProcCoord + firstProcCoord;
3076  part_t firstLastGapProc = lastProcCount + firstProcCount;
3077 
3078  part_t leftSideProcCoord = firstProcCoord;
3079  part_t leftSideProcCount = firstProcCount;
3080  part_t biggestGap = 0;
3081  part_t biggestGapProc = numProcs;
3082 
3083  part_t shiftBorderCoordinate = -1;
3084  for (part_t j = 1; j < filledCoordinateCount; ++j) {
3085  part_t rightSideProcCoord= filledCoordinates[j];
3086  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3087 
3088  part_t gap = rightSideProcCoord - leftSideProcCoord;
3089  part_t gapProc = rightSideProcCount + leftSideProcCount;
3090 
 3091  // Pick the largest gap in this dimension. Use the smaller process
 3092  // count on either side of the largest gap to break ties. An easy
 3093  // addition to this would be to weight the gap by the number of
 3094  // processes.
3095  if (gap > biggestGap ||
3096  (gap == biggestGap && biggestGapProc > gapProc)) {
3097  shiftBorderCoordinate = rightSideProcCoord;
3098  biggestGapProc = gapProc;
3099  biggestGap = gap;
3100  }
3101  leftSideProcCoord = rightSideProcCoord;
3102  leftSideProcCount = rightSideProcCount;
3103  }
3104 
3105 
3106  if (!(biggestGap > firstLastGap ||
3107  (biggestGap == firstLastGap &&
3108  biggestGapProc < firstLastGapProc))) {
3109  shiftBorderCoordinate = -1;
3110  }
3111 
3112  for (part_t j = 0; j < numProcs; ++j) {
3113 
3114  if (machine_extent_wrap_around[i] &&
3115  coords[j] < shiftBorderCoordinate) {
3116  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3117 
3118  }
3119  else {
3120  result_machine_coords[i][j] = coords[j];
3121  }
3122  }
3123  delete [] machineCounts;
3124  delete [] filledCoordinates;
3125  }
3126 
3127  return result_machine_coords;
3128 
3129  }
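A worked example of the shifting (values are illustrative): suppose one torus dimension has extent 8 and ranks occupy coordinates {0, 1, 6, 7}. The interior gap between 1 and 6 has width 5, which beats the first-to-last wrap gap of 8 - 7 + 0 = 1, so shiftBorderCoordinate becomes 6 and every coordinate below it is shifted up by the extent: {0, 1, 6, 7} -> {8, 9, 6, 7}. The occupied coordinates are now contiguous (6 through 9), so Euclidean distances computed on the virtual coordinates reflect the true hop counts across the wrap-around link.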
3130 
 3131  /*! \brief Returns the processor assigned to the given task, together
 3132   * with the number of processors (always one for this mapper).
 3133   */
3138  virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3139  part_t *&procs) const {
3140  numProcs = 1;
3141  procs = this->task_to_proc.getRawPtr() + taskId;
3142  }
3143 
 3144  /*! \brief Returns the assigned processor id for the given task.
 3145   */
 3148  part_t getAssignedProcForTask(part_t taskId) {
3149  return this->task_to_proc[taskId];
3150  }
3151 
 3152  /*! \brief Returns the assigned tasks for the given processor, together
 3153   * with the number of tasks.
 3154   */
3159  virtual void getPartsForProc(int procId, part_t &numParts,
3160  part_t *&parts) const {
3161 
3162  part_t task_begin = this->proc_to_task_xadj[procId];
3163  part_t taskend = this->proc_to_task_xadj[procId + 1];
3164 
3165  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3166  numParts = taskend - task_begin;
3167  }
3168 
3169  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3170  part_t task_begin = this->proc_to_task_xadj[procId];
3171  part_t taskend = this->proc_to_task_xadj[procId + 1];
3172 
3173 /*
3174  std::cout << "part_t:" << procId << " taskCount:"
3175  << taskend - task_begin << std::endl;
3176 
3177  for (part_t i = task_begin; i < taskend; ++i) {
3178  std::cout << "part_t:" << procId << " task:"
3179  << proc_to_task_adj[i] << endl;
3180  }
3181 */
3182  if (taskend - task_begin > 0) {
3183  ArrayView <part_t> assignedParts(
3184  this->proc_to_task_adj.getRawPtr() + task_begin,
3185  taskend - task_begin);
3186 
3187  return assignedParts;
3188  }
3189  else {
3190  ArrayView <part_t> assignedParts;
3191 
3192  return assignedParts;
3193  }
3194  }
3195 
3196 };
3197 
 3198  /*! \brief The interface function that calls CoordinateTaskMapper,
 3199   * which will also perform the mapping operation. The result is
 3200   * written to the proc_to_task_xadj and proc_to_task_adj output
 3201   * arrays.
 3202   */
3276 template <typename part_t, typename pcoord_t, typename tcoord_t>
 3277  void coordinateTaskMapperInterface(
3278  RCP<const Teuchos::Comm<int> > problemComm,
3279  int proc_dim,
3280  int num_processors,
3281  pcoord_t **machine_coords,
3282  int task_dim,
3283  part_t num_tasks,
3284  tcoord_t **task_coords,
3285  part_t *task_comm_xadj,
3286  part_t *task_comm_adj,
 3287  // float-like, same size as task_communication_adj_; weight of the
 3288  // corresponding edge.
3289  pcoord_t *task_communication_edge_weight_,
3290  part_t *proc_to_task_xadj, /*output*/
3291  part_t *proc_to_task_adj, /*output*/
3292  int recursion_depth,
3293  Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
3294  const part_t *machine_dimensions,
3295  int num_ranks_per_node = 1,
3296  bool divide_to_prime_first = false) {
3297 
3298  const Environment *envConst_ = new Environment(problemComm);
3299 
3300  // mfh 03 Mar 2015: It's OK to omit the Node template
3301  // parameter in Tpetra, if you're just going to use the
3302  // default Node.
3303  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3304 
3305  Teuchos::ArrayRCP<part_t> task_communication_xadj(
3306  task_comm_xadj, 0, num_tasks + 1, false);
3307 
3308  Teuchos::ArrayRCP<part_t> task_communication_adj;
3309  if (task_comm_xadj) {
3310  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3311  task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3312  task_communication_adj = tmp_task_communication_adj;
3313  }
3314 
3315 
 3316  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
 3317  new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
3318  envConst_,
3319  problemComm.getRawPtr(),
3320  proc_dim,
3321  num_processors,
3322  machine_coords,
3323 // machine_coords_,
3324 
3325  task_dim,
3326  num_tasks,
3327  task_coords,
3328 
 3329  task_communication_xadj,
 3330  task_communication_adj,
3331  task_communication_edge_weight_,
3332  recursion_depth,
3333  part_no_array,
3334  machine_dimensions,
3335  num_ranks_per_node,
3336  divide_to_prime_first);
3337 
3338 
3339  part_t* proc_to_task_xadj_;
3340  part_t* proc_to_task_adj_;
3341 
3342  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3343 
3344  for (part_t i = 0; i <= num_processors; ++i) {
3345  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3346  }
3347 
3348  for (part_t i = 0; i < num_tasks; ++i) {
3349  proc_to_task_adj[i] = proc_to_task_adj_[i];
3350  }
3351 
3352  delete ctm;
3353  delete envConst_;
3354 }
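A hedged driver sketch for coordinateTaskMapperInterface; the array contents, the 3-D dimensions, and the recursion depth are illustrative assumptions, not values from this file.

#include <vector>
#include <Teuchos_DefaultComm.hpp>
#include <Kokkos_Core.hpp>

// Sketch: run the interface and collect the CSR rank-to-task mapping.
template <typename part_t, typename pcoord_t, typename tcoord_t>
void runMapping(pcoord_t **machineCoords, int nRanks,
                tcoord_t **taskCenters, part_t nTasks,
                part_t *taskCommXadj, part_t *taskCommAdj,
                std::vector<part_t> &procToTaskXadj,
                std::vector<part_t> &procToTaskAdj) {
  Teuchos::RCP<const Teuchos::Comm<int> > comm =
      Teuchos::DefaultComm<int>::getComm();
  procToTaskXadj.resize(nRanks + 1);
  procToTaskAdj.resize(nTasks);
  Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
      comm,
      3, nRanks, machineCoords,   // machine: dim, #ranks, coordinates
      3, nTasks, taskCenters,     // tasks: dim, #tasks, centers
      taskCommXadj, taskCommAdj,  // task communication graph (CSR)
      NULL,                       // uniform edge weights
      procToTaskXadj.data(),      // output: CSR offsets per rank
      procToTaskAdj.data(),       // output: task list per rank
      8,                          // recursion_depth (assumed)
      Kokkos::View<part_t *, Kokkos::HostSpace>("partNoArray", 0),
      NULL);                      // no machine_dimensions
}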
3355 
3356 template <typename proc_coord_t, typename v_lno_t>
3357 inline void visualize_mapping(int myRank,
3358  const int machine_coord_dim,
3359  const int num_ranks,
3360  proc_coord_t **machine_coords,
3361  const v_lno_t num_tasks,
3362  const v_lno_t *task_communication_xadj,
3363  const v_lno_t *task_communication_adj,
3364  const int *task_to_rank) {
3365 
3366  std::string rankStr = Teuchos::toString<int>(myRank);
 3367  std::string gnuPlots = "gnuPlot", extensionS = ".plot";
 3368  std::string outF = gnuPlots + rankStr + extensionS;
3369  std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3370 
3371  if (machine_coord_dim != 3) {
 3372  std::cerr << "Mapping write is only supported for 3 dimensions" << std::endl;
3373  return;
3374  }
3375  std::string ss = "";
3376  std::string procs = "";
3377 
3378  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3379 
3380  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3381  int origin_rank = task_to_rank[origin_task];
3382  std::string gnuPlotArrow = "set arrow from ";
3383 
3384  for (int j = 0; j < machine_coord_dim; ++j) {
3385  if (j == machine_coord_dim - 1) {
3386  gnuPlotArrow +=
3387  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3388  procs +=
3389  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3390 
3391  }
3392  else {
3393  gnuPlotArrow +=
3394  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3395  + ",";
3396  procs +=
3397  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3398  + " ";
3399  }
3400  }
3401  procs += "\n";
3402 
3403  gnuPlotArrow += " to ";
3404 
3405 
3406  for (int nind = task_communication_xadj[origin_task];
3407  nind < task_communication_xadj[origin_task + 1]; ++nind) {
3408 
3409  int neighbor_task = task_communication_adj[nind];
3410 
3411  bool differentnode = false;
3412  int neighbor_rank = task_to_rank[neighbor_task];
3413 
3414  for (int j = 0; j < machine_coord_dim; ++j) {
3415  if (int(machine_coords[j][origin_rank]) !=
3416  int(machine_coords[j][neighbor_rank])) {
3417  differentnode = true; break;
3418  }
3419  }
3420 
3421  std::tuple<int,int,int, int, int, int> foo(
3422  (int)(machine_coords[0][origin_rank]),
3423  (int)(machine_coords[1][origin_rank]),
3424  (int)(machine_coords[2][origin_rank]),
3425  (int)(machine_coords[0][neighbor_rank]),
3426  (int)(machine_coords[1][neighbor_rank]),
3427  (int)(machine_coords[2][neighbor_rank]));
3428 
3429  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3430  my_arrows.insert(foo);
3431 
3432  std::string gnuPlotArrow2 = "";
3433  for (int j = 0; j < machine_coord_dim; ++j) {
3434  if (j == machine_coord_dim - 1) {
3435  gnuPlotArrow2 +=
3436  Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3437  }
3438  else {
3439  gnuPlotArrow2 +=
3440  Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3441  + ",";
3442  }
3443  }
3444  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3445  }
3446  }
3447  }
3448 
3449  std::ofstream procFile("procPlot.plot", std::ofstream::out);
3450  procFile << procs << "\n";
3451  procFile.close();
3452 
3453  //gnuPlotCode << ss;
3454  if (machine_coord_dim == 2) {
3455  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3456  }
3457  else {
3458  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3459  }
3460 
 3461  gnuPlotCode << ss << "\nreplot\n pause -1\n";
3462  gnuPlotCode.close();
3463 }
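A minimal illustrative call (all values made up): two ranks on a 3-D machine with a single task-graph edge; the function writes gnuPlot0.plot and procPlot.plot, which can then be opened with gnuplot.

// Sketch: visualize a two-task mapping; every input is hypothetical.
void demoVisualize() {
  float x[2] = {0, 1}, y[2] = {0, 0}, z[2] = {0, 0};
  float *machineCoords[3] = {x, y, z};     // 3-D coords of 2 ranks
  int xadj[3] = {0, 1, 1};                 // task 0 has one neighbor
  int adj[1] = {1};                        // ... namely task 1
  int taskToRank[2] = {0, 1};
  Zoltan2::visualize_mapping<float, int>(
      /*myRank*/ 0, /*machine_coord_dim*/ 3, /*num_ranks*/ 2,
      machineCoords, /*num_tasks*/ 2, xadj, adj, taskToRank);
}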
3464 
3465 } // namespace Zoltan2
3466 
3467 #endif