Zoltan2_TaskMapping.hpp
#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

#include <fstream>
#include <ctime>
#include <vector>
#include <set>
#include <tuple>

#include "Zoltan2_Standards.hpp"
#include "Teuchos_ArrayViewDecl.hpp"
#include "Teuchos_ReductionOp.hpp"

#include "Zoltan2_GraphModel.hpp"

#include <Zoltan2_TPLTraits.hpp>

#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>

//#define gnuPlot
namespace Teuchos{

// Reduction operation over (cost, rank) pairs: keeps the pair with the
// smaller cost; when costs are equal within epsilon, keeps the smaller
// rank.
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
{

private:
  T _EPSILON;

public:

  Zoltan2_ReduceBestMapping():_EPSILON(std::numeric_limits<T>::epsilon()) {}

  void reduce(const Ordinal count,
              const T inBuffer[],
              T inoutBuffer[]) const {

    for (Ordinal i = 0; i < count; i++) {
      if (inBuffer[0] - inoutBuffer[0] < -_EPSILON) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
      else if (inBuffer[0] - inoutBuffer[0] < _EPSILON &&
               inBuffer[1] - inoutBuffer[1] < _EPSILON) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};

} // namespace Teuchos

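// Illustrative usage (editor's sketch, mirroring getBestMapping() later
// in this file): each rank contributes a {cost, rank} pair; the
// reduction keeps the globally smallest cost, preferring the lower rank
// on ties.
//
//   Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   double localCost[2]  = {myCost, double(comm->getRank())};
//   double globalCost[2] = {std::numeric_limits<double>::max(),
//                           std::numeric_limits<double>::max()};
//   Teuchos::reduceAll<int, double>(*comm, reduceBest, 2,
//                                   localCost, globalCost);
//   int winner = int(globalCost[1]); // rank that holds the best mapping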

namespace Zoltan2{

// Computes x! recursively; used to count dimension-ordering permutations.
template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}

template <typename gno_t, typename part_t>
class GNO_LNO_PAIR {

public:
  gno_t gno;
  part_t part;

};

//returns the ith permutation indices.
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  IT j, k = 0;
  IT *fact = new IT[n];

  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute factorial code
  for (k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust values to obtain the permutation
  // start from the end and check if preceding values are lower
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}
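
// Worked example (editor's note, not in the original source): the
// function decodes i in the factorial number system. For n = 3 the
// permutations in lexicographic order are {0,1,2}, {0,2,1}, {1,0,2},
// {1,2,0}, {2,0,1}, {2,1,0}, so:
//
//   int perm[3];
//   ithPermutation<int>(3, 4, perm);  // perm = {2, 0, 1}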

// Builds the CSR communication graph of a structured grid: each task is
// connected to its left/right neighbor in every grid dimension.
template <typename part_t>
void getGridCommunicationGraph(part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims) {

  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = allocMemory<part_t>(taskCount + 1);
  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;

  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;

    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];

      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }

      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }

    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
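
// Worked example (editor's note, not in the original source): for a
// 2 x 3 grid (grid_dims = {2, 3}, taskCount = 6, dimension 0 varying
// fastest) the routine produces
//   task_comm_xadj = {0, 2, 4, 7, 10, 12, 14}
//   task_comm_adj  = {1,2, 0,3, 3,0,4, 2,1,5, 5,2, 4,3}
// i.e., the neighbors of task i are
// task_comm_adj[task_comm_xadj[i] .. task_comm_xadj[i+1] - 1].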

//returns the center of the parts.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
//  const Zoltan2::PartitioningSolution<Adapter> *soln_,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;

  typedef StridedData<lno_t, scalar_t> input_t;
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);

  //local and global num coordinates.
  lno_t numLocalCoords = coords->getLocalNumCoordinates();
// gno_t numGlobalCoords = coords->getGlobalNumCoordinates();

  //local number of points in each part.
  gno_t *point_counts = allocMemory<gno_t>(ntasks);
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  //global number of points in each part.
  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);

  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);

  for (int dim = 0; dim < coordDim; dim++) {
    ArrayRCP<const scalar_t> ar;
    xyz[dim].getInputArray(ar);

    //multiJagged coordinate values assignment
    multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  //get parts with parallel gnos.
// const part_t *parts = soln_->getPartListView();

  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");

  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];

    //add up all coordinates in each part.
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  //get global number of points in each part.
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);

  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
  for (int j = 0; j < coordDim; ++j) {
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);

    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");

  freeArray<gno_t>(point_counts);
  freeArray<gno_t>(global_point_counts);

  freeArray<scalar_t>(tmpCoords);
  freeArray<scalar_t *>(multiJagged_coordinates);
}

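// Summary of the scheme above (editor's note, not in the original
// source): each rank accumulates local per-part coordinate sums and
// counts; both are combined across ranks with REDUCE_SUM, so the center
// of part p in dimension j ends up as
//   partCenters[j][p] = (global sum of dim-j coordinates in p) /
//                       (global number of points in p).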
//returns the coarsened part graph.
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
//  const Zoltan2::PartitioningSolution<Adapter> *soln_,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {

  typedef typename Adapter::lno_t t_lno_t;
  typedef typename Adapter::gno_t t_gno_t;
  typedef typename Adapter::scalar_t t_scalar_t;
  typedef typename Adapter::offset_t t_offset_t;

  typedef typename Zoltan2::GraphModel<
    typename Adapter::base_adapter_t>::input_t t_input_t;

// int numRanks = comm->getSize();
// int myRank = comm->getRank();

  //get parts with parallel gnos.
/*
  const part_t *parts = soln_->getPartListView();

  part_t np = soln_->getActualGlobalNumberOfParts();
  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
    np = soln_->getTargetGlobalNumberOfParts();
  }
*/


  t_lno_t localNumVertices = graph->getLocalNumVertices();
  t_lno_t localNumEdges = graph->getLocalNumEdges();

  //get the vertex global ids, and weights
  ArrayView<const t_gno_t> Ids;
  ArrayView<t_input_t> v_wghts;
  graph->getVertexList(Ids, v_wghts);

  //get the edge ids, and weights
  ArrayView<const t_gno_t> edgeIds;
  ArrayView<const t_offset_t> offsets;
  ArrayView<t_input_t> e_wgts;
  graph->getEdgeList(edgeIds, offsets, e_wgts);

  std::vector<t_scalar_t> edge_weights;
  int numWeightPerEdge = graph->getNumWeightsPerEdge();

  if (numWeightPerEdge > 0) {
    edge_weights = std::vector<t_scalar_t>(localNumEdges);

    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      edge_weights[i] = e_wgts[0][i];
    }
  }

  //create a zoltan dictionary to get the parts of the vertices
  //at the other end of edges
  std::vector<part_t> e_parts(localNumEdges);

#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1)
  {
    typedef Zoltan2_Directory_Simple<t_gno_t, t_lno_t, part_t> directory_t;

    const bool bUseLocalIDs = false;  // Local IDs not needed
    int debug_level = 0;
    const RCP<const Comm<int> > rcp_comm(comm,false);

    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);

    directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
                     NULL, directory_t::Update_Mode::Replace);

    directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
                   NULL, NULL, false);
  } else
#endif
  {

/*
    std::cout << "localNumVertices:" << localNumVertices
              << " np:" << np
              << " globalNumVertices:" << graph->getGlobalNumVertices()
              << " localNumEdges:" << localNumEdges << std::endl;
*/

    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      part_t p = parts[ei];
      e_parts[i] = p;
    }

    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster vertices according to their parts.
    //create local part graph.
    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);

    part_t nindex = 0;
    g_part_xadj[0] = 0;

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen for all vertices in my part in order with parts.
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];

      //get part i, and first vertex in this part v.
      while (v != -1) {
        //now get the neighbors of v.
        for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }

// std::cout << "part:" << i << " v:" << v
//           << " part2:" << ep << " v2:" << edgeIds[j]
//           << " w:" << ew << std::endl;

          //add it to my local part neighbors for part i.
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }

          part_neighbor_weights[ep] += ew;
        }

        v = part_nexts[v];
      }

      //now get the part list.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
    return;
  }

  // struct for directory data - note more extensive comments in
  // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
  // because this is in progress. Similar concept there.
  struct part_info {
    part_info() : weight(0) {
    }

    const part_info & operator+=(const part_info & src) {
      this->weight += src.weight;
      return *this;
    }

    bool operator>(const part_info & src) {
      return (destination_part > src.destination_part);
    }

    bool operator==(const part_info & src) {
      return (destination_part == src.destination_part);
    }

    part_t destination_part;
    scalar_t weight;
  };

  typedef Zoltan2_Directory_Vector<part_t, part_t, std::vector<part_info> >
    directory_t;

  bool bUseLocalIDs = false;
  const int debug_level = 0;
  const RCP<const Comm<int> > rcp_comm(comm, false);

  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);

  std::vector<part_t> part_data;
  std::vector<std::vector<part_info>> user_data;

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
  {
    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster vertices according to their parts.
    //create local part graph.
    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen for all vertices in my part in order with parts.
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];

      //get part i, and first vertex in this part v.
      while (v != -1) {
        //now get the neighbors of v.
        for (t_offset_t j = offsets[v]; j < offsets[v + 1]; ++j) {
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }

          //add it to my local part neighbors for part i.
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }

          part_neighbor_weights[ep] += ew;
        }

        v = part_nexts[v];
      }

      //now get the part list.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        part_neighbor_weights_ordered[j] =
          part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }

      //insert it to tpetra crsmatrix.
      if (num_neighbor_parts > 0) {
        // TODO: optimize to avoid push_back
        part_data.push_back(i);
        std::vector<part_info> new_user_data(num_neighbor_parts);

        for (int q = 0; q < num_neighbor_parts; ++q) {
          part_info & info = new_user_data[q];
          info.weight = part_neighbor_weights_ordered[q];
          info.destination_part = part_neighbors[q];
        }
        // TODO: optimize to avoid push_back
        user_data.push_back(new_user_data);
      }
    }
  }
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");

  std::vector<part_t> part_indices(np);

  for (part_t i = 0; i < np; ++i) part_indices[i] = i;

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
                   NULL, directory_t::Update_Mode::AggregateAdd);
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");

  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
  directory.find(find_user_data.size(), &part_indices[0], NULL,
                 &find_user_data[0], NULL, NULL, false);
  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");

  // Now reconstruct the output data from the directory find data.
  // This code was designed to reproduce the exact format of the original
  // setup for g_part_xadj, g_part_adj, and g_part_ew, but before making
  // any further changes I wanted to verify whether this formatting should
  // be preserved or potentially changed further.

  // first get the total number of elements
  int get_total_length = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    get_total_length += find_user_data[n].size();
  }

  // set up data space
  g_part_xadj = ArrayRCP<part_t>(np + 1);
  g_part_adj = ArrayRCP<part_t>(get_total_length);
  g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

  // loop through again and fill to match the original formatting
  int track_insert_index = 0;
  for (size_t n = 0; n < find_user_data.size(); ++n) {
    g_part_xadj[n] = track_insert_index;
    const std::vector<part_info> & user_data_vector = find_user_data[n];

    for (size_t q = 0; q < user_data_vector.size(); ++q) {
      const part_info & info = user_data_vector[q];
      g_part_adj[track_insert_index] = info.destination_part;
      g_part_ew[track_insert_index] = info.weight;
      ++track_insert_index;
    }
  }
  g_part_xadj[np] = get_total_length; // complete the series

  // This is purely for debug logging and is to be deleted.
  // The hacky sort here just makes each subset of part destinations
  // ordered, so new and old code will produce identical output for
  // validation. I think the ordering is arbitrary and there is no
  // requirement to preserve the old ordering within each part. This code
  // was written so I could drop it into the develop branch and validate
  // that we were producing identical results at this step. Would like to
  // discuss this further before continuing.
  // TODO: Delete all this

/*
  if (comm->getRank() == 0) {

    for (size_t i = 0; i < (size_t)g_part_xadj.size() - 1; ++i) {
      part_t b = g_part_xadj[i];
      part_t e = g_part_xadj[i + 1];
      printf("Results for part %d:\n", (int)i);
      for (part_t scan_part = 0; scan_part < np; ++scan_part) {

        for (part_t q = b; q < e; ++q) {
          // Inefficient hack to sort g_part_adj
          if (g_part_adj[q] == scan_part) {
            printf( "(%d:%.2f) ", (int)scan_part, (float)g_part_ew[q]);
          }
        }
      }

      printf("\n");
    }
  }
*/

}

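// Worked example (editor's note, not in the original source): in the
// coarse graph a vertex is a part, and an edge weight is the summed
// weight of all fine-graph edges between two parts. If part 0 shares
// three unit-weight edges with part 1 and one with part 2, then for
// part 0:
//   g_part_xadj = {0, 2, ...}, g_part_adj = {1, 2, ...},
//   g_part_ew   = {3, 1, ...}.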

// KmeansHeap: a fixed-size max-heap that tracks the heapSize points
// closest (by distance) to a cluster center.
template <class IT, class WT>
class KmeansHeap {

  IT heapSize;
  IT *indices;
  WT *values;
  WT _EPSILON;

public:

  ~KmeansHeap() {
    freeArray<IT>(this->indices);
    freeArray<WT>(this->values);
  }

  void setHeapsize(IT heapsize_) {
    this->heapSize = heapsize_;
    this->indices = allocMemory<IT>(heapsize_);
    this->values = allocMemory<WT>(heapsize_);
    this->_EPSILON = std::numeric_limits<WT>::epsilon();
  }

  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    //add only if the distance is smaller than the current maximum
    //distance.
// std::cout << "indeX:" << index
//           << " distance:" << distance
//           << " maxVal:" << maxVal << endl;

    if (distance >= maxVal)
      return;
    else {
      this->values[0] = distance;
      this->indices[0] = index;
      this->push_down(0);
    }
  }

  //heap push down operation
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {

      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }

    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();

    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  //returns the total distance to center in the cluster.
  WT getTotalDistance() {

    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];

// std::cout << "index:" << this->indices[j]
//           << " distance:" << this->values[j] << endl;
    }

    return nc;
  }

  //returns the new center of the cluster.
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;

    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;

      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
// std::cout << "i:" << i
//           << " dim:" << dimension
//           << " k:" << k
//           << " heapSize:" << heapSize << endl;
        nc += coords[i][k];
      }

      nc /= this->heapSize;
      moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved);
      center[i] = nc;
    }

    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};

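// Behavioral sketch (editor's note, hypothetical usage, not in the
// original source): the root values[0] always holds the largest of the
// heapSize smallest distances seen so far, so addPoint() can reject a
// new point with a single comparison against the root:
//
//   KmeansHeap<int, double> heap;
//   heap.setHeapsize(4);
//   heap.initValues();      // all slots start at numeric_limits::max()
//   heap.addPoint(7, 2.5);  // replaces the root, then sifts it down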

// KMeansCluster: a cluster center plus the heap of its closest points.
template <class IT, class WT>
class KMeansCluster {

  int dimension;
  KmeansHeap<IT,WT> closestPoints;

public:

  WT *center;

  ~KMeansCluster() {
    freeArray<WT>(center);
  }

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = allocMemory<WT>(dimension_);
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap() {
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  //returns the distance of the coordinate to the center.
  //also adds it to the heap.
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;

    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }

    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);

    return distance;
  }

  WT getDistanceToCenter() {
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};

// KMeansAlgorithm: simple k-means clustering used to find a compact
// subset of processors. It creates 2^dim + 1 clusters seeded at the
// corners and the center of the coordinate bounding box.
template <class IT, class WT>
class KMeansAlgorithm {

  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT,WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;

public:

  ~KMeansAlgorithm() {
    freeArray<KMeansCluster<IT,WT> >(clusters);
    freeArray<WT>(maxCoordinates);
    freeArray<WT>(minCoordinates);
  }

  KMeansAlgorithm(
      int dim_ ,
      IT numElements_,
      WT **elementCoords_,
      IT required_elements_):
    dim(dim_),
    numElements(numElements_),
    elementCoords(elementCoords_),
    numClusters((1 << dim_) + 1),
    required_elements(required_elements_) {

    this->clusters = allocMemory<KMeansCluster<IT,WT> >(this->numClusters);

    //set dimension and the number of required elements for all clusters.
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = allocMemory<WT>(this->dim);
    this->minCoordinates = allocMemory<WT>(this->dim);

    //obtain the min and max coordinates for each dimension.
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];

      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];

        if (t > this->maxCoordinates[j]) {
          this->maxCoordinates[j] = t;
        }

        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }

    //assign initial cluster centers.
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j + 1));

      for (int i = 0; i < numClusters - 1; ++i) {
        WT c = 0;

        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
// std::cout << "i:" << i << " j:" << j
//           << " setting max:" << c << endl;
        }
        else {
          c = this->minCoordinates[j];
        }

        this->clusters[i].center[j] = c;
      }
    }

    //last cluster center is placed to middle.
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }

/*
    for (int i = 0; i < numClusters; ++i) {
// std::cout << endl << "cluster:" << i << endl << "\t";

      for (int j = 0; j < dim; ++j) {
        std::cout << this->clusters[i].center[j] << " ";
      }
    }
*/
  }

  // Performs kmeans clustering of coordinates.
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
// std::cout << "it:" << it << endl;

      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }

      for (IT i = 0; i < this->numElements; ++i) {
// std::cout << "i:" << i << " numEl:" << this->numElements << endl;

        for (IT j = 0; j < this->numClusters; ++j) {
// std::cout << "j:" << j
//           << " numClusters:" << this->numClusters << endl;

          this->clusters[j].getDistance(i,this->elementCoords);
        }
      }

      bool moved = false;

      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  // Finds the cluster in which the coordinates are the closest to each
  // other.
  void getMinDistanceCluster(IT *procPermutation) {

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

// std::cout << "j:" << 0 << " minDistance:" << minDistance
//           << " minTmpDistance:" << minDistance
//           << " minCluster:" << minCluster << endl;

    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();

// std::cout << "j:" << j << " minDistance:" << minDistance
//           << " minTmpDistance:" << minTmpDistance
//           << " minCluster:" << minCluster << endl;

      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

// std::cout << "minCluster:" << minCluster << endl;
    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};

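// Usage sketch (editor's note, hypothetical, not in the original
// source): selecting the most compact subset of ntasks processors out
// of nprocs, as done by getClosestSubset() below:
//
//   KMeansAlgorithm<part_t, pcoord_t> kma(minCoordDim, nprocs,
//                                         proc_coords, ntasks);
//   kma.kmeans();                            // iterates at most 10 rounds
//   kma.getMinDistanceCluster(permutation);  // densest cluster's members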

#define MINOF(a,b) (((a)<(b))?(a):(b))

// Fills the array with the given constant value, or with the sequence
// 0..arrSize-1 when val == NULL.
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }

  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
// std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
      arr[i] = v;
    }
  }
}

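// Usage sketch (editor's note, not in the original source):
//
//   part_t buf[4];
//   fillContinousArray<part_t>(buf, 4, NULL);  // buf = {0, 1, 2, 3}
//   part_t zero = 0;
//   fillContinousArray<part_t>(buf, 4, &zero); // buf = {0, 0, 0, 0}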
// Abstract base class that evaluates the communication cost of a
// task-to-processor mapping and computes the mapping itself.
template <typename part_t, typename pcoord_t>
class CommunicationModel {

protected:
  double commCost;

public:

  // Number of processors and number of tasks
  part_t no_procs;
  part_t no_tasks;

  CommunicationModel(): commCost(), no_procs(0), no_tasks(0) {}
  CommunicationModel(part_t no_procs_, part_t no_tasks_):
    commCost(),
    no_procs(no_procs_),
    no_tasks(no_tasks_) {}

  virtual ~CommunicationModel() {}

  part_t getNProcs() const {
    return this->no_procs;
  }

  part_t getNTasks() const {
    return this->no_tasks;
  }

  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight) {

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task) {
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {

        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL) {
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }

    // commCount
    this->commCost = totalCost;
  }

  double getCommunicationCostMetric() {
    return this->commCost;
  }

  virtual double getProcDistance(int procId1, int procId2) const = 0;

  // Computes the mapping: proc_to_task_xadj / proc_to_task_adj give the
  // tasks of each processor in CSR form; task_to_proc gives the
  // processor of each task.
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const = 0;
};

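// Worked example (editor's note, not in the original source): with
// tasks 0 and 1 mapped to procs p0 and p1, a single directed adjacency
// entry 0 -> 1 of weight 2, and getProcDistance(p0, p1) == 3,
// calculateCommunicationCost() yields totalCost = 3 * 2 = 6. A
// symmetric task graph stores both directions, so each undirected edge
// would be counted twice.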

// CommunicationModel for machines and tasks that are both described by
// geometric coordinates.
template <typename pcoord_t, typename tcoord_t, typename part_t>
class CoordinateCommunicationModel : public CommunicationModel<part_t,
                                                               pcoord_t> {
public:
  //private:

  // Dimension of the processors
  int proc_coord_dim;
  // Processor coordinates (allocated outside of the class)
  pcoord_t **proc_coords;
  // Dimension of the tasks coordinates.
  int task_coord_dim;
  // Task coordinates (allocated outside of the class)
  tcoord_t **task_coords;

  int partArraySize;
  part_t *partNoArray;

  int *machine_extent;
  bool *machine_extent_wrap_around;
  const MachineRepresentation<pcoord_t, part_t> *machine;

  int num_ranks_per_node;
  bool divide_to_prime_first;

  //public:
  CoordinateCommunicationModel():
    CommunicationModel<part_t, pcoord_t>(),
    proc_coord_dim(0),
    proc_coords(0),
    task_coord_dim(0),
    task_coords(0),
    partArraySize(-1),
    partNoArray(NULL),
    machine_extent(NULL),
    machine_extent_wrap_around(NULL),
    machine(NULL),
    num_ranks_per_node(1),
    divide_to_prime_first(false) {}

  virtual ~CoordinateCommunicationModel() {}

  // Constructor: sets the processor and task coordinates.
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
      ):
    CommunicationModel<part_t, pcoord_t>(no_procs_, no_tasks_),
    proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
    task_coord_dim(tcoord_dim_), task_coords(tcoords_),
    partArraySize(-1),
    partNoArray(NULL),
    machine_extent(machine_extent_),
    machine_extent_wrap_around(machine_extent_wrap_around_),
    machine(machine_),
    num_ranks_per_node(1),
    divide_to_prime_first(false) {
  }

  void setPartArraySize(int psize) {
    this->partArraySize = psize;
  }

  void setPartArray(part_t *pNo) {
    this->partNoArray = pNo;
  }

  // Fills proc_permutation with a compact subset of the processors:
  // the first ntasks entries are the chosen processors, the rest are -1.
  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                        part_t ntasks) const {
    // Uses k-means to pick a compact cluster of processors; a
    // commented-out random-subset variant follows below.

    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
        minCoordDim, nprocs,
        this->proc_coords, ntasks);

    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    for (int i = ntasks; i < nprocs; ++i) {
      proc_permutation[i] = -1;
    }

/*
    //fill array.
    fillContinousArray<part_t>(proc_permutation, nprocs, NULL);

    int _u_umpa_seed = 847449649;
    srand (time(NULL));

    int a = rand() % 1000 + 1;
    _u_umpa_seed -= a;

    //permute array randomly.
    update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
*/
  }

  // Temporary, necessary for random permutation.
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
  {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);

    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;

    d = (double) ((double) _u_umpa_seed / (double) m);

    return (part_t) (d * (double)l);
  }

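  // Editor's note (not in the original source): this is the Park-Miller
  // "minimal standard" generator (a = 16807, m = 2^31 - 1) evaluated
  // with Schrage's trick (q = m / a, r = m % a) to avoid overflow; it
  // returns a pseudo-random value in [0, l).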
  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    if (machine == NULL) {
      for (int i = 0; i < this->proc_coord_dim; ++i) {
        double d =
          ZOLTAN2_ABS(proc_coords[i][procId1] - proc_coords[i][procId2]);
        if (machine_extent_wrap_around && machine_extent_wrap_around[i]) {
          if (machine_extent[i] - d < d) {
            d = machine_extent[i] - d;
          }
        }
        distance += d;
      }
    }
    else {
      this->machine->getHopCount(procId1, procId2, distance);
    }

    return distance;
  }

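  // Worked example (editor's note, not in the original source): in a
  // wrap-around (torus) dimension of extent 8, coordinates 1 and 7 are
  // |1 - 7| = 6 apart going one way but 8 - 6 = 2 apart across the
  // wrap-around link, so d becomes 2 for that dimension.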
  // Temporary, does random permutation.
  void update_visit_order(part_t* visitOrder, part_t n,
                          int &_u_umpa_seed, part_t rndm) {
    part_t *a = visitOrder;

    if (rndm) {
      part_t i, u, v, tmp;

      if (n <= 4)
        return;

// srand ( time(NULL) );
// _u_umpa_seed = _u_umpa_seed1 - (rand()%100);

      for (i = 0; i < n; i += 16) {
        u = umpa_uRandom(n - 4, _u_umpa_seed);
        v = umpa_uRandom(n - 4, _u_umpa_seed);

        // FIXME (mfh 30 Sep 2015) This requires including
        // Zoltan2_AlgMultiJagged.hpp.

        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
      }
    }
    else {
      part_t i, end = n / 4;

      for (i = 1; i < end; i++) {
        part_t j = umpa_uRandom(n - i, _u_umpa_seed);
        part_t t = a[j];
        a[j] = a[n - i];
        a[n - i] = t;
      }
    }
// PermuteInPlace(visitOrder, n);
  }

  // Computes the mapping: partitions processors and tasks with the same
  // recursive multi-jagged scheme and matches the resulting buckets.
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj,
      ArrayRCP<part_t> &rcp_proc_to_task_adj,
      ArrayRCP<part_t> &rcp_task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
      ) const {

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    // Holds the pointer to the task array
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();

    // Holds the indices of tasks wrt to proc_to_task_xadj array.
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();

    // Holds the processors mapped to tasks.
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs + 1,
                               &invalid);

    // Obtain the number of parts that should be divided.
    part_t num_parts = MINOF(this->no_procs, this->no_tasks);

    // We no longer use the min coordinate dim: if the machine dimension
    // is greater than the task dimension, we would end up with a long
    // line.
// part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);

    int recursion_depth = partArraySize;

// if (partArraySize < minCoordDim)
//   recursion_depth = minCoordDim;
    if (partArraySize == -1) {

      if (divide_to_prime_first) {
        // It is difficult to estimate the number of steps in this case,
        // as each branch will have a different depth.
        // The worst case happens when all prime factors are 3s:
        // P = 3^n needs n recursion steps to divide parts into 2x and x,
        // and n more steps to divide each 2x into x and x.
        // Set it to this upper bound here.
        // We could calculate the exact value as well, but the
        // partitioning algorithm skips the extra levels anyway.
        recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
      }
      else {
        recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
      }
    }

    // Get number of different permutations for task dimension ordering
    int taskPerm = z2Fact<int>(this->task_coord_dim);
    // Get number of different permutations for proc dimension ordering
    int procPerm = z2Fact<int>(this->proc_coord_dim);

    // Total number of permutations (both task and proc permuted)
    int permutations = taskPerm * procPerm;

    // Add permutations where we divide the processors with longest
    // dimension but permute tasks.
    permutations += taskPerm;

    // Add permutations where we divide the tasks with longest
    // dimension but permute procs.
    permutations += procPerm;

    // Add the permutation with both tasks and procs divided by longest
    // dimension.
    permutations += 1;

    // Holds the pointers to proc_adjList
    part_t *proc_xadj = allocMemory<part_t>(num_parts + 1);

    // Holds the processors in parts according to the result of the
    // partitioning algorithm.
    // The processors assigned to part x are at
    // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
    part_t *proc_adjList = allocMemory<part_t>(this->no_procs);

    part_t used_num_procs = this->no_procs;
    if (this->no_procs > this->no_tasks) {
      // Obtain the subset of the processors that are closest to each
      // other.
      this->getClosestSubset(proc_adjList, this->no_procs,
                             this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }

    // Index of the permutation
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0) {
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm) {
        proc_partition_along_longest_dim = true;
        // Index of the task permutation
        myTaskPerm = myPermutation;
      }
      else {
        myPermutation -= taskPerm;

        if (myPermutation < procPerm) {
          task_partition_along_longest_dim = true;
          // Index of the proc permutation
          myProcPerm = myPermutation;
        }
        else {
          myPermutation -= procPerm;
          // Index of the proc permutation
          myProcPerm = myPermutation % procPerm;
          // Index of the task permutation
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }

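    // Worked arithmetic (editor's note, not in the original source):
    // for 3-D processor and task coordinates, taskPerm = procPerm =
    // 3! = 6, so permutations = 6*6 + 6 + 6 + 1 = 49. Rank r uses
    // permutation r % 49, giving neighboring ranks different dimension
    // orderings to search in parallel.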
/*
    if (task_partition_along_longest_dim &&
        proc_partition_along_longest_dim) {
      std::cout <<"me:" << myRank << " task:longest proc:longest"
                << " numPerms:" << permutations << std::endl;
    }
    else if (proc_partition_along_longest_dim) {
      std::cout <<"me:" << myRank << " task:" << myTaskPerm
                << " proc:longest" << " numPerms:" << permutations << std::endl;
    }
    else if (task_partition_along_longest_dim) {
      std::cout <<"me:" << myRank << " task: longest" << " proc:"
                << myProcPerm << " numPerms:" << permutations << std::endl;
    }
    else {
      std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:"
                << myProcPerm << " numPerms:" << permutations << std::endl;
    }
*/

    int *permutation =
      allocMemory<int>((this->proc_coord_dim > this->task_coord_dim) ?
                       this->proc_coord_dim : this->task_coord_dim);

    // Get the permutation order from the proc permutation index.
    ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);

/*
    // Reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
    for (int i = 0; i < this->proc_coord_dim; ++i) {
      pcoords[i] = this->proc_coords[permutation[i]];
// std::cout << permutation[i] << " ";
    }
*/

    int procdim = this->proc_coord_dim;
    pcoord_t **pcoords = this->proc_coords;

/*
    int procdim = this->proc_coord_dim;
    procdim = 6;
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
    for (int i = 0; i < procdim; ++i) {
      pcoords[i] = new pcoord_t[used_num_procs];
// this->proc_coords[permutation[i]];
    }

    for (int k = 0; k < used_num_procs; k++) {
      pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
      pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8;

      pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
      pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
      pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
      pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;

      //if (this->proc_coords[0][k] == 40 &&
      //    this->proc_coords[1][k] == 8 &&
      //    this->proc_coords[2][k] == 48) {
      if (this->proc_coords[0][k] == 5 &&
          this->proc_coords[1][k] == 0 &&
          this->proc_coords[2][k] == 10) {
        std::cout << "pcoords[0][k]:" << pcoords[0][k]
                  << "pcoords[1][k]:" << pcoords[1][k]
                  << "pcoords[2][k]:" << pcoords[2][k]
                  << "pcoords[3][k]:" << pcoords[3][k]
                  << "pcoords[4][k]:" << pcoords[4][k]
                  << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
      }
      else if (pcoords[0][k] == 64 &&
               pcoords[1][k] == 0 &&
               pcoords[2][k] == 160 &&
               pcoords[3][k] == 16 &&
               pcoords[4][k] == 0 &&
               pcoords[5][k] == 10) {
        std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
                  << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
                  << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
                  << std::endl;
      }
    }
*/

// if (partNoArray == NULL)
//   std::cout << "partNoArray is null" << std::endl;
// std::cout << "recursion_depth:" << recursion_depth
//           << " partArraySize:" << partArraySize << std::endl;

    // Optimization for Dragonfly networks: the first level of
    // partitioning is imbalanced to ensure procs are divided by the
    // first RCA coord (a.k.a. group).
    part_t num_group_count = 1;
    part_t *group_count = NULL;

    if (machine != NULL)
      num_group_count = machine->getNumUniqueGroups();

    if (num_group_count > 1) {
      group_count = new part_t[num_group_count];
      memset(group_count, 0, sizeof(part_t) * num_group_count);

      machine->getGroupCount(group_count);
    }

    // Do the partitioning and renumber the parts.
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // Partitioning of Processors
    // NOTE: the declaration below restores a dropped line; the template
    // arguments assume the pre-Kokkos AlgMJ interface.
    AlgMJ<pcoord_t, part_t, part_t, part_t> mj_partitioner;

    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_procs,
        used_num_procs,
        num_parts,
        procdim,
        //minCoordDim,
        pcoords, //this->proc_coords,
        proc_adjList,
        proc_xadj,
        recursion_depth,
        partNoArray,
        proc_partition_along_longest_dim, //, false
        num_ranks_per_node,
        divide_to_prime_first,
        num_group_count,
        group_count
        );
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");

// comm_->barrier();
// std::cout << "mj_partitioner.for procs over" << std::endl;
// freeArray<pcoord_t *>(pcoords);

    part_t *task_xadj = allocMemory<part_t>(num_parts + 1);
    part_t *task_adjList = allocMemory<part_t>(this->no_tasks);

    // Fill task_adjList such that task_adjList[i] <- i.
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    // Get the permutation order from the task permutation index.
    ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);

    // Reorder task coordinate dimensions.
    tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
    for (int i = 0; i < this->task_coord_dim; ++i) {
      tcoords[i] = this->task_coords[permutation[i]];
    }

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
    // Partitioning of Tasks
    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_tasks,
        this->no_tasks,
        num_parts,
        this->task_coord_dim,
        //minCoordDim,
        tcoords, //this->task_coords,
        task_adjList,
        task_xadj,
        recursion_depth,
        partNoArray,
        task_partition_along_longest_dim,
        num_ranks_per_node,
        divide_to_prime_first,
        num_group_count,
        group_count
        //,"task_partitioning"
        //, false // (myRank == 6)
        );
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

// std::cout << "myrank:" << myRank << std::endl;
// comm_->barrier();
// std::cout << "mj_partitioner.sequential_task_partitioning over"
//           << std::endl;

    freeArray<tcoord_t *>(tcoords);
    freeArray<int>(permutation);
1621 
1622  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1623  for (part_t i = 0; i < num_parts; ++i) {
1624 
1625  part_t proc_index_begin = proc_xadj[i];
1626  part_t task_begin_index = task_xadj[i];
1627  part_t proc_index_end = proc_xadj[i + 1];
1628  part_t task_end_index = task_xadj[i + 1];
1629 
1630  if (proc_index_end - proc_index_begin != 1) {
1631  std::cerr << "Error at partitioning of processors" << std::endl;
1632  std::cerr << "PART:" << i << " is assigned to "
1633  << proc_index_end - proc_index_begin << " processors."
1634  << std::endl;
1635  exit(1);
1636  }
1637  part_t assigned_proc = proc_adjList[proc_index_begin];
1638  proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1639  }
1640 
1641  //holds the pointer to the task array
1642  //convert proc_to_task_xadj to CSR index array
1643  part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);
1644  part_t sum = 0;
1645  for (part_t i = 0; i < this->no_procs; ++i) {
1646  part_t tmp = proc_to_task_xadj[i];
1647  proc_to_task_xadj[i] = sum;
1648  sum += tmp;
1649  proc_to_task_xadj_work[i] = sum;
1650  }
1651  proc_to_task_xadj[this->no_procs] = sum;
1652 
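    // Worked example (editor's note, not in the original source): the
    // prefix sum above turns per-proc task counts into CSR offsets.
    // Counts {2, 0, 3} become proc_to_task_xadj = {0, 2, 2, 5}, while
    // proc_to_task_xadj_work = {2, 2, 5} is decremented as
    // proc_to_task_adj is filled in the loop below.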
    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i + 1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j) {
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }

/*
    if (myPermutation == 0) {
      std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);

      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
                    << proc_index_end - proc_index_begin << " processors."
                    << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
                    << this->proc_coords[0][assigned_proc] << " "
                    << this->proc_coords[1][assigned_proc] << " "
                    << this->proc_coords[2][assigned_proc] << " "
                    << pcoords[0][assigned_proc] << " "
                    << pcoords[1][assigned_proc] << " "
                    << pcoords[2][assigned_proc] << " "
                    << pcoords[3][assigned_proc] << std::endl;
      }

      gnuPlotCode << "Machine Extent:" << std::endl;
      //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
                    << proc_index_end - proc_index_begin << " processors."
                    << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
                    << this->proc_coords[0][assigned_proc] << " "
                    << this->proc_coords[1][assigned_proc] << " "
                    << this->proc_coords[2][assigned_proc] << std::endl;
      }
      gnuPlotCode.close();
    }
*/

    freeArray<part_t>(proc_to_task_xadj_work);
    freeArray<part_t>(task_xadj);
    freeArray<part_t>(task_adjList);
    freeArray<part_t>(proc_xadj);
    freeArray<part_t>(proc_adjList);
  }

};

template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;

#endif

// RCP<const Environment> env;

  // Holds the pointer to the task array
  ArrayRCP<part_t> proc_to_task_xadj;
// = allocMemory<part_t> (this->no_procs + 1);

  // Holds the indices of tasks wrt to proc_to_task_xadj array.
  ArrayRCP<part_t> proc_to_task_adj;
// = allocMemory<part_t>(this->no_tasks);

  // Holds the processors mapped to tasks.
  ArrayRCP<part_t> task_to_proc;
// = allocMemory<part_t>(this->no_procs);

  // Holds the ranks of the local tasks.
  ArrayRCP<part_t> local_task_to_rank;
// = allocMemory<part_t>(this->no_procs);

  // Communication graph of the tasks in CSR form, with edge weights.
  ArrayRCP<part_t> task_communication_xadj;
  ArrayRCP<part_t> task_communication_adj;
  ArrayRCP<scalar_t> task_communication_edge_weight;

  // Invokes the communication model to compute the proc_to_task and
  // task_to_proc arrays.
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {

    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
          myRank,
          this->env,
          // Holds the pointer to the task array
          this->proc_to_task_xadj,
          // Holds the indices of task wrt to proc_to_task_xadj array
          this->proc_to_task_adj,
          // Holds the processors mapped to tasks
          this->task_to_proc,
          comm_
          );
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      exit(1);
    }
  }

  // Creates the subcommunicator of the ranks that share this rank's
  // permutation group, so the best mapping can be reduced within it.
  RCP<Comm<int> > create_subCommunicator() {
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    // Get the number of different permutations for task dimension ordering
    int taskPerm = z2Fact<int>(taskDim);
    // Get the number of different permutations for proc dimension ordering
    int procPerm = z2Fact<int>(procDim);
    // Total number of permutations
    int idealGroupSize = taskPerm * procPerm;

    // For the one that does longest dimension partitioning.
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    if (myGroupEnd > commSize) {
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = allocMemory<part_t>(myGroupSize);
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }
// std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
//           << " myGroupEnd:" << myGroupEnd << endl;

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    freeArray<part_t>(myGroup);
    return subComm;
  }

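  // Worked arithmetic (editor's note, not in the original source): with
  // 3-D proc and task coordinates, idealGroupSize = 6*6 + 6 + 6 + 1 = 49,
  // matching the 49 permutations tried in getMapping(). Ranks 0-48 form
  // group 0, ranks 49-97 group 1, and a trailing partial group is merged
  // into the previous one.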

  // Finds the best mapping among the subcommunicator's ranks and
  // broadcasts it from the winning rank.
  void getBestMapping() {
    //create the sub group.
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    //calculate cost.
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
// std::cout << "me:" << this->comm->getRank() << " myCost:"
//           << myCost << std::endl;
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);

/*
    if (this->comm->getRank() == 0) {
      std::cout << "me:" << localCost[1] <<
        " localcost:" << localCost[0] <<
        " bestcost:" << globalCost[0] <<
        " Sender:" << sender <<
        " procDim:" << proc_task_comm->proc_coord_dim <<
        " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
    }
*/

// std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
//           << " bestcost:" << globalCost[0] << endl;
// std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
//           << endl;
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }

  //write mapping to gnuPlot code to visualize.
  void writeMapping() {
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim,
                       proc_task_comm->task_coord_dim);
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i) {

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0) {
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          inpFile << proc_task_comm->proc_coords[j][i];
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);

        }
        else {
          inpFile << proc_task_comm->proc_coords[j][i] << " ";
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->
                                     proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);

      for (int k = 0; k < a.size(); ++k) {
        int j = a[k];
// std::cout << "i:" << i << " j:"
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z) {
          if (z == mindim - 1) {

// std::cout << "z:" << z << " j:" << j << " "
//           << proc_task_comm->task_coords[z][j] << endl;
            inpFile << proc_task_comm->task_coords[z][j];
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else {
            inpFile << proc_task_comm->task_coords[z][j] << " ";
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->
                                       task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }

1966  //write mapping to gnuPlot code to visualize.
1967  void writeMapping2(int myRank) {
1968  std::string rankStr = Teuchos::toString<int>(myRank);
1969  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
1970  std::string outF = gnuPlots + rankStr+ extentionS;
1971  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
1972 
1974  *tmpproc_task_comm =
1975  static_cast <CoordinateCommunicationModel<pcoord_t, tcoord_t,
1976  part_t> * > (
1977  proc_task_comm);
1978 
1979 // int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
1980 // tmpproc_task_comm->task_coord_dim);
1981  int mindim = tmpproc_task_comm->proc_coord_dim;
1982  if (mindim != 3) {
1983  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
1984  return;
1985  }
1986  std::string ss = "";
1987  std::string procs = "";
1988 
1989  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
1990  for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
1991  ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
1992  if (a.size() == 0) {
1993  continue;
1994  }
1995 
1996  std::string gnuPlotArrow = "set arrow from ";
1997  for (int j = 0; j < mindim; ++j) {
1998  if (j == mindim - 1) {
1999  gnuPlotArrow +=
2000  Teuchos::toString<float>(tmpproc_task_comm->
2001  proc_coords[j][origin_rank]);
2002  procs +=
2003  Teuchos::toString<float>(tmpproc_task_comm->
2004  proc_coords[j][origin_rank]);
2005 
2006  }
2007  else {
2008  gnuPlotArrow +=
2009  Teuchos::toString<float>(tmpproc_task_comm->
2010  proc_coords[j][origin_rank]) + ",";
2011  procs +=
2012  Teuchos::toString<float>(tmpproc_task_comm->
2013  proc_coords[j][origin_rank])+ " ";
2014  }
2015  }
2016  procs += "\n";
2017 
2018  gnuPlotArrow += " to ";
2019 
2020 
2021  for (int k = 0; k < a.size(); ++k) {
2022  int origin_task = a[k];
2023 
2024  for (int nind = task_communication_xadj[origin_task];
2025  nind < task_communication_xadj[origin_task + 1]; ++nind) {
2026  int neighbor_task = task_communication_adj[nind];
2027 
2028  bool differentnode = false;
2029 
2030  int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2031 
2032  for (int j = 0; j < mindim; ++j) {
2033  if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2034  int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2035  differentnode = true; break;
2036  }
2037  }
2038  std::tuple<int,int,int, int, int, int> foo(
2039  int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2040  int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2041  int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2042  int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2043  int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2044  int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2045 
2046 
2047  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2048  my_arrows.insert(foo);
2049 
2050  std::string gnuPlotArrow2 = "";
2051  for (int j = 0; j < mindim; ++j) {
2052  if (j == mindim - 1) {
2053  gnuPlotArrow2 +=
2054  Teuchos::toString<float>(tmpproc_task_comm->
2055  proc_coords[j][neighbor_rank]);
2056  }
2057  else {
2058  gnuPlotArrow2 +=
2059  Teuchos::toString<float>(tmpproc_task_comm->
2060  proc_coords[j][neighbor_rank]) + ",";
2061  }
2062  }
2063  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2064  }
2065  }
2066  }
2067  }
2068 
2069  std::ofstream procFile("procPlot.plot", std::ofstream::out);
2070  procFile << procs << "\n";
2071  procFile.close();
2072 
2073  //gnuPlotCode << ss;
2074  if (mindim == 2) {
2075  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2076  } else {
2077  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2078  }
2079 
2080  gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2081  gnuPlotCode.close();
2082  }
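  // A minimal sketch of the arrow-deduplication idea used in
  // writeMapping2: a std::set of 6-int tuples (origin node xyz, neighbor
  // node xyz) ensures each pair of nodes is drawn at most once, no matter
  // how many task edges connect them. Names and coordinates here are
  // hypothetical:
  //
  //   #include <set>
  //   #include <tuple>
  //
  //   // returns true only the first time this node pair is seen
  //   bool insertArrowOnce(
  //       std::set<std::tuple<int,int,int,int,int,int> > &arrows,
  //       const int from[3], const int to[3]) {
  //     return arrows.insert(std::make_tuple(
  //         from[0], from[1], from[2], to[0], to[1], to[2])).second;
  //   }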
2083 
2084 
2085 // KDD Need to provide access to algorithm for getPartBoxes
2086 #ifdef gnuPlot
2087  void writeGnuPlot(
2088  const Teuchos::Comm<int> *comm_,
 2089  const Zoltan2::PartitioningSolution<Adapter> *soln_,
 2090  int coordDim,
2091  tcoord_t **partCenters) {
2092 
2093  std::string file = "gggnuPlot";
2094  std::string exten = ".plot";
2095  std::ofstream mm("2d.txt");
2096  file += Teuchos::toString<int>(comm_->getRank()) + exten;
2097  std::ofstream ff(file.c_str());
2098 // ff.seekg(0, ff.end);
2099  std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2100  outPartBoxes =
 2101  ((Zoltan2::PartitioningSolution<Adapter> *)soln_)->
 2102  getPartBoxesView();
2103 
2104  for (part_t i = 0; i < this->ntasks;++i) {
2105  outPartBoxes[i].writeGnuPlot(ff, mm);
2106  }
2107  if (coordDim == 2) {
2108  ff << "plot \"2d.txt\"" << std::endl;
2109 // ff << "\n pause -1" << endl;
2110  }
2111  else {
2112  ff << "splot \"2d.txt\"" << std::endl;
2113 // ff << "\n pause -1" << endl;
2114  }
2115  mm.close();
2116 
2117  ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2118  << std::endl;
2119 
2120  for (part_t i = 0; i < this->ntasks; ++i) {
 2121  part_t pb = task_communication_xadj[i];
 2122  part_t pe = task_communication_xadj[i + 1];
2123 
2124  for (part_t p = pb; p < pe; ++p) {
 2125  part_t n = task_communication_adj[p];
 2126 
2127 // std::cout << "i:" << i << " n:" << n << endl;
2128  std::string arrowline = "set arrow from ";
2129  for (int j = 0; j < coordDim - 1; ++j) {
2130  arrowline +=
2131  Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2132  }
2133  arrowline +=
2134  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2135  " to ";
2136 
2137  for (int j = 0; j < coordDim - 1; ++j) {
2138  arrowline +=
2139  Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2140  }
2141  arrowline +=
2142  Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2143  " as 5\n";
2144 
2145 // std::cout << "arrow:" << arrowline << endl;
2146  ff << arrowline;
2147  }
2148  }
2149 
2150  ff << "replot\n pause -1" << std::endl;
2151  ff.close();
2152  }
2153 #endif // gnuPlot
2154 
2155 public:
2156 
2157  void getProcTask(part_t* &proc_to_task_xadj_,
2158  part_t* &proc_to_task_adj_) {
2159  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2160  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2161  }
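  // A minimal sketch of how the raw arrays returned by getProcTask are
  // consumed: proc_to_task_xadj / proc_to_task_adj form a CSR structure,
  // so the tasks of processor p occupy adj[xadj[p] .. xadj[p+1]). The
  // caller below is hypothetical and assumes <iostream>:
  //
  //   void printTasksOfProc(const part_t *xadj, const part_t *adj,
  //                         part_t p) {
  //     for (part_t t = xadj[p]; t < xadj[p + 1]; ++t)
  //       std::cout << "proc " << p << " owns task " << adj[t] << "\n";
  //   }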
2162 
2163  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2164 
2165  // Mapping was already computed in the constructor; we need to store it
2166  // in the solution.
2167  mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2168 
2169  // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2170  // KDDKDD is not needed once we use MappingSolution to answer queries
2171  // KDDKDD instead of this algorithm.
2172  // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2173  // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2174  }
2175 
2176 
 2177  virtual ~CoordinateTaskMapper() {
 2178  //freeArray<part_t>(proc_to_task_xadj);
2179  //freeArray<part_t>(proc_to_task_adj);
2180  //freeArray<part_t>(task_to_proc);
2181  if (this->isOwnerofModel) {
2182  delete this->proc_task_comm;
2183  }
2184  }
2185 
 2186  void create_local_task_to_rank(
 2187  const lno_t num_local_coords,
2188  const part_t *local_coord_parts,
2189  const ArrayRCP<part_t> task_to_proc_) {
2190  local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2191 
2192  for (lno_t i = 0; i < num_local_coords; ++i) {
2193  part_t local_coord_part = local_coord_parts[i];
2194  part_t rank_index = task_to_proc_[local_coord_part];
2195  local_task_to_rank[i] = rank_index;
2196  }
2197  }
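  // A minimal sketch of what create_local_task_to_rank does: it composes
  // two maps -- local coordinate -> part and part -> rank -- into a single
  // local coordinate -> rank array. The same two-hop composition with
  // plain vectors (hypothetical data, assumes <vector>):
  //
  //   std::vector<int> composeRanks(const std::vector<int> &coordToPart,
  //                                 const std::vector<int> &partToRank) {
  //     std::vector<int> coordToRank(coordToPart.size());
  //     for (size_t i = 0; i < coordToPart.size(); ++i)
  //       coordToRank[i] = partToRank[coordToPart[i]];  // two-hop lookup
  //     return coordToRank;
  //   }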
2198 
 2199  /*! \brief Constructor. When this constructor is called, in order to
 2200  *  calculate the communication metric, the task communication graph
 2201  *  is computed from the graph model when the input adapter provides one.
 2202  */
 2212  CoordinateTaskMapper(
 2213  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2214  const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2215  machine_,
2216  const Teuchos::RCP <const Adapter> input_adapter_,
2217  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2218  soln_,
2219  const Teuchos::RCP <const Environment> envConst,
2220  bool is_input_adapter_distributed = true,
2221  int num_ranks_per_node = 1,
2222  bool divide_to_prime_first = false,
2223  bool reduce_best_mapping = true):
2224  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2225  soln_, envConst),
2226  proc_to_task_xadj(0),
2227  proc_to_task_adj(0),
2228  task_to_proc(0),
2229  isOwnerofModel(true),
2230  proc_task_comm(0),
 2231  task_communication_xadj(0),
 2232  task_communication_adj(0),
 2233  task_communication_edge_weight(0) {
 2234 
2235  using namespace Teuchos;
2236  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2237 
2238  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2239  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2240 
2241  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2242  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2243  if (!is_input_adapter_distributed) {
2244  ia_comm = Teuchos::createSerialComm<int>();
2245  }
2246 
2247  RCP<const Environment> envConst_ = envConst;
2248 
2249  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2250  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2251  input_adapter_.getRawPtr()), false));
2252 
2253  modelFlag_t coordFlags_, graphFlags_;
2254 
2255  //create coordinate model
2256  //since this is coordinate task mapper,
2257  //the adapter has to have the coordinates
2258  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2259  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2260 
2261  //if the adapter has also graph model, we will use graph model
2262  //to calculate the cost mapping.
2263  BaseAdapterType inputType_ = input_adapter_->adapterType();
2264  if (inputType_ == MatrixAdapterType ||
2265  inputType_ == GraphAdapterType ||
2266  inputType_ == MeshAdapterType)
2267  {
2268  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2269  baseInputAdapter_, envConst_, ia_comm,
2270  graphFlags_));
2271  }
2272 
2273  if (!machine_->hasMachineCoordinates()) {
2274  throw std::runtime_error("Existing machine does not provide "
2275  "coordinates for coordinate task mapping");
2276  }
2277 
2278  //if mapping type is 0 then it is coordinate mapping
2279  int procDim = machine_->getMachineDim();
2280  this->nprocs = machine_->getNumRanks();
2281 
2282  //get processor coordinates.
2283  pcoord_t **procCoordinates = NULL;
2284  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2285  throw std::runtime_error("Existing machine does not implement "
2286  "getAllMachineCoordinatesView");
2287  }
2288 
2289  //get the machine extent.
 2290  //If we have the machine extent and the machine has wrap-around
 2291  //links, we shift the coordinates so that the largest gap
 2292  //becomes the wrap-around link.
2293  std::vector<int> machine_extent_vec(procDim);
2294  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2295  int *machine_extent = &(machine_extent_vec[0]);
2296  bool *machine_extent_wrap_around = new bool[procDim];
2297  for (int i = 0; i < procDim; ++i)
2298  machine_extent_wrap_around[i] = false;
2299 
2300  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2301 
2302  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2303  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2304  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2305  // MD: Yes, I ADDED BELOW:
2306  if (machine_->getMachineExtent(machine_extent) &&
2307  haveWrapArounds) {
2308 
2309  procCoordinates =
 2310  this->shiftMachineCoordinates(
 2311  procDim,
2312  machine_extent,
2313  machine_extent_wrap_around,
2314  this->nprocs,
2315  procCoordinates);
2316  }
2317 
2318  //get the tasks information, such as coordinate dimension,
2319  //number of parts.
2320  int coordDim = coordinateModel_->getCoordinateDim();
2321 
2322 // int coordDim = machine_->getMachineDim();
2323 
2324  this->ntasks = soln_->getActualGlobalNumberOfParts();
2325  if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2326  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2327  }
2328  this->solution_parts = soln_->getPartListView();
2329 
2330  //we need to calculate the center of parts.
2331  tcoord_t **partCenters = NULL;
2332  partCenters = allocMemory<tcoord_t *>(coordDim);
2333  for (int i = 0; i < coordDim; ++i) {
2334  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2335  }
2336 
2337  typedef typename Adapter::scalar_t t_scalar_t;
2338 
2339 
2340  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2341 
2342  //get centers for the parts.
2343  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2344  envConst.getRawPtr(),
2345  ia_comm.getRawPtr(),
2346  coordinateModel_.getRawPtr(),
2347  this->solution_parts,
2348 // soln_->getPartListView();
2349 // this->soln.getRawPtr(),
2350  coordDim,
2351  ntasks,
2352  partCenters);
2353 
2354  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2355 
2356  //create the part graph
2357  if (graph_model_.getRawPtr() != NULL) {
2358  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2359  envConst.getRawPtr(),
2360  ia_comm.getRawPtr(),
2361  graph_model_.getRawPtr(),
2362  this->ntasks,
2363  this->solution_parts,
2364 // soln_->getPartListView(),
2365 // this->soln.getRawPtr(),
 2366  task_communication_xadj,
 2367  task_communication_adj,
 2368  task_communication_edge_weight
 2369  );
2370  }
2371 
2372  //create coordinate communication model.
2373  this->proc_task_comm =
2374  new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t,
2375  part_t>(
2376  procDim,
2377  procCoordinates,
2378  coordDim,
2379  partCenters,
2380  this->nprocs,
2381  this->ntasks,
2382  machine_extent,
2383  machine_extent_wrap_around,
2384  machine_.getRawPtr());
2385 
2386  int myRank = comm_->getRank();
 2387  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2388  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2389 
2390  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2391  this->doMapping(myRank, comm_);
2392  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2393 
2394  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2395 
2396 /*
2397  soln_->getCommunicationGraph(task_communication_xadj,
2398  task_communication_adj);
2399 */
2400 
2401  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2402  #ifdef gnuPlot1
2403  if (comm_->getRank() == 0) {
2404 
2405  part_t taskCommCount = task_communication_xadj.size();
2406  std::cout << " TotalComm:"
2407  << task_communication_xadj[taskCommCount] << std::endl;
2408  part_t maxN = task_communication_xadj[0];
2409  for (part_t i = 1; i <= taskCommCount; ++i) {
2410  part_t nc =
 2411  task_communication_xadj[i] - task_communication_xadj[i - 1];
 2412  if (maxN < nc)
2413  maxN = nc;
2414  }
2415  std::cout << " maxNeighbor:" << maxN << std::endl;
2416  }
2417 
2418  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2419  #endif
2420 
2421  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2422 
2423  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2424  task_communication_adj.getRawPtr()) {
2425  this->proc_task_comm->calculateCommunicationCost(
2426  task_to_proc.getRawPtr(),
2427  task_communication_xadj.getRawPtr(),
2428  task_communication_adj.getRawPtr(),
2429  task_communication_edge_weight.getRawPtr()
2430  );
2431  }
2432 
2433 // std::cout << "me: " << comm_->getRank() << " cost:"
2434 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2435 
2436  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2437 
 2438  //Processors are divided into groups of size procDim! * coordDim!;
 2439  //each processor in a group obtains a mapping with a different
 2440  //rotation, and the best one is broadcast to all processors.
2441  this->getBestMapping();
 2442  this->create_local_task_to_rank(
 2443  coordinateModel_->getLocalNumCoordinates(),
2444  this->solution_parts,
2445  this->task_to_proc);
2446 /*
2447  {
2448  if (task_communication_xadj.getRawPtr() &&
2449  task_communication_adj.getRawPtr())
2450  this->proc_task_comm->calculateCommunicationCost(
2451  task_to_proc.getRawPtr(),
2452  task_communication_xadj.getRawPtr(),
2453  task_communication_adj.getRawPtr(),
2454  task_communication_edge_weight.getRawPtr()
2455  );
2456  std::cout << "me: " << comm_->getRank() << " cost:"
2457  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2458  }
2459 */
2460 
2461 
2462  #ifdef gnuPlot
2463  this->writeMapping2(comm_->getRank());
2464  #endif
2465 
2466  delete [] machine_extent_wrap_around;
2467 
2468  if (machine_->getMachineExtent(machine_extent) &&
2469  haveWrapArounds) {
2470  for (int i = 0; i < procDim; ++i) {
2471  delete [] procCoordinates[i];
2472  }
2473  delete [] procCoordinates;
2474  }
2475 
2476  for (int i = 0; i < coordDim; ++i) {
2477  freeArray<tcoord_t>(partCenters[i]);
2478  }
2479  freeArray<tcoord_t *>(partCenters);
2480 
2481  }
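  // A minimal sketch of the "best mapping" selection referenced above:
  // a (cost, rank) pair is reduced across the rotation group -- minimum
  // cost wins, smaller rank breaks ties -- before the winner broadcasts
  // its arrays. The selection rule in isolation, with a hypothetical
  // per-rank cost array:
  //
  //   // returns the winning rank index
  //   int pickBestMapping(const double *costs, int nRanks) {
  //     int best = 0;
  //     for (int r = 1; r < nRanks; ++r)
  //       if (costs[r] < costs[best])  // ties keep the smaller rank
  //         best = r;
  //     return best;
  //   }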
2482 
2483 
 2484  /*! \brief Constructor. Instead of a Solution we have two parameters:
 2485  *  the number of parts and the resulting part assignment array.
 2486  */
 2500  CoordinateTaskMapper(
 2501  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2502  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2503  machine_,
2504  const Teuchos::RCP <const Adapter> input_adapter_,
2505  const part_t num_parts_,
2506  const part_t *result_parts,
2507  const Teuchos::RCP <const Environment> envConst,
2508  bool is_input_adapter_distributed = true,
2509  int num_ranks_per_node = 1,
2510  bool divide_to_prime_first = false,
2511  bool reduce_best_mapping = true):
2512  PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2513  num_parts_, result_parts, envConst),
2514  proc_to_task_xadj(0),
2515  proc_to_task_adj(0),
2516  task_to_proc(0),
2517  isOwnerofModel(true),
2518  proc_task_comm(0),
 2519  task_communication_xadj(0),
 2520  task_communication_adj(0),
 2521  task_communication_edge_weight(0) {
 2522 
2523  using namespace Teuchos;
2524  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2525 
2526  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2527  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2528 
2529  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2530  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2531  if (!is_input_adapter_distributed) {
2532  ia_comm = Teuchos::createSerialComm<int>();
2533  }
2534  RCP<const Environment> envConst_ = envConst;
2535 
2536  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2537  rcp(dynamic_cast<const ctm_base_adapter_t *>(
2538  input_adapter_.getRawPtr()), false));
2539 
2540  modelFlag_t coordFlags_, graphFlags_;
2541 
2542  //create coordinate model
2543  //since this is coordinate task mapper,
2544  //the adapter has to have the coordinates
2545  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2546  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2547 
2548  //if the adapter has also graph model, we will use graph model
2549  //to calculate the cost mapping.
2550  BaseAdapterType inputType_ = input_adapter_->adapterType();
2551  if (inputType_ == MatrixAdapterType ||
2552  inputType_ == GraphAdapterType ||
2553  inputType_ == MeshAdapterType)
2554  {
2555  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2556  baseInputAdapter_, envConst_, ia_comm,
2557  graphFlags_));
2558  }
2559 
2560  if (!machine_->hasMachineCoordinates()) {
2561  throw std::runtime_error("Existing machine does not provide "
2562  "coordinates for coordinate task mapping.");
2563  }
2564 
2565  //if mapping type is 0 then it is coordinate mapping
2566  int procDim = machine_->getMachineDim();
2567  this->nprocs = machine_->getNumRanks();
2568 
2569  //get processor coordinates.
2570  pcoord_t **procCoordinates = NULL;
2571  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2572  throw std::runtime_error("Existing machine does not implement "
2573  "getAllMachineCoordinatesView");
2574  }
2575 
2576  //get the machine extent.
 2577  //If we have the machine extent and the machine has wrap-around
 2578  //links, we shift the coordinates so that the largest gap
 2579  //becomes the wrap-around link.
2581  std::vector<int> machine_extent_vec(procDim);
2582 // std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2583  int *machine_extent = &(machine_extent_vec[0]);
2584  bool *machine_extent_wrap_around = new bool[procDim];
2585  bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2586 
2587  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2588  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2589  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2590  // MD: Yes, I ADDED BELOW:
2591  if (machine_->getMachineExtent(machine_extent) &&
2592  haveWrapArounds) {
2593  procCoordinates =
 2594  this->shiftMachineCoordinates(
 2595  procDim,
2596  machine_extent,
2597  machine_extent_wrap_around,
2598  this->nprocs,
2599  procCoordinates);
2600  }
2601 
2602  //get the tasks information, such as coordinate dimension,
2603  //number of parts.
2604  int coordDim = coordinateModel_->getCoordinateDim();
2605 
2606 // int coordDim = machine_->getMachineDim();
2607 
2608 
2609  this->ntasks = num_parts_;
2610  this->solution_parts = result_parts;
2611 
2612  //we need to calculate the center of parts.
2613  tcoord_t **partCenters = NULL;
2614  partCenters = allocMemory<tcoord_t *>(coordDim);
2615  for (int i = 0; i < coordDim; ++i) {
2616  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2617  }
2618 
2619  typedef typename Adapter::scalar_t t_scalar_t;
2620 
2621 
2622  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2623 
2624  //get centers for the parts.
2625  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2626  envConst.getRawPtr(),
2627  ia_comm.getRawPtr(),
2628  coordinateModel_.getRawPtr(),
2629  this->solution_parts,
2630 // soln_->getPartListView();
2631 // this->soln.getRawPtr(),
2632  coordDim,
2633  ntasks,
2634  partCenters);
2635 
2636  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2637 
2638  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2639  //create the part graph
2640  if (graph_model_.getRawPtr() != NULL) {
2641  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2642  envConst.getRawPtr(),
2643  ia_comm.getRawPtr(),
2644  graph_model_.getRawPtr(),
2645  this->ntasks,
2646  this->solution_parts,
2647 // soln_->getPartListView(),
2648 // this->soln.getRawPtr(),
 2649  task_communication_xadj,
 2650  task_communication_adj,
 2651  task_communication_edge_weight
 2652  );
2653  }
2654  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2655 
2656  envConst->timerStart(MACRO_TIMERS,
2657  "CoordinateCommunicationModel Create");
2658  //create coordinate communication model.
2659  this->proc_task_comm =
2660  new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t,
2661  part_t>(
2662  procDim,
2663  procCoordinates,
2664  coordDim,
2665  partCenters,
2666  this->nprocs,
2667  this->ntasks,
2668  machine_extent,
2669  machine_extent_wrap_around,
2670  machine_.getRawPtr());
2671 
2672  envConst->timerStop(MACRO_TIMERS,
2673  "CoordinateCommunicationModel Create");
2674 
2675 
2676  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2677  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2678 
2679  int myRank = comm_->getRank();
2680 
2681 
2682  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2683  this->doMapping(myRank, comm_);
2684  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2685 
2686 
2687  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2688 
2689 /*
2690  soln_->getCommunicationGraph(task_communication_xadj,
2691  task_communication_adj);
2692 */
2693 
2694  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2695  #ifdef gnuPlot1
2696  if (comm_->getRank() == 0) {
2697 
2698  part_t taskCommCount = task_communication_xadj.size();
2699  std::cout << " TotalComm:"
2700  << task_communication_xadj[taskCommCount] << std::endl;
2701  part_t maxN = task_communication_xadj[0];
2702  for (part_t i = 1; i <= taskCommCount; ++i) {
2703  part_t nc =
 2704  task_communication_xadj[i] - task_communication_xadj[i - 1];
 2705  if (maxN < nc)
2706  maxN = nc;
2707  }
2708  std::cout << " maxNeighbor:" << maxN << std::endl;
2709  }
2710 
2711  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2712  #endif
2713 
2714  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2715 
2716  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2717  task_communication_adj.getRawPtr()) {
2718  this->proc_task_comm->calculateCommunicationCost(
2719  task_to_proc.getRawPtr(),
2720  task_communication_xadj.getRawPtr(),
2721  task_communication_adj.getRawPtr(),
2722  task_communication_edge_weight.getRawPtr()
2723  );
2724  }
2725 
2726 // std::cout << "me: " << comm_->getRank() << " cost:"
2727 // << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2728 
2729  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2730 
 2731  //Processors are divided into groups of size procDim! * coordDim!;
 2732  //each processor in a group obtains a mapping with a different
 2733  //rotation, and the best one is broadcast to all processors.
2734  this->getBestMapping();
2735 
 2736  this->create_local_task_to_rank(
 2737  coordinateModel_->getLocalNumCoordinates(),
2738  this->solution_parts,
2739  this->task_to_proc);
2740 /*
2741  {
2742  if (task_communication_xadj.getRawPtr() &&
2743  task_communication_adj.getRawPtr())
2744  this->proc_task_comm->calculateCommunicationCost(
2745  task_to_proc.getRawPtr(),
2746  task_communication_xadj.getRawPtr(),
2747  task_communication_adj.getRawPtr(),
2748  task_communication_edge_weight.getRawPtr()
2749  );
2750  std::cout << "me: " << comm_->getRank() << " cost:"
2751  << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2752  }
2753 */
2754 
2755 
2756 
2757  #ifdef gnuPlot
2758  this->writeMapping2(comm_->getRank());
2759  #endif
2760 
2761  delete [] machine_extent_wrap_around;
2762 
2763  if (machine_->getMachineExtent(machine_extent) &&
2764  haveWrapArounds) {
2765  for (int i = 0; i < procDim; ++i) {
2766  delete [] procCoordinates[i];
2767  }
2768  delete [] procCoordinates;
2769  }
2770 
2771  for (int i = 0; i < coordDim; ++i) {
2772  freeArray<tcoord_t>(partCenters[i]);
2773  }
2774 
2775  freeArray<tcoord_t *>(partCenters);
2776  }
2777 
 2778  /*! \brief Constructor. The mapping constructor, which also performs
 2779  *  the mapping operation. The result mapping can be obtained by
 2780  *  --getAssignedProcForTask: returns the assigned processor id for
 2781  *  the given task; --getPartsForProc: returns the assigned tasks
 2782  *  with the number of tasks.
 2783  */
 2843  CoordinateTaskMapper(
 2844  const Environment *env_const_,
2845  const Teuchos::Comm<int> *problemComm,
2846  int proc_dim,
2847  int num_processors,
2848  pcoord_t **machine_coords,
2849  int task_dim,
2850  part_t num_tasks,
2851  tcoord_t **task_coords,
2852  ArrayRCP<part_t>task_comm_xadj,
2853  ArrayRCP<part_t>task_comm_adj,
2854  pcoord_t *task_communication_edge_weight_,
2855  int recursion_depth,
2856  part_t *part_no_array,
2857  const part_t *machine_dimensions,
2858  int num_ranks_per_node = 1,
2859  bool divide_to_prime_first = false,
2860  bool reduce_best_mapping = true):
2861  PartitionMapping<Adapter>(
2862  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2863  Teuchos::rcpFromRef<const Environment>(*env_const_)),
2864  proc_to_task_xadj(0),
2865  proc_to_task_adj(0),
2866  task_to_proc(0),
2867  isOwnerofModel(true),
2868  proc_task_comm(0),
2869  task_communication_xadj(task_comm_xadj),
2870  task_communication_adj(task_comm_adj) {
2871 
2872  //if mapping type is 0 then it is coordinate mapping
2873  pcoord_t ** virtual_machine_coordinates = machine_coords;
2874  bool *wrap_arounds = new bool [proc_dim];
2875  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2876 
2877  if (machine_dimensions) {
2878  virtual_machine_coordinates =
 2879  this->shiftMachineCoordinates(
 2880  proc_dim,
2881  machine_dimensions,
2882  wrap_arounds,
2883  num_processors,
2884  machine_coords);
2885  }
2886 
2887  this->nprocs = num_processors;
2888 
2889  int coordDim = task_dim;
2890  this->ntasks = num_tasks;
2891 
2892  //alloc memory for part centers.
2893  tcoord_t **partCenters = task_coords;
2894 
2895  //create coordinate communication model.
2896  this->proc_task_comm =
 2897  new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
 2898  proc_dim,
2899  virtual_machine_coordinates,
2900  coordDim,
2901  partCenters,
2902  this->nprocs,
2903  this->ntasks, NULL, NULL
2904  );
2905 
2906  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2907  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2908 
2909  this->proc_task_comm->setPartArraySize(recursion_depth);
2910  this->proc_task_comm->setPartArray(part_no_array);
2911 
2912  int myRank = problemComm->getRank();
2913 
2914  this->doMapping(myRank, this->comm);
2915 #ifdef gnuPlot
2916  this->writeMapping2(myRank);
2917 #endif
2918 
2919  if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2920  task_communication_adj.getRawPtr()) {
2921  this->proc_task_comm->calculateCommunicationCost(
2922  task_to_proc.getRawPtr(),
2923  task_communication_xadj.getRawPtr(),
2924  task_communication_adj.getRawPtr(),
2925  task_communication_edge_weight_
2926  );
2927 
2928 
2929  this->getBestMapping();
2930 
2931 /*
2932  if (myRank == 0) {
2933  this->proc_task_comm->calculateCommunicationCost(
2934  task_to_proc.getRawPtr(),
2935  task_communication_xadj.getRawPtr(),
2936  task_communication_adj.getRawPtr(),
2937  task_communication_edge_weight_
2938  );
2939  cout << "me: " << problemComm->getRank() << " cost:"
2940  << this->proc_task_comm->getCommunicationCostMetric() << endl;
2941  }
2942 */
2943 
2944  }
2945  delete [] wrap_arounds;
2946 
2947  if (machine_dimensions) {
2948  for (int i = 0; i < proc_dim; ++i) {
2949  delete [] virtual_machine_coordinates[i];
2950  }
2951  delete [] virtual_machine_coordinates;
2952  }
2953 #ifdef gnuPlot
2954  if (problemComm->getRank() == 0)
2955  this->writeMapping2(-1);
2956 #endif
2957  }
2958 
2959 
2960  /*
2961  double getCommunicationCostMetric() {
2962  return this->proc_task_comm->getCommCost();
2963  }
2964  */
2965 
 2966  /*! \brief Returns the number of parts to be assigned to this process.
 2967  */
 2968  virtual size_t getLocalNumberOfParts() const {
2969  return 0;
2970  }
2971 
 2972  /*! \brief Using the machine dimensions provided, create virtual
 2973  *  machine coordinates by assigning the largest gap of each dimension
 2974  *  to be the wrap-around link.
 2975  */
 2982  pcoord_t **shiftMachineCoordinates(
 2983  int machine_dim,
2984  const part_t *machine_dimensions,
2985  bool *machine_extent_wrap_around,
2986  part_t numProcs,
2987  pcoord_t **mCoords) {
2988 
2989  pcoord_t **result_machine_coords = NULL;
2990  result_machine_coords = new pcoord_t*[machine_dim];
2991 
2992  for (int i = 0; i < machine_dim; ++i) {
2993  result_machine_coords[i] = new pcoord_t [numProcs];
2994  }
2995 
2996  for (int i = 0; i < machine_dim; ++i) {
2997  part_t numMachinesAlongDim = machine_dimensions[i];
2998 
2999  part_t *machineCounts = new part_t[numMachinesAlongDim];
3000  memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3001 
3002  int *filledCoordinates = new int[numMachinesAlongDim];
3003 
3004  pcoord_t *coords = mCoords[i];
3005 
3006  for (part_t j = 0; j < numProcs; ++j) {
3007  part_t mc = (part_t) coords[j];
3008  ++machineCounts[mc];
3009  }
3010 
3011  part_t filledCoordinateCount = 0;
3012  for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3013  if (machineCounts[j] > 0) {
3014  filledCoordinates[filledCoordinateCount++] = j;
3015  }
3016  }
3017 
3018  part_t firstProcCoord = filledCoordinates[0];
3019  part_t firstProcCount = machineCounts[firstProcCoord];
3020 
3021  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3022  part_t lastProcCount = machineCounts[lastProcCoord];
3023 
3024  part_t firstLastGap =
3025  numMachinesAlongDim - lastProcCoord + firstProcCoord;
3026  part_t firstLastGapProc = lastProcCount + firstProcCount;
3027 
3028  part_t leftSideProcCoord = firstProcCoord;
3029  part_t leftSideProcCount = firstProcCount;
3030  part_t biggestGap = 0;
3031  part_t biggestGapProc = numProcs;
3032 
3033  part_t shiftBorderCoordinate = -1;
3034  for (part_t j = 1; j < filledCoordinateCount; ++j) {
 3035  part_t rightSideProcCoord = filledCoordinates[j];
3036  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3037 
3038  part_t gap = rightSideProcCoord - leftSideProcCoord;
3039  part_t gapProc = rightSideProcCount + leftSideProcCount;
3040 
 3041  // Pick the largest gap in this dimension. Ties are broken in favor
 3042  // of the gap with fewer processes on either side. An easy extension
 3043  // would be to weight each gap by the number of processes.
3045  if (gap > biggestGap ||
3046  (gap == biggestGap && biggestGapProc > gapProc)) {
3047  shiftBorderCoordinate = rightSideProcCoord;
3048  biggestGapProc = gapProc;
3049  biggestGap = gap;
3050  }
3051  leftSideProcCoord = rightSideProcCoord;
3052  leftSideProcCount = rightSideProcCount;
3053  }
3054 
3055 
3056  if (!(biggestGap > firstLastGap ||
3057  (biggestGap == firstLastGap &&
3058  biggestGapProc < firstLastGapProc))) {
3059  shiftBorderCoordinate = -1;
3060  }
3061 
3062  for (part_t j = 0; j < numProcs; ++j) {
3063 
3064  if (machine_extent_wrap_around[i] &&
3065  coords[j] < shiftBorderCoordinate) {
3066  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3067 
3068  }
3069  else {
3070  result_machine_coords[i][j] = coords[j];
3071  }
3072  }
3073  delete [] machineCounts;
3074  delete [] filledCoordinates;
3075  }
3076 
3077  return result_machine_coords;
3078 
3079  }
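  // A worked example of the shift, on a hypothetical 1-D machine of
  // extent 8 with ranks at coordinates {0, 1, 6, 7}: the largest interior
  // gap lies between 1 and 6, so shiftBorderCoordinate becomes 6; with
  // wrap-around enabled, every coordinate below 6 is shifted up by the
  // extent, {0, 1, 6, 7} -> {8, 9, 6, 7}, making the occupied nodes the
  // contiguous range 6..9 so that Euclidean distances reflect the true
  // torus distances. The shift step in isolation:
  //
  //   void shift1D(double *coords, int n, int extent, double border) {
  //     for (int j = 0; j < n; ++j)
  //       if (coords[j] < border)  // left of the largest gap
  //         coords[j] += extent;   // move across the wrap-around link
  //   }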
3080 
 3081  /*! \brief getProcsForPart function; returns the processor assigned
 3082  *  to the given task, with the number of processors (always 1 here).
 3083  */
 3088  virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3089  part_t *&procs) const {
3090  numProcs = 1;
3091  procs = this->task_to_proc.getRawPtr() + taskId;
3092  }
3093 
 3094  /*! \brief getAssignedProcForTask function; returns the assigned
 3095  *  processor id for the given task.
 3096  */
 3098  part_t getAssignedProcForTask(part_t taskId) {
 3099  return this->task_to_proc[taskId];
3100  }
3101 
 3102  /*! \brief getPartsForProc function; returns the tasks assigned to
 3103  *  the given processor, with the number of tasks.
 3104  */
 3109  virtual void getPartsForProc(int procId, part_t &numParts,
3110  part_t *&parts) const {
3111 
3112  part_t task_begin = this->proc_to_task_xadj[procId];
3113  part_t taskend = this->proc_to_task_xadj[procId + 1];
3114 
3115  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3116  numParts = taskend - task_begin;
3117  }
3118 
3119  ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3120  part_t task_begin = this->proc_to_task_xadj[procId];
3121  part_t taskend = this->proc_to_task_xadj[procId + 1];
3122 
3123 /*
3124  std::cout << "part_t:" << procId << " taskCount:"
3125  << taskend - task_begin << std::endl;
3126 
3127  for (part_t i = task_begin; i < taskend; ++i) {
3128  std::cout << "part_t:" << procId << " task:"
3129  << proc_to_task_adj[i] << endl;
3130  }
3131 */
3132  if (taskend - task_begin > 0) {
3133  ArrayView <part_t> assignedParts(
3134  this->proc_to_task_adj.getRawPtr() + task_begin,
3135  taskend - task_begin);
3136 
3137  return assignedParts;
3138  }
3139  else {
3140  ArrayView <part_t> assignedParts;
3141 
3142  return assignedParts;
3143  }
3144  }
3145 
3146 };
3147 
 3148 /*! \brief The interface function that calls CoordinateTaskMapper,
 3149  *  which will also perform the mapping operation. The resulting
 3150  *  mapping is written to the output arrays proc_to_task_xadj and
 3151  *  proc_to_task_adj.
 3152  */
 3226 template <typename part_t, typename pcoord_t, typename tcoord_t>
 3227 void coordinateTaskMapperInterface(
 3228  RCP<const Teuchos::Comm<int> > problemComm,
3229  int proc_dim,
3230  int num_processors,
3231  pcoord_t **machine_coords,
3232  int task_dim,
3233  part_t num_tasks,
3234  tcoord_t **task_coords,
3235  part_t *task_comm_xadj,
3236  part_t *task_comm_adj,
 3237  // float-like; same size as task_communication_adj_; the weight of
 3238  // the corresponding edge.
3239  pcoord_t *task_communication_edge_weight_,
3240  part_t *proc_to_task_xadj, /*output*/
3241  part_t *proc_to_task_adj, /*output*/
3242  int recursion_depth,
3243  part_t *part_no_array,
3244  const part_t *machine_dimensions,
3245  int num_ranks_per_node = 1,
3246  bool divide_to_prime_first = false) {
3247 
3248  const Environment *envConst_ = new Environment(problemComm);
3249 
3250  // mfh 03 Mar 2015: It's OK to omit the Node template
3251  // parameter in Tpetra, if you're just going to use the
3252  // default Node.
3253  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3254 
3255  Teuchos::ArrayRCP<part_t> task_communication_xadj(
3256  task_comm_xadj, 0, num_tasks + 1, false);
3257 
3258  Teuchos::ArrayRCP<part_t> task_communication_adj;
3259  if (task_comm_xadj) {
3260  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3261  task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3262  task_communication_adj = tmp_task_communication_adj;
3263  }
3264 
 3265 
 3266  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>
 3267  *ctm = new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>,
 3268  part_t>(
3269  envConst_,
3270  problemComm.getRawPtr(),
3271  proc_dim,
3272  num_processors,
3273  machine_coords,
3274 // machine_coords_,
3275 
3276  task_dim,
3277  num_tasks,
3278  task_coords,
3279 
 3280  task_communication_xadj,
 3281  task_communication_adj,
 3282  task_communication_edge_weight_,
3283  recursion_depth,
3284  part_no_array,
3285  machine_dimensions,
3286  num_ranks_per_node,
3287  divide_to_prime_first);
3288 
3289 
3290  part_t* proc_to_task_xadj_;
3291  part_t* proc_to_task_adj_;
3292 
3293  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3294 
3295  for (part_t i = 0; i <= num_processors; ++i) {
3296  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3297  }
3298 
3299  for (part_t i = 0; i < num_tasks; ++i) {
3300  proc_to_task_adj[i] = proc_to_task_adj_[i];
3301  }
3302 
3303  delete ctm;
3304  delete envConst_;
3305 }
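// A minimal sketch of driving coordinateTaskMapperInterface with a toy
// 1-D machine of two processors and two mutually communicating tasks.
// All data here is hypothetical, and comm is assumed to be an existing
// RCP<const Teuchos::Comm<int> >:
//
//   double mc0[2] = {0.0, 1.0};    double *machine_coords[1] = {mc0};
//   double tc0[2] = {0.25, 0.75};  double *task_coords[1] = {tc0};
//   int xadj[3] = {0, 1, 2};       // task 0 <-> task 1
//   int adj[2] = {1, 0};
//   int p2t_xadj[3], p2t_adj[2];
//   Zoltan2::coordinateTaskMapperInterface<int, double, double>(
//       comm,
//       1, 2, machine_coords,      // proc_dim, num_processors, coords
//       1, 2, task_coords,         // task_dim, num_tasks, coords
//       xadj, adj,
//       NULL,                      // no edge weights
//       p2t_xadj, p2t_adj,         // output arrays
//       8,                         // recursion_depth
//       NULL, NULL);               // part_no_array, machine_dimensions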
3306 
3307 template <typename proc_coord_t, typename v_lno_t>
3308 inline void visualize_mapping(int myRank,
3309  const int machine_coord_dim,
3310  const int num_ranks,
3311  proc_coord_t **machine_coords,
3312  const v_lno_t num_tasks,
3313  const v_lno_t *task_communication_xadj,
3314  const v_lno_t *task_communication_adj,
3315  const int *task_to_rank) {
3316 
3317  std::string rankStr = Teuchos::toString<int>(myRank);
 3318  std::string gnuPlots = "gnuPlot", extensionS = ".plot";
 3319  std::string outF = gnuPlots + rankStr + extensionS;
 3320  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
3321 
3322  if (machine_coord_dim != 3) {
 3323  std::cerr << "Mapping write supports only 3 dimensions" << std::endl;
3324  return;
3325  }
3326  std::string ss = "";
3327  std::string procs = "";
3328 
3329  std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3330 
3331  for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3332  int origin_rank = task_to_rank[origin_task];
3333  std::string gnuPlotArrow = "set arrow from ";
3334 
3335  for (int j = 0; j < machine_coord_dim; ++j) {
3336  if (j == machine_coord_dim - 1) {
3337  gnuPlotArrow +=
3338  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3339  procs +=
3340  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3341 
3342  }
3343  else {
3344  gnuPlotArrow +=
3345  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3346  + ",";
3347  procs +=
3348  Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3349  + " ";
3350  }
3351  }
3352  procs += "\n";
3353 
3354  gnuPlotArrow += " to ";
3355 
3356 
3357  for (int nind = task_communication_xadj[origin_task];
3358  nind < task_communication_xadj[origin_task + 1]; ++nind) {
3359 
3360  int neighbor_task = task_communication_adj[nind];
3361 
3362  bool differentnode = false;
3363  int neighbor_rank = task_to_rank[neighbor_task];
3364 
3365  for (int j = 0; j < machine_coord_dim; ++j) {
3366  if (int(machine_coords[j][origin_rank]) !=
3367  int(machine_coords[j][neighbor_rank])) {
3368  differentnode = true; break;
3369  }
3370  }
3371 
3372  std::tuple<int,int,int, int, int, int> foo(
3373  (int)(machine_coords[0][origin_rank]),
3374  (int)(machine_coords[1][origin_rank]),
3375  (int)(machine_coords[2][origin_rank]),
3376  (int)(machine_coords[0][neighbor_rank]),
3377  (int)(machine_coords[1][neighbor_rank]),
3378  (int)(machine_coords[2][neighbor_rank]));
3379 
3380  if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3381  my_arrows.insert(foo);
3382 
3383  std::string gnuPlotArrow2 = "";
3384  for (int j = 0; j < machine_coord_dim; ++j) {
3385  if (j == machine_coord_dim - 1) {
3386  gnuPlotArrow2 +=
3387  Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3388  }
3389  else {
3390  gnuPlotArrow2 +=
3391  Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3392  + ",";
3393  }
3394  }
3395  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3396  }
3397  }
3398  }
3399 
3400  std::ofstream procFile("procPlot.plot", std::ofstream::out);
3401  procFile << procs << "\n";
3402  procFile.close();
3403 
3404  //gnuPlotCode << ss;
3405  if (machine_coord_dim == 2) {
3406  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3407  }
3408  else {
3409  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3410  }
3411 
 3412  gnuPlotCode << ss << "\nreplot\n pause -1\n";
3413  gnuPlotCode.close();
3414 }
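// A minimal sketch of calling visualize_mapping, which requires 3-D
// machine coordinates: a toy setup with two ranks and two communicating
// tasks (all data hypothetical):
//
//   double x[2] = {0, 1}, y[2] = {0, 0}, z[2] = {0, 0};
//   double *machine_coords[3] = {x, y, z};
//   int xadj[3] = {0, 1, 2};       // task 0 <-> task 1
//   int adj[2] = {1, 0};
//   int task_to_rank[2] = {0, 1};
//   Zoltan2::visualize_mapping<double, int>(
//       0, 3, 2, machine_coords, 2, xadj, adj, task_to_rank);
//   // then view with: gnuplot gnuPlot0.plot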
3415 
3416 } // namespace Zoltan2
3417 
3418 #endif