Zoltan2_TaskMapping.hpp
1 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
2 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
3 
4 #include <fstream>
5 #include <ctime>
6 #include <vector>
7 #include <set>
8 #include <tuple>
9 
10 #include "Zoltan2_Standards.hpp"
11 #include "Zoltan2_AlgMultiJagged.hpp"
12 #include "Teuchos_ArrayViewDecl.hpp"
13 #include "Zoltan2_PartitionMapping.hpp"
14 #include "Zoltan2_MachineRepresentation.hpp"
15 #include "Teuchos_ReductionOp.hpp"
16 
17 
18 
19 #include "Zoltan2_GraphModel.hpp"
20 
21 #include <Zoltan2_TPLTraits.hpp>
22 
23 #include "Teuchos_Comm.hpp"
24 #ifdef HAVE_ZOLTAN2_MPI
25 #include "Teuchos_DefaultMpiComm.hpp"
26 #endif // HAVE_ZOLTAN2_MPI
27 #include <Teuchos_DefaultSerialComm.hpp>
28 
29 //#define gnuPlot
32 
33 namespace Teuchos{
34 
37 template <typename Ordinal, typename T>
38 class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
39 {
40 private:
41  T _EPSILON;
42 
43 public:
46  Zoltan2_ReduceBestMapping():_EPSILON(std::numeric_limits<T>::epsilon()){}
47 
50  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
51  {
52 
53  for (Ordinal i=0; i < count; i++){
54  if (inBuffer[0] - inoutBuffer[0] < -_EPSILON){
55  inoutBuffer[0] = inBuffer[0];
56  inoutBuffer[1] = inBuffer[1];
57  } else if(inBuffer[0] - inoutBuffer[0] < _EPSILON &&
58  inBuffer[1] - inoutBuffer[1] < _EPSILON){
59  inoutBuffer[0] = inBuffer[0];
60  inoutBuffer[1] = inBuffer[1];
61  }
62  }
63  }
64 };
65 } // namespace Teuchos
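
// A minimal usage sketch for the reduction above (it mirrors how
// getBestMapping uses it later in this file; `comm` and `myCost` are assumed
// names): each rank contributes a (cost, rank) pair, and the reduction keeps
// the pair with the smallest cost, breaking near-ties toward the smaller rank.
//
//   double localCost[2] = { myCost, double(comm->getRank()) };
//   double globalCost[2];
//   Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
//   Teuchos::reduceAll<int, double>(*comm, reduceBest, 2, localCost, globalCost);
//   // globalCost[0] is the best cost, globalCost[1] the rank that achieved it.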
66 
67 
68 namespace Zoltan2{
69 
70 template <typename it>
71 inline it z2Fact(it x) {
72  return (x <= 1 ? 1 : x * z2Fact<it>(x - 1)); //x <= 1 so that z2Fact(0) terminates
73 }
74 
75 template <typename gno_t, typename part_t>
76 class GNO_LNO_PAIR{
77 public:
78  gno_t gno;
79  part_t part;
80 };
81 
82 //returns the ith permutation indices.
83 template <typename IT>
84 void ithPermutation(const IT n, IT i, IT *perm)
85 {
86  IT j, k = 0;
87  IT *fact = new IT[n];
88 
89 
90  // compute factorial numbers
91  fact[k] = 1;
92  while (++k < n)
93  fact[k] = fact[k - 1] * k;
94 
95  // compute factorial code
96  for (k = 0; k < n; ++k)
97  {
98  perm[k] = i / fact[n - 1 - k];
99  i = i % fact[n - 1 - k];
100  }
101 
102  // readjust values to obtain the permutation
103  // start from the end and check if preceding values are lower
104  for (k = n - 1; k > 0; --k)
105  for (j = k - 1; j >= 0; --j)
106  if (perm[j] <= perm[k])
107  perm[k]++;
108 
109  delete [] fact;
110 }
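// A small worked example: for n = 3 the lexicographic permutations of
// {0,1,2} are 012, 021, 102, 120, 201, 210, so asking for the permutation
// with index 3 gives {1,2,0}:
//
//   int perm[3];
//   ithPermutation<int>(3, 3, perm); // perm = {1, 2, 0}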
111 
112 template <typename part_t>
113 void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector<int> grid_dims){
114  int dim = grid_dims.size();
115  int neighborCount = 2 * dim;
116  task_comm_xadj = allocMemory<part_t>(taskCount+1);
117  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);
118 
119  part_t neighBorIndex = 0;
120  task_comm_xadj[0] = 0;
121  for (part_t i = 0; i < taskCount; ++i){
122  part_t prevDimMul = 1;
123  for (int j = 0; j < dim; ++j){
124  part_t lNeighbor = i - prevDimMul;
125  part_t rNeighbor = i + prevDimMul;
126  prevDimMul *= grid_dims[j];
127  if (lNeighbor >= 0 && lNeighbor/ prevDimMul == i / prevDimMul && lNeighbor < taskCount){
128  task_comm_adj[neighBorIndex++] = lNeighbor;
129  }
130  if (rNeighbor >= 0 && rNeighbor/ prevDimMul == i / prevDimMul && rNeighbor < taskCount){
131  task_comm_adj[neighBorIndex++] = rNeighbor;
132  }
133  }
134  task_comm_xadj[i+1] = neighBorIndex;
135  }
136 
137 }
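// A worked example: for a 2x2 grid (grid_dims = {2,2}, taskCount = 4) the
// neighbor lists are 0 -> {1,2}, 1 -> {0,3}, 2 -> {3,0}, 3 -> {2,1}, so the
// function produces
//   task_comm_xadj = {0, 2, 4, 6, 8}
//   task_comm_adj  = {1,2, 0,3, 3,0, 2,1}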
138 //returns the center of the parts.
139 template <typename Adapter, typename scalar_t, typename part_t>
140 void getSolutionCenterCoordinates(
141  const Environment *envConst,
142  const Teuchos::Comm<int> *comm,
143  const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
144  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
145  const part_t *parts,
146  int coordDim,
147  part_t ntasks,
148  scalar_t **partCenters){
149 
150  typedef typename Adapter::lno_t lno_t;
151  typedef typename Adapter::gno_t gno_t;
152 
153  typedef StridedData<lno_t, scalar_t> input_t;
154  ArrayView<const gno_t> gnos;
155  ArrayView<input_t> xyz;
156  ArrayView<input_t> wgts;
157  coords->getCoordinates(gnos, xyz, wgts);
158 
159  //local and global num coordinates.
160  lno_t numLocalCoords = coords->getLocalNumCoordinates();
161  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();
162 
163 
164 
165  //local number of points in each part.
166  gno_t *point_counts = allocMemory<gno_t>(ntasks);
167  memset(point_counts, 0, sizeof(gno_t) * ntasks);
168 
169  //global number of points in each part.
170  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);
171 
172 
173  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);
174 
175  for (int dim=0; dim < coordDim; dim++){
176  ArrayRCP<const scalar_t> ar;
177  xyz[dim].getInputArray(ar);
178  //multiJagged coordinate values assignment
179  multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
180  memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
181  }
182 
183  //get parts with parallel gnos.
184  //const part_t *parts = soln_->getPartListView();
185  /*
186  for (lno_t i=0; i < numLocalCoords; i++){
187  cout << "me:" << comm->getRank() << " gno:" << soln_gnos[i] << " tmp.part :" << parts[i]<< endl;
188  }
189  */
190 
191 
192  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");
193 
194 
195  for (lno_t i=0; i < numLocalCoords; i++){
196  part_t p = parts[i];
197  //add up all coordinates in each part.
198  for(int j = 0; j < coordDim; ++j){
199  scalar_t c = multiJagged_coordinates[j][i];
200  partCenters[j][p] += c;
201  }
202  ++point_counts[p];
203  }
204 
205  //get global number of points in each part.
206  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
207  ntasks, point_counts, global_point_counts
208  );
209 
210  for(int j = 0; j < coordDim; ++j){
211  for (part_t i=0; i < ntasks; ++i){
212  partCenters[j][i] /= global_point_counts[i];
213  }
214  }
215 
216  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
217  for(int j = 0; j < coordDim; ++j){
218  reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
219  ntasks, partCenters[j], tmpCoords
220  );
221 
222  scalar_t *tmp = partCenters[j];
223  partCenters[j] = tmpCoords;
224  tmpCoords = tmp;
225  }
226  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");
227 
228  freeArray<gno_t>(point_counts);
229  freeArray<gno_t>(global_point_counts);
230 
231  freeArray<scalar_t>(tmpCoords);
232  freeArray<scalar_t *>(multiJagged_coordinates);
233 }
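// Net effect: for part p and dimension j,
//   partCenters[j][p] = (sum over ranks of the local coordinate sums) / global_point_counts[p].
// Each rank scales its local sums by the global counts first, so the
// REDUCE_SUM of the scaled sums (into a scratch buffer that is then swapped
// with that dimension's center array) yields exactly this average.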
234 
235 //returns the coarsened part graph.
236 template <typename Adapter, typename scalar_t, typename part_t>
237 void getCoarsenedPartGraph(
238  const Environment *envConst,
239  const Teuchos::Comm<int> *comm,
240  const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
241  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
242  part_t np,
243  const part_t *parts,
244  ArrayRCP<part_t> &g_part_xadj,
245  ArrayRCP<part_t> &g_part_adj,
246  ArrayRCP<scalar_t> &g_part_ew
247  ){
248 
249  typedef typename Adapter::lno_t t_lno_t;
250  typedef typename Adapter::gno_t t_gno_t;
251  typedef typename Adapter::scalar_t t_scalar_t;
252  typedef typename Adapter::offset_t t_offset_t;
253  typedef StridedData<t_lno_t, t_scalar_t> t_input_t;
254 
255  //int numRanks = comm->getSize();
256  //int myRank = comm->getRank();
257 
258  //get parts with parallel gnos.
259  /*
260  const part_t *parts = soln_->getPartListView();
261 
262  part_t np = soln_->getActualGlobalNumberOfParts();
263  if (part_t(soln_->getTargetGlobalNumberOfParts()) > np){
264  np = soln_->getTargetGlobalNumberOfParts();
265  }
266  */
267 
268 
269  t_lno_t localNumVertices = graph->getLocalNumVertices();
270  t_lno_t localNumEdges = graph->getLocalNumEdges();
271 
272  //get the vertex global ids, and weights
273  ArrayView<const t_gno_t> Ids;
274  ArrayView<t_input_t> v_wghts;
275  graph->getVertexList(Ids, v_wghts);
276 
277  //get the edge ids, and weights
278  ArrayView<const t_gno_t> edgeIds;
279  ArrayView<const t_offset_t> offsets;
280  ArrayView<t_input_t> e_wgts;
281  graph->getEdgeList(edgeIds, offsets, e_wgts);
282 
283  std::vector<t_scalar_t> edge_weights;
284  int numWeightPerEdge = graph->getNumWeightsPerEdge();
285 
286  if (numWeightPerEdge > 0){
287  edge_weights = std::vector<t_scalar_t>(localNumEdges);
288  for (t_lno_t i = 0; i < localNumEdges; ++i){
289  edge_weights[i] = e_wgts[0][i];
290  }
291  }
292 
293  //create a zoltan dictionary to get the parts of the vertices
294  //at the other end of edges
295  std::vector<part_t> e_parts(localNumEdges);
296 #ifdef HAVE_ZOLTAN2_MPI
297  if (comm->getSize() > 1)
298  {
299  const bool bUseLocalIDs = false; // Local IDs not needed
300  typedef Zoltan2_Directory_Simple<t_gno_t,t_lno_t,part_t> directory_t;
301  int debug_level = 0;
302  const RCP<const Comm<int> > rcp_comm(comm,false);
303  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
304  directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
305  NULL, directory_t::Update_Mode::Replace);
306  directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
307  NULL, NULL, false);
308  } else
309 #endif
310  {
311 
312  /*
313  std::cout << "localNumVertices:" << localNumVertices
314  << " np:" << np
315  << " globalNumVertices:" << graph->getGlobalNumVertices()
316  << " localNumEdges:" << localNumEdges << std::endl;
317  */
318 
319  for (t_lno_t i = 0; i < localNumEdges; ++i){
320  t_gno_t ei = edgeIds[i];
321  part_t p = parts[ei];
322  e_parts[i] = p;
323  }
324 
325  //get the vertices in each part in my part.
326  std::vector<t_lno_t> part_begins(np, -1);
327  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
328 
329  //cluster vertices according to their parts.
330  //create local part graph.
331  for (t_lno_t i = 0; i < localNumVertices; ++i){
332  part_t ap = parts[i];
333  part_nexts[i] = part_begins[ap];
334  part_begins[ap] = i;
335  }
336 
337 
338  g_part_xadj = ArrayRCP<part_t>(np + 1);
339  g_part_adj = ArrayRCP<part_t>(localNumEdges);
340  g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
341  part_t nindex = 0;
342  g_part_xadj[0] = 0;
343  std::vector<part_t> part_neighbors(np);
344  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
345  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
346 
347  //coarsen for all vertices in my part in order with parts.
348  for (t_lno_t i = 0; i < np; ++i){
349  part_t num_neighbor_parts = 0;
350  t_lno_t v = part_begins[i];
351  //get part i, and first vertex in this part v.
352  while (v != -1){
353  //now get the neighbors of v.
354  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j){
355  //get the part of the second vertex.
356  part_t ep = e_parts[j];
357 
358  t_scalar_t ew = 1;
359  if (numWeightPerEdge > 0){
360  ew = edge_weights[j];
361  }
362  //std::cout << "part:" << i << " v:" << v << " part2:" << ep << " v2:" << edgeIds[j] << " w:" << ew << std::endl;
363  //add it to my local part neighbors for part i.
364  if (part_neighbor_weights[ep] < 0.00001){
365  part_neighbors[num_neighbor_parts++] = ep;
366  }
367  part_neighbor_weights[ep] += ew;
368  }
369  v = part_nexts[v];
370  }
371 
372 
373  //now get the part list.
374  for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
375  part_t neighbor_part = part_neighbors[j];
376  g_part_adj[nindex] = neighbor_part;
377  g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
378  part_neighbor_weights[neighbor_part] = 0;
379  }
380  g_part_xadj[i + 1] = nindex;
381  }
382  return;
383  }
384 
385  // struct for directory data - see Zoltan2_GraphMetricsUtility.hpp for
386  // more extensive comments on the same concept; they are not duplicated
387  // here because this code is still in progress.
388  struct part_info {
389  part_info() : weight(0) {
390  }
391  const part_info & operator+=(const part_info & src) {
392  this->weight += src.weight;
393  return *this;
394  }
395  bool operator>(const part_info & src) {
396  return (destination_part > src.destination_part);
397  }
398  bool operator==(const part_info & src) {
399  return (destination_part == src.destination_part);
400  }
401  part_t destination_part;
402  scalar_t weight;
403  };
404 
405  bool bUseLocalIDs = false;
406  const int debug_level = 0;
407  typedef Zoltan2_Directory_Vector<part_t,int,std::vector<part_info>>
408  directory_t;
409  const RCP<const Comm<int> > rcp_comm(comm,false);
410  directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
411  std::vector<part_t> part_data;
412  std::vector<std::vector<part_info>> user_data;
413 
414  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
415  {
416  //get the vertices in each part in my part.
417  std::vector<t_lno_t> part_begins(np, -1);
418  std::vector<t_lno_t> part_nexts(localNumVertices, -1);
419 
420  //cluster vertices according to their parts.
421  //create local part graph.
422  for (t_lno_t i = 0; i < localNumVertices; ++i){
423  part_t ap = parts[i];
424  part_nexts[i] = part_begins[ap];
425  part_begins[ap] = i;
426  }
427 
428  std::vector<part_t> part_neighbors(np);
429  std::vector<t_scalar_t> part_neighbor_weights(np, 0);
430  std::vector<t_scalar_t> part_neighbor_weights_ordered(np);
431 
432  //coarsen for all vertices in my part in order with parts.
433  for (t_lno_t i = 0; i < np; ++i){
434  part_t num_neighbor_parts = 0;
435  t_lno_t v = part_begins[i];
436  //get part i, and first vertex in this part v.
437  while (v != -1){
438  //now get the neighbors of v.
439  for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j){
440  //get the part of the second vertex.
441  part_t ep = e_parts[j];
442 
443  t_scalar_t ew = 1;
444  if (numWeightPerEdge > 0){
445  ew = edge_weights[j];
446  }
447  //add it to my local part neighbors for part i.
448  if (part_neighbor_weights[ep] < 0.00001){
449  part_neighbors[num_neighbor_parts++] = ep;
450  }
451  part_neighbor_weights[ep] += ew;
452  }
453  v = part_nexts[v];
454  }
455 
456  //now get the part list.
457  for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
458  part_t neighbor_part = part_neighbors[j];
459  part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part];
460  part_neighbor_weights[neighbor_part] = 0;
461  }
462 
463  //insert it to tpetra crsmatrix.
464  if (num_neighbor_parts > 0){
465  part_data.push_back(i); // TODO: optimize to avoid push_back
466  std::vector<part_info> new_user_data(num_neighbor_parts);
467  for(int q = 0; q < num_neighbor_parts; ++q) {
468  part_info & info = new_user_data[q];
469  info.weight = part_neighbor_weights_ordered[q];
470  info.destination_part = part_neighbors[q];
471  }
472  user_data.push_back(new_user_data); // TODO: optimize to avoid push_back
473  }
474  }
475  }
476  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
477 
478  std::vector<part_t> part_indices(np);
479 
480  for (part_t i = 0; i < np; ++i) part_indices[i] = i;
481 
482  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
483  directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
484  NULL, directory_t::Update_Mode::AggregateAdd);
485  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");
486 
487  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
488  std::vector<std::vector<part_info>> find_user_data(part_indices.size());
489  directory.find(find_user_data.size(), &part_indices[0], NULL,
490  &find_user_data[0], NULL, NULL, false);
491  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");
492 
493  // Now reconstruct the output data from the directory find data
494  // This code was designed to reproduce the exact format of the original
495  // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
496  // further changes I wanted to verify if this formatting should be
497  // preserved or potentially changed further.
498 
499  // first thing is get the total number of elements
500  int get_total_length = 0;
501  for(size_t n = 0; n < find_user_data.size(); ++n) {
502  get_total_length += find_user_data[n].size();
503  }
504 
505  // setup data space
506  g_part_xadj = ArrayRCP<part_t> (np + 1);
507  g_part_adj = ArrayRCP<part_t> (get_total_length);
508  g_part_ew = ArrayRCP<t_scalar_t> (get_total_length);
509 
510  // loop through again and fill to match the original formatting
511  int track_insert_index = 0;
512  for(size_t n = 0; n < find_user_data.size(); ++n) {
513  g_part_xadj[n] = track_insert_index;
514  const std::vector<part_info> & user_data_vector = find_user_data[n];
515  for(size_t q = 0; q < user_data_vector.size(); ++q) {
516  const part_info & info = user_data_vector[q];
517  g_part_adj[track_insert_index] = info.destination_part;
518  g_part_ew[track_insert_index] = info.weight;
519  ++track_insert_index;
520  }
521  }
522  g_part_xadj[np] = get_total_length; // complete the series
523 
524  // This is purely for debugging logging and to be deleted
525  // hacky sort here just to make each subset of part destinations ordered
526  // so new and old code will produce identical output for validation. I think
527  // the ordering is arbitrary and no requirement to preserve the old ordering
528  // within each part. This code was written so I could drop it into the
529  // develop branch and validate we were producing identical results at this
530  // step. Would like to discuss this further before continuing.
531  // TODO: Delete all this
532  /*
533  if(comm->getRank() == 0) {
534  for(size_t i = 0; i < (size_t)g_part_xadj.size() - 1; ++i) {
535  part_t b = g_part_xadj[i];
536  part_t e = g_part_xadj[i+1];
537  printf("Results for part %d:\n", (int)i);
538  for(part_t scan_part = 0; scan_part < np; ++scan_part) {
539  for(part_t q = b; q < e; ++q) {
540  if(g_part_adj[q] == scan_part) { // inefficient hack to sort g_part_adj
541  printf( "(%d:%.2f) ", (int)scan_part, (float)g_part_ew[q]);
542  }
543  }
544  }
545  printf("\n");
546  }
547  }
548  */
549 
550 }
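// A small worked example for the serial branch: 4 vertices with parts
// {0,0,1,1} and the undirected chain 0-1, 1-2, 2-3 (each edge stored in
// both directions, unit weights). Internal edges show up as self-neighbors,
// so the coarsened graph is
//   g_part_xadj = {0, 2, 4}
//   g_part_adj  = {0, 1,  1, 0}
//   g_part_ew   = {2, 1,  2, 1}
// i.e. each part sees itself with weight 2 (the internal edge counted from
// both endpoints) and the other part with weight 1.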
551 
552 
555 template <class IT, class WT>
556 class KmeansHeap{
557  IT heapSize;
558  IT *indices;
559  WT *values;
560  WT _EPSILON;
561 
562 
563 public:
564  void setHeapsize(IT heapsize_){
565  this->heapSize = heapsize_;
566  this->indices = allocMemory<IT>(heapsize_ );
567  this->values = allocMemory<WT>(heapsize_ );
568  this->_EPSILON = std::numeric_limits<WT>::epsilon();
569  }
570 
571  ~KmeansHeap(){
572  freeArray<IT>(this->indices);
573  freeArray<WT>(this->values);
574  }
575 
576 
577  void addPoint(IT index, WT distance){
578  WT maxVal = this->values[0];
579  //add only if the distance is smaller than the current maximum distance.
580  //cout << "indeX:" << index << "distance:" <<distance << " maxVal:" << maxVal << endl;
581  if (distance >= maxVal) return;
582  else {
583  this->values[0] = distance;
584  this->indices[0] = index;
585  this->push_down(0);
586  }
587  }
588 
589  //heap push down operation
590  void push_down(IT index_on_heap){
591  IT child_index1 = 2 * index_on_heap + 1;
592  IT child_index2 = 2 * index_on_heap + 2;
593 
594  IT biggerIndex = -1;
595  if(child_index1 < this->heapSize && child_index2 < this->heapSize){
596 
597  if (this->values[child_index1] < this->values[child_index2]){
598  biggerIndex = child_index2;
599  }
600  else {
601  biggerIndex = child_index1;
602  }
603  }
604  else if(child_index1 < this->heapSize){
605  biggerIndex = child_index1;
606 
607  }
608  else if(child_index2 < this->heapSize){
609  biggerIndex = child_index2;
610  }
611  if (biggerIndex >= 0 && this->values[biggerIndex] > this->values[index_on_heap]){
612  WT tmpVal = this->values[biggerIndex];
613  this->values[biggerIndex] = this->values[index_on_heap];
614  this->values[index_on_heap] = tmpVal;
615 
616  IT tmpIndex = this->indices[biggerIndex];
617  this->indices[biggerIndex] = this->indices[index_on_heap];
618  this->indices[index_on_heap] = tmpIndex;
619  this->push_down(biggerIndex);
620  }
621  }
622 
623  void initValues(){
624  WT MAXVAL = std::numeric_limits<WT>::max();
625  for(IT i = 0; i < this->heapSize; ++i){
626  this->values[i] = MAXVAL;
627  this->indices[i] = -1;
628  }
629  }
630 
631  //returns the total distance to center in the cluster.
632  WT getTotalDistance(){
633 
634  WT nc = 0;
635  for(IT j = 0; j < this->heapSize; ++j){
636  nc += this->values[j];
637 
638  //cout << "index:" << this->indices[j] << " distance:" << this->values[j] << endl;
639  }
640  return nc;
641  }
642 
643  //returns the new center of the cluster.
644  bool getNewCenters(WT *center, WT **coords, int dimension){
645  bool moved = false;
646  for(int i = 0; i < dimension; ++i){
647  WT nc = 0;
648  for(IT j = 0; j < this->heapSize; ++j){
649  IT k = this->indices[j];
650  //cout << "i:" << i << " dim:" << dimension << " k:" << k << " heapSize:" << heapSize << endl;
651  nc += coords[i][k];
652  }
653  nc /= this->heapSize;
654  moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved );
655  center[i] = nc;
656 
657  }
658  return moved;
659  }
660 
661  void copyCoordinates(IT *permutation){
662  for(IT i = 0; i < this->heapSize; ++i){
663  permutation[i] = this->indices[i];
664  }
665  }
666 };
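// Note: KmeansHeap is a fixed-size max-heap that retains the heapSize
// smallest distances seen so far. For example, with heapSize = 2, feeding
// addPoint the distances 5, 3, 8, 1 (after initValues()) leaves the heap
// holding {3, 1}: 8 is rejected at the root, and each smaller value
// displaces the current maximum.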
667 
670 template <class IT, class WT>
671 class KMeansCluster{
672 
673  int dimension;
674  KmeansHeap<IT,WT> closestPoints;
675 
676 public:
677  WT *center;
678  ~KMeansCluster(){
679  freeArray<WT>(center);
680  }
681 
682  void setParams(int dimension_, int heapsize){
683  this->dimension = dimension_;
684  this->center = allocMemory<WT>(dimension_);
685  this->closestPoints.setHeapsize(heapsize);
686  }
687 
688  void clearHeap(){
689  this->closestPoints.initValues();
690  }
691 
692  bool getNewCenters( WT **coords){
693  return this->closestPoints.getNewCenters(center, coords, dimension);
694  }
695 
696  //returns the distance of the coordinate to the center.
697  //also adds it to the heap.
698  WT getDistance(IT index, WT **elementCoords){
699  WT distance = 0;
700  for (int i = 0; i < this->dimension; ++i){
701  WT d = (center[i] - elementCoords[i][index]);
702  distance += d * d;
703  }
704  distance = pow(distance, WT(1.0 / this->dimension));
705  closestPoints.addPoint(index, distance);
706  return distance;
707  }
708 
709  WT getDistanceToCenter(){
710  return closestPoints.getTotalDistance();
711  }
712 
713  void copyCoordinates(IT *permutation){
714  closestPoints.copyCoordinates(permutation);
715  }
716 };
717 
721 template <class IT, class WT>
722 class KMeansAlgorithm{
723 
724  int dim;
725  IT numElements;
726  WT **elementCoords;
727  IT numClusters;
728  IT required_elements;
729  KMeansCluster<IT,WT> *clusters;
730  WT *maxCoordinates;
731  WT *minCoordinates;
732 public:
733  ~KMeansAlgorithm(){
734  freeArray<KMeansCluster<IT,WT> >(clusters);
735  freeArray<WT>(maxCoordinates);
736  freeArray<WT>(minCoordinates);
737  }
738 
741  KMeansAlgorithm(
742  int dim_ ,
743  IT numElements_,
744  WT **elementCoords_,
745  IT required_elements_):
746  dim(dim_),
747  numElements(numElements_),
748  elementCoords(elementCoords_),
749  numClusters((1 << dim_) + 1),
750  required_elements(required_elements_)
751  {
752  this->clusters = allocMemory<KMeansCluster<IT,WT> >(this->numClusters);
753  //set dimension and the number of required elements for all clusters.
754  for (int i = 0; i < numClusters; ++i){
755  this->clusters[i].setParams(this->dim, this->required_elements);
756  }
757 
758  this->maxCoordinates = allocMemory <WT>(this->dim);
759  this->minCoordinates = allocMemory <WT>(this->dim);
760 
761  //obtain the min and max coordinates for each dimension.
762  for (int j = 0; j < dim; ++j){
763  this->minCoordinates[j] = this->maxCoordinates[j] = this->elementCoords[j][0];
764  for(IT i = 1; i < numElements; ++i){
765  WT t = this->elementCoords[j][i];
766  if(t > this->maxCoordinates[j]){
767  this->maxCoordinates[j] = t;
768  }
769  if (t < minCoordinates[j]){
770  this->minCoordinates[j] = t;
771  }
772  }
773  }
774 
775 
776  //assign initial cluster centers.
777  for (int j = 0; j < dim; ++j){
778  int mod = (1 << (j+1));
779  for (int i = 0; i < numClusters - 1; ++i){
780  WT c = 0;
781  if ( (i % mod) < mod / 2){
782  c = this->maxCoordinates[j];
783  //cout << "i:" << i << " j:" << j << " setting max:" << c << endl;
784  }
785  else {
786  c = this->minCoordinates[j];
787  }
788  this->clusters[i].center[j] = c;
789  }
790  }
791 
792  //last cluster center is placed to middle.
793  for (int j = 0; j < dim; ++j){
794  this->clusters[numClusters - 1].center[j] = (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
795  }
796 
797 
798  /*
799  for (int i = 0; i < numClusters; ++i){
800  //cout << endl << "cluster:" << i << endl << "\t";
801  for (int j = 0; j < dim; ++j){
802  cout << this->clusters[i].center[j] << " ";
803  }
804  }
805  */
806  }
807 
808  //performs kmeans clustering of coordinates.
809  void kmeans(){
810  for(int it = 0; it < 10; ++it){
811  //cout << "it:" << it << endl;
812  for (IT j = 0; j < this->numClusters; ++j){
813  this->clusters[j].clearHeap();
814  }
815  for (IT i = 0; i < this->numElements; ++i){
816  //cout << "i:" << i << " numEl:" << this->numElements << endl;
817  for (IT j = 0; j < this->numClusters; ++j){
818  //cout << "j:" << j << " numClusters:" << this->numClusters << endl;
819  this->clusters[j].getDistance(i,this->elementCoords);
820  }
821  }
822  bool moved = false;
823  for (IT j = 0; j < this->numClusters; ++j){
824  moved =(this->clusters[j].getNewCenters(this->elementCoords) || moved );
825  }
826  if (!moved){
827  break;
828  }
829  }
830 
831 
832  }
833 
834  //finds the cluster in which the coordinates are the closest to each other.
835  void getMinDistanceCluster(IT *procPermutation){
836 
837  WT minDistance = this->clusters[0].getDistanceToCenter();
838  IT minCluster = 0;
839  //cout << "j:" << 0 << " minDistance:" << minDistance << " minTmpDistance:" << minDistance<< " minCluster:" << minCluster << endl;
840  for (IT j = 1; j < this->numClusters; ++j){
841  WT minTmpDistance = this->clusters[j].getDistanceToCenter();
842  //cout << "j:" << j << " minDistance:" << minDistance << " minTmpDistance:" << minTmpDistance<< " minCluster:" << minCluster << endl;
843  if(minTmpDistance < minDistance){
844  minDistance = minTmpDistance;
845  minCluster = j;
846  }
847  }
848 
849  //cout << "minCluster:" << minCluster << endl;
850  this->clusters[minCluster].copyCoordinates(procPermutation);
851  }
852 };
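// A minimal usage sketch, mirroring getClosestSubset below (`dim`,
// `numProcs`, `procCoords`, `numTasks`, and `proc_permutation` are assumed
// names; procCoords is a dim x numProcs coordinate array): cluster the
// processors and pull out the numTasks processors closest to each other.
//
//   KMeansAlgorithm<part_t, pcoord_t> kma(dim, numProcs, procCoords, numTasks);
//   kma.kmeans();
//   kma.getMinDistanceCluster(proc_permutation); // first numTasks entries used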
853 
854 
855 
856 #define MINOF(a,b) (((a)<(b))?(a):(b))
857 
864 template <typename T>
865 void fillContinousArray(T *arr, size_t arrSize, T *val){
866  if(val == NULL){
867 
868 #ifdef HAVE_ZOLTAN2_OMP
869 #pragma omp parallel for
870 #endif
871  for(size_t i = 0; i < arrSize; ++i){
872  arr[i] = i;
873  }
874 
875  }
876  else {
877  T v = *val;
878 #ifdef HAVE_ZOLTAN2_OMP
879 #pragma omp parallel for
880 #endif
881  for(size_t i = 0; i < arrSize; ++i){
882  //cout << "writing to i:" << i << " arr:" << arrSize << endl;
883  arr[i] = v;
884  }
885  }
886 }
887 
890 template <typename part_t, typename pcoord_t>
891 class CommunicationModel{
892 protected:
893  double commCost;
894 public:
895 
896  part_t no_procs; //the number of processors
897  part_t no_tasks; //the number of tasks.
899  CommunicationModel(part_t no_procs_, part_t no_tasks_):
900  commCost(),
901  no_procs(no_procs_),
902  no_tasks(no_tasks_){}
903  virtual ~CommunicationModel(){}
904  part_t getNProcs() const{
905  return this->no_procs;
906  }
907  part_t getNTasks() const{
908  return this->no_tasks;
909  }
910 
911 
912  void calculateCommunicationCost(
913  part_t *task_to_proc,
914  part_t *task_communication_xadj,
915  part_t *task_communication_adj,
916  pcoord_t *task_communication_edge_weight){
917 
918  double totalCost = 0;
919 
920  part_t commCount = 0;
921  for (part_t task = 0; task < this->no_tasks; ++task){
922  int assigned_proc = task_to_proc[task];
923  //cout << "task:" << task << endl;
924  part_t task_adj_begin = task_communication_xadj[task];
925  part_t task_adj_end = task_communication_xadj[task+1];
926 
927  commCount += task_adj_end - task_adj_begin;
928  //cout << "task:" << task << " proc:" << assigned_proc << endl;
929  for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2){
930 
931  //cout << "task2:" << task2 << endl;
932  part_t neighborTask = task_communication_adj[task2];
933  //cout << "neighborTask :" << neighborTask << endl;
934  int neighborProc = task_to_proc[neighborTask];
935 
936  double distance = getProcDistance(assigned_proc, neighborProc);
937 
938  if (task_communication_edge_weight == NULL){
939  totalCost += distance ;
940  }
941  else {
942  totalCost += distance * task_communication_edge_weight[task2];
943 
944  /*
945  std::cout << "\ttask:" << task << " assigned_proc:" << assigned_proc <<
946  "task2:" << task << " neighborProc:" << neighborProc <<
947  " d:" << distance << " task_communication_edge_weight[task2]:" << task_communication_edge_weight[task2] <<
948  " wh:" << distance * task_communication_edge_weight[task2] <<
949  std::endl;
950  */
951  }
952  }
953  }
954 
955  this->commCost = totalCost;// commCount;
956  }
957 
958  double getCommunicationCostMetric(){
959  return this->commCost;
960  }
961 
962  virtual double getProcDistance(int procId1, int procId2) const = 0;
963 
970  virtual void getMapping(
971  int myRank,
972  const RCP<const Environment> &env,
973  ArrayRCP <part_t> &proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
974  ArrayRCP <part_t> &proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
975  ArrayRCP <part_t> &task_to_proc //allocMemory<part_t>(this->no_tasks); //holds the processors mapped to tasks.
976  ,const Teuchos::RCP <const Teuchos::Comm<int> > comm_
977  ) const = 0;
978 };
982 template <typename pcoord_t, typename tcoord_t, typename part_t>
983 class CoordinateCommunicationModel:public CommunicationModel<part_t, pcoord_t> {
984 public:
985  //private:
986  int proc_coord_dim; //dimension of the processors
987  pcoord_t **proc_coords; //the processor coordinates. allocated outside of the class.
988  int task_coord_dim; //dimension of the tasks coordinates.
989  tcoord_t **task_coords; //the task coordinates allocated outside of the class.
990  int partArraySize;
991  part_t *partNoArray;
992 
993  int *machine_extent;
994  bool *machine_extent_wrap_around;
995  const MachineRepresentation<pcoord_t,part_t> *machine;
996 
997  int num_ranks_per_node;
998  bool divide_to_prime_first;
999 
1000  //public:
1001  CoordinateCommunicationModel():
1002  CommunicationModel<part_t, pcoord_t>(),
1003  proc_coord_dim(0),
1004  proc_coords(0),
1005  task_coord_dim(0),
1006  task_coords(0),
1007  partArraySize(-1),
1008  partNoArray(NULL),
1009  machine_extent(NULL),
1011  machine(NULL),
1012  num_ranks_per_node(1),
1013  divide_to_prime_first(false){}
1014 
1015  virtual ~CoordinateCommunicationModel(){}
1016 
1025  CoordinateCommunicationModel(
1026  int pcoord_dim_,
1027  pcoord_t **pcoords_,
1028  int tcoord_dim_,
1029  tcoord_t **tcoords_,
1030  part_t no_procs_,
1031  part_t no_tasks_,
1032  int *machine_extent_,
1033  bool *machine_extent_wrap_around_,
1034  const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
1035  ):
1036  CommunicationModel<part_t, pcoord_t>(no_procs_, no_tasks_),
1037  proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
1038  task_coord_dim(tcoord_dim_), task_coords(tcoords_),
1039  partArraySize(-1),
1040  partNoArray(NULL),
1041  machine_extent(machine_extent_),
1042  machine_extent_wrap_around(machine_extent_wrap_around_),
1043  machine(machine_),
1044  num_ranks_per_node(1),
1045  divide_to_prime_first(false){
1046  }
1047 
1048 
1049  void setPartArraySize(int psize){
1050  this->partArraySize = psize;
1051  }
1052  void setPartArray(part_t *pNo){
1053  this->partNoArray = pNo;
1054  }
1055 
1062  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const{
1063  //returns the subset of processors that are closest to each other,
1064  //selected via the k-means clustering below.
1064 
1065  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1066  KMeansAlgorithm<part_t, pcoord_t> kma(
1067  minCoordDim, nprocs,
1068  this->proc_coords, ntasks);
1069 
1070  kma.kmeans();
1071  kma.getMinDistanceCluster(proc_permutation);
1072 
1073  for(int i = ntasks; i < nprocs; ++i){
1074  proc_permutation[i] = -1;
1075  }
1076  /*
1077  //fill array.
1078  fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
1079  int _u_umpa_seed = 847449649;
1080  srand (time(NULL));
1081  int a = rand() % 1000 + 1;
1082  _u_umpa_seed -= a;
1083  //permute array randomly.
1084  update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
1085  */
1086  }
1087 
1088  //temporary, necessary for random permutation.
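  // This is the Park-Miller "minimal standard" linear congruential
  // generator (a = 16807, m = 2^31 - 1), implemented with Schrage's
  // factorization (q = m / a = 127773, r = m % a = 2836) to avoid 32-bit
  // overflow; the result is scaled to an integer uniform in [0, l).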
1089  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
1090  {
1091  int a = 16807;
1092  int m = 2147483647;
1093  int q = 127773;
1094  int r = 2836;
1095  int lo, hi, test;
1096  double d;
1097 
1098  lo = _u_umpa_seed % q;
1099  hi = _u_umpa_seed / q;
1100  test = (a*lo)-(r*hi);
1101  if (test>0)
1102  _u_umpa_seed = test;
1103  else
1104  _u_umpa_seed = test + m;
1105  d = (double) ((double) _u_umpa_seed / (double) m);
1106  return (part_t) (d*(double)l);
1107  }
1108 
1109  virtual double getProcDistance(int procId1, int procId2) const{
1110  pcoord_t distance = 0;
1111  if (machine == NULL){
1112  for (int i = 0 ; i < this->proc_coord_dim; ++i){
1113  double d = ZOLTAN2_ABS(proc_coords[i][procId1] - proc_coords[i][procId2]);
1114  if (machine_extent_wrap_around && machine_extent_wrap_around[i]){
1115  if (machine_extent[i] - d < d){
1116  d = machine_extent[i] - d;
1117  }
1118  }
1119  distance += d;
1120  }
1121  }
1122  else {
1123  this->machine->getHopCount(procId1, procId2, distance);
1124  }
1125 
1126  return distance;
1127  }
1128 
1129 
1130  //temporary, does random permutation.
1131  void update_visit_order(part_t* visitOrder, part_t n, int &_u_umpa_seed, part_t rndm) {
1132  part_t *a = visitOrder;
1133 
1134 
1135  if (rndm){
1136  part_t i, u, v, tmp;
1137 
1138  if (n <= 4)
1139  return;
1140 
1141  //srand ( time(NULL) );
1142 
1143  //_u_umpa_seed = _u_umpa_seed1 - (rand()%100);
1144  for (i=0; i<n; i+=16)
1145  {
1146  u = umpa_uRandom(n-4, _u_umpa_seed);
1147  v = umpa_uRandom(n-4, _u_umpa_seed);
1148 
1149  // FIXME (mfh 30 Sep 2015) This requires including Zoltan2_AlgMultiJagged.hpp.
1150 
1151  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1152  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+1], a[u+1], tmp);
1153  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+2], a[u+2], tmp);
1154  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+3], a[u+3], tmp);
1155 
1156  }
1157  }
1158  else {
1159  part_t i, end = n / 4;
1160 
1161  for (i=1; i<end; i++)
1162  {
1163  part_t j=umpa_uRandom(n-i, _u_umpa_seed);
1164  part_t t=a[j];
1165  a[j] = a[n-i];
1166  a[n-i] = t;
1167  }
1168  }
1169  //PermuteInPlace(visitOrder, n);
1170  }
1171 
1172 
1173 
1180  virtual void getMapping(
1181  int myRank,
1182  const RCP<const Environment> &env,
1183  ArrayRCP <part_t> &rcp_proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
1184  ArrayRCP <part_t> &rcp_proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
1185  ArrayRCP <part_t> &rcp_task_to_proc //allocMemory<part_t>(this->no_tasks); //holds the processors mapped to tasks.
1186  ,const Teuchos::RCP <const Teuchos::Comm<int> > comm_
1187  ) const{
1188 
1189  rcp_proc_to_task_xadj = ArrayRCP <part_t>(this->no_procs+1);
1190  rcp_proc_to_task_adj = ArrayRCP <part_t>(this->no_tasks);
1191  rcp_task_to_proc = ArrayRCP <part_t>(this->no_tasks);
1192 
1193  part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr(); //holds the pointer to the task array
1194  part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr(); //holds the indices of tasks wrt to proc_to_task_xadj array.
1195  part_t *task_to_proc = rcp_task_to_proc.getRawPtr(); //holds the processors mapped to tasks.
1196 
1197 
1198  part_t invalid = 0;
1199  fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs+1, &invalid);
1200 
1201  //obtain the number of parts that should be divided.
1202  part_t num_parts = MINOF(this->no_procs, this->no_tasks);
1203  //obtain the min coordinate dim.
1204  //We no longer use the min coord dim: if the machine dimension is greater
1205  //than the task dim, we would end up with a long line.
1206  //part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1207 
1208  int recursion_depth = partArraySize;
1209  //if(partArraySize < minCoordDim) recursion_depth = minCoordDim;
1210  if (partArraySize == -1){
1211 
1212  if (divide_to_prime_first){
1213  //it is difficult to estimate the number of steps in this case, as each branch will have a different depth.
1214  //The worst case happens when all prime factors are 3: for P = 3^n, n recursion levels divide the parts into 2x and x,
1215  //and another n levels divide each 2x into x and x.
1216  //set it to an upper bound here.
1217  //we could calculate the exact value as well, but the partitioning algorithm skips the unused levels anyway.
1218  recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
1219  }
1220  else {
1221  recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
1222  }
1223  }
1224 
1225 
1226  int taskPerm = z2Fact<int>(this->task_coord_dim); //get the number of different permutations for task dimension ordering
1227  int procPerm = z2Fact<int>(this->proc_coord_dim); //get the number of different permutations for proc dimension ordering
1228 
1229  int permutations = taskPerm * procPerm; //total number of permutations
1230  //now add the ones, where we divide the processors with longest dimension,
1231  //but task with order.
1232  permutations += taskPerm;
1233  //and divide tasks with longest dimension, and processors with order.
1234  permutations += procPerm; //total number of permutations
1235  //and both with longest dimension.
1236  permutations += 1;
1237  //add one also that partitions based the longest dimension.
1238 
1239  //holds the pointers to proc_adjList
1240  part_t *proc_xadj = allocMemory<part_t>(num_parts+1);
1241  //holds the processors in parts according to the result of partitioning algorithm.
1242  //the processors assigned to part x is at proc_adjList[ proc_xadj[x] : proc_xadj[x+1] ]
1243  part_t *proc_adjList = allocMemory<part_t>(this->no_procs);
1244 
1245 
1246  part_t used_num_procs = this->no_procs;
1247  if(this->no_procs > this->no_tasks){
1248  //obtain the subset of the processors that are closest to each other.
1249  this->getClosestSubset(proc_adjList, this->no_procs, this->no_tasks);
1250  used_num_procs = this->no_tasks;
1251  }
1252  else {
1253  fillContinousArray<part_t>(proc_adjList,this->no_procs, NULL);
1254  }
1255 
1256  int myPermutation = myRank % permutations; //the index of the permutation
1257  bool task_partition_along_longest_dim = false;
1258  bool proc_partition_along_longest_dim = false;
1259 
1260 
1261  int myProcPerm = 0;
1262  int myTaskPerm = 0;
1263 
1264  if (myPermutation == 0){
1265  task_partition_along_longest_dim = true;
1266  proc_partition_along_longest_dim = true;
1267  }
1268  else {
1269  --myPermutation;
1270  if (myPermutation < taskPerm){
1271  proc_partition_along_longest_dim = true;
1272  myTaskPerm = myPermutation; // the index of the task permutation
1273  }
1274  else{
1275  myPermutation -= taskPerm;
1276  if (myPermutation < procPerm){
1277  task_partition_along_longest_dim = true;
1278  myProcPerm = myPermutation; // the index of the proc permutation
1279  }
1280  else {
1281  myPermutation -= procPerm;
1282  myProcPerm = myPermutation % procPerm; // the index of the proc permutation
1283  myTaskPerm = myPermutation / procPerm; // the index of the task permutation
1284  }
1285  }
1286  }
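// For example, with 3-D processor and task coordinates: taskPerm = procPerm
// = 3! = 6 and permutations = 6*6 + 6 + 6 + 1 = 49. Rank 0 partitions both
// sides along the longest dimension; ranks 1-6 combine a task permutation
// with longest-dimension processor partitioning; ranks 7-12 do the reverse;
// ranks 13-48 enumerate the 36 (task, proc) permutation pairs.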
1287 
1288 
1289 
1290 
1291  /*
1292  if (task_partition_along_longest_dim && proc_partition_along_longest_dim){
1293  std::cout <<"me:" << myRank << " task:longest proc:longest" << " numPerms:" << permutations << std::endl;
1294  }
1295  else if (proc_partition_along_longest_dim){
1296  std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:longest" << " numPerms:" << permutations << std::endl;
1297  }
1298  else if (task_partition_along_longest_dim){
1299  std::cout <<"me:" << myRank << " task: longest" << " proc:" << myProcPerm << " numPerms:" << permutations << std::endl;
1300  }
1301  else {
1302  std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:" << myProcPerm << " numPerms:" << permutations << std::endl;
1303  }
1304  */
1305 
1306 
1307 
1308  int *permutation = allocMemory<int>((this->proc_coord_dim > this->task_coord_dim)
1309  ? this->proc_coord_dim : this->task_coord_dim);
1310 
1311  //get the permutation order from the proc permutation index.
1312  ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
1313 
1314  /*
1315  //reorder the coordinate dimensions.
1316  pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
1317  for(int i = 0; i < this->proc_coord_dim; ++i){
1318  pcoords[i] = this->proc_coords[permutation[i]];
1319  //cout << permutation[i] << " ";
1320  }
1321  */
1322  int procdim = this->proc_coord_dim;
1323  pcoord_t **pcoords = this->proc_coords;
1324  /*
1325  int procdim = this->proc_coord_dim;
1326  procdim = 6;
1327  //reorder the coordinate dimensions.
1328  pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
1329  for(int i = 0; i < procdim; ++i){
1330  pcoords[i] = new pcoord_t[used_num_procs] ;//this->proc_coords[permutation[i]];
1331  }
1332 
1333  for (int k = 0; k < used_num_procs ; k++){
1334  pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
1335  pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8 ;
1336 
1337  pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
1338  pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
1339  pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
1340  pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;
1341 
1342  //if (this->proc_coords[0][k] == 40 && this->proc_coords[1][k] == 8 && this->proc_coords[2][k] == 48){
1343  if (this->proc_coords[0][k] == 5 && this->proc_coords[1][k] == 0 && this->proc_coords[2][k] == 10){
1344  std::cout << "pcoords[0][k]:" << pcoords[0][k] <<
1345  "pcoords[1][k]:" << pcoords[1][k] <<
1346  "pcoords[2][k]:" << pcoords[2][k] <<
1347  "pcoords[3][k]:" << pcoords[3][k] <<
1348  "pcoords[4][k]:" << pcoords[4][k] <<
1349  "pcoords[5][k]:" << pcoords[5][k] << std::endl;
1350  }
1351  else if ( pcoords[0][k] == 64 && pcoords[1][k] == 0 && pcoords[2][k] == 160 &&
1352  pcoords[3][k]==16 && pcoords[4][k] == 0 && pcoords[5][k] == 10){
1353  std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k] <<
1354  "this->proc_coords[1][k]:" << this->proc_coords[1][k] <<
1355  "this->proc_coords[2][k]:" << this->proc_coords[2][k] << std::endl;
1356  }
1357 
1358  }
1359  */
1360 
1361 
1362  //if (partNoArray == NULL) std::cout << "partNoArray is null" << std::endl;
1363  //std::cout << "recursion_depth:" << recursion_depth << " partArraySize:" << partArraySize << std::endl;
1364  //do the partitioning and renumber the parts.
1365  env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
1366 
1367  AlgMJ<pcoord_t, part_t, part_t, part_t> mj_partitioner;
1368  mj_partitioner.sequential_task_partitioning(
1369  env,
1370  this->no_procs,
1371  used_num_procs,
1372  num_parts,
1373  procdim,
1374  //minCoordDim,
1375  pcoords,//this->proc_coords,
1376  proc_adjList,
1377  proc_xadj,
1378  recursion_depth,
1379  partNoArray,
1380  proc_partition_along_longest_dim//, false
1383  );
1384  env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
1385  //comm_->barrier();
1386  //std::cout << "mj_partitioner.for procs over" << std::endl;
1387 
1388 
1389  //freeArray<pcoord_t *>(pcoords);
1390 
1391 
1392  part_t *task_xadj = allocMemory<part_t>(num_parts+1);
1393  part_t *task_adjList = allocMemory<part_t>(this->no_tasks);
1394  //fill task_adjList st: task_adjList[i] <- i.
1395  fillContinousArray<part_t>(task_adjList,this->no_tasks, NULL);
1396 
1397  //get the permutation order from the task permutation index.
1398  ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
1399 
1400  //reorder task coordinate dimensions.
1401  tcoord_t **tcoords = allocMemory<tcoord_t *>(this->task_coord_dim);
1402  for(int i = 0; i < this->task_coord_dim; ++i){
1403  tcoords[i] = this->task_coords[permutation[i]];
1404  }
1405 
1406 
1407  env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
1408  //partitioning of tasks
1409  mj_partitioner.sequential_task_partitioning(
1410  env,
1411  this->no_tasks,
1412  this->no_tasks,
1413  num_parts,
1414  this->task_coord_dim,
1415  //minCoordDim,
1416  tcoords, //this->task_coords,
1417  task_adjList,
1418  task_xadj,
1419  recursion_depth,
1420  partNoArray,
1421  task_partition_along_longest_dim
1424  //,"task_partitioning"
1425  //, false//(myRank == 6)
1426  );
1427  env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");
1428 
1429  //std::cout << "myrank:" << myRank << std::endl;
1430  //comm_->barrier();
1431  //std::cout << "mj_partitioner.sequential_task_partitioning over" << std::endl;
1432 
1433  freeArray<tcoord_t *>(tcoords);
1434  freeArray<int>(permutation);
1435 
1436 
1437  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1438  for(part_t i = 0; i < num_parts; ++i){
1439 
1440  part_t proc_index_begin = proc_xadj[i];
1441  part_t task_begin_index = task_xadj[i];
1442  part_t proc_index_end = proc_xadj[i+1];
1443  part_t task_end_index = task_xadj[i+1];
1444 
1445 
1446  if(proc_index_end - proc_index_begin != 1){
1447  std::cerr << "Error at partitioning of processors" << std::endl;
1448  std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
1449  exit(1);
1450  }
1451  part_t assigned_proc = proc_adjList[proc_index_begin];
1452  proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
1453  }
1454 
1455 
1456  //holds the pointer to the task array
1457  //convert proc_to_task_xadj to CSR index array
1458  part_t *proc_to_task_xadj_work = allocMemory<part_t>(this->no_procs);
1459  part_t sum = 0;
1460  for(part_t i = 0; i < this->no_procs; ++i){
1461  part_t tmp = proc_to_task_xadj[i];
1462  proc_to_task_xadj[i] = sum;
1463  sum += tmp;
1464  proc_to_task_xadj_work[i] = sum;
1465  }
1466  proc_to_task_xadj[this->no_procs] = sum;
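// For example, if proc_to_task_xadj initially held the per-processor task
// counts {2, 0, 1}, it now holds the CSR offsets {0, 2, 2, 3} while
// proc_to_task_xadj_work holds {2, 2, 3}; the fill loop below decrements
// the work array to place each task.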
1467 
1468  for(part_t i = 0; i < num_parts; ++i){
1469 
1470  part_t proc_index_begin = proc_xadj[i];
1471  part_t task_begin_index = task_xadj[i];
1472  part_t task_end_index = task_xadj[i+1];
1473 
1474  part_t assigned_proc = proc_adjList[proc_index_begin];
1475 
1476  for (part_t j = task_begin_index; j < task_end_index; ++j){
1477  part_t taskId = task_adjList[j];
1478 
1479  task_to_proc[taskId] = assigned_proc;
1480 
1481  proc_to_task_adj [ --proc_to_task_xadj_work[assigned_proc] ] = taskId;
1482  }
1483  }
1484 
1485  /*
1486  if (myPermutation == 0){
1487  std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);
1488 
1489 
1490  for(part_t i = 0; i < num_parts; ++i){
1491 
1492  part_t proc_index_begin = proc_xadj[i];
1493  part_t proc_index_end = proc_xadj[i+1];
1494 
1495 
1496  if(proc_index_end - proc_index_begin != 1){
1497  std::cerr << "Error at partitioning of processors" << std::endl;
1498  std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
1499  exit(1);
1500  }
1501 
1502  part_t assigned_proc = proc_adjList[proc_index_begin];
1503  gnuPlotCode << "Rank:" << i << " " <<
1504  this->proc_coords[0][assigned_proc] << " " << this->proc_coords[1][assigned_proc] << " " << this->proc_coords[2][assigned_proc] <<
1505  " " << pcoords[0][assigned_proc] << " " << pcoords[1][assigned_proc] <<
1506  " " << pcoords[2][assigned_proc] << " " << pcoords[3][assigned_proc] <<
1507  std::endl;
1508 
1509  }
1510 
1511 
1512  gnuPlotCode << "Machine Extent:" << std::endl;
1513  //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
1514  for(part_t i = 0; i < num_parts; ++i){
1515 
1516  part_t proc_index_begin = proc_xadj[i];
1517  part_t proc_index_end = proc_xadj[i+1];
1518 
1519 
1520  if(proc_index_end - proc_index_begin != 1){
1521  std::cerr << "Error at partitioning of processors" << std::endl;
1522  std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
1523  exit(1);
1524  }
1525 
1526  part_t assigned_proc = proc_adjList[proc_index_begin];
1527  gnuPlotCode << "Rank:" << i << " " << this->proc_coords[0][assigned_proc] << " " << this->proc_coords[1][assigned_proc] << " " << this->proc_coords[2][assigned_proc] << std::endl;
1528 
1529  }
1530  gnuPlotCode.close();
1531  }
1532  */
1533 
1534  freeArray<part_t>(proc_to_task_xadj_work);
1535  freeArray<part_t>(task_xadj);
1536  freeArray<part_t>(task_adjList);
1537  freeArray<part_t>(proc_xadj);
1538  freeArray<part_t>(proc_adjList);
1539  }
1540 
1541 };
1542 
1543 template <typename Adapter, typename part_t>
1544 class CoordinateTaskMapper:public PartitionMapping<Adapter>{
1545 protected:
1546 
1547 #ifndef DOXYGEN_SHOULD_SKIP_THIS
1548 
1549  typedef typename Adapter::scalar_t pcoord_t;
1550  typedef typename Adapter::scalar_t tcoord_t;
1551  typedef typename Adapter::scalar_t scalar_t;
1552  typedef typename Adapter::lno_t lno_t;
1553 
1554 #endif
1555 
1556  //RCP<const Environment> env;
1557  ArrayRCP<part_t> proc_to_task_xadj; // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
1558  ArrayRCP<part_t> proc_to_task_adj; // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
1559  ArrayRCP<part_t> task_to_proc; //allocMemory<part_t>(this->no_procs); //holds the processors mapped to tasks.
1560  ArrayRCP<part_t> local_task_to_rank; //allocMemory<part_t>(this->no_procs); //holds the processors mapped to tasks.
1561 
1566  ArrayRCP<part_t> task_communication_xadj;
1567  ArrayRCP<part_t> task_communication_adj;
1568  ArrayRCP<scalar_t> task_communication_edge_weight;
1569 
1570 
1573  void doMapping(int myRank, const Teuchos::RCP <const Teuchos::Comm<int> > comm_){
1574 
1575  if(this->proc_task_comm){
1576  this->proc_task_comm->getMapping(
1577  myRank,
1578  this->env,
1579  this->proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
1580  this->proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
1581  this->task_to_proc //allocMemory<part_t>(this->no_procs); //holds the processors mapped to tasks.);
1582  ,comm_
1583  );
1584  }
1585  else {
1586  std::cerr << "communicationModel is not specified in the Mapper" << std::endl;
1587  exit(1);
1588  }
1589  }
1590 
1591 
1594  RCP<Comm<int> > create_subCommunicator(){
1595  int procDim = this->proc_task_comm->proc_coord_dim;
1596  int taskDim = this->proc_task_comm->task_coord_dim;
1597 
1598  int taskPerm = z2Fact<int>(procDim); //get the number of different permutations for task dimension ordering
1599  int procPerm = z2Fact<int>(taskDim); //get the number of different permutations for proc dimension ordering
1600  int idealGroupSize = taskPerm * procPerm; //total number of permutations
1601 
1602  idealGroupSize += taskPerm + procPerm + 1; //for the one that does longest dimension partitioning.
1603 
1604  int myRank = this->comm->getRank();
1605  int commSize = this->comm->getSize();
1606 
1607  int myGroupIndex = myRank / idealGroupSize;
1608 
1609  int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1610  if (prevGroupBegin < 0) prevGroupBegin = 0;
1611  int myGroupBegin = myGroupIndex * idealGroupSize;
1612  int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1613  int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1614 
1615  if (myGroupEnd > commSize){
1616  myGroupBegin = prevGroupBegin;
1617  myGroupEnd = commSize;
1618  }
1619  if (nextGroupEnd > commSize){
1620  myGroupEnd = commSize;
1621  }
1622  int myGroupSize = myGroupEnd - myGroupBegin;
1623 
1624  part_t *myGroup = allocMemory<part_t>(myGroupSize);
1625  for (int i = 0; i < myGroupSize; ++i){
1626  myGroup[i] = myGroupBegin + i;
1627  }
1628  //cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin << " myGroupEnd:" << myGroupEnd << endl;
1629 
1630  ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1631 
1632  RCP<Comm<int> > subComm = this->comm->createSubcommunicator(myGroupView);
1633  freeArray<part_t>(myGroup);
1634  return subComm;
1635  }
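// With 3-D processor and task coordinates this gives idealGroupSize =
// 6*6 + 6 + 6 + 1 = 49, so each subcommunicator gathers the 49 ranks that
// try the 49 partitioning strategies enumerated in getMapping; a trailing
// group that would be undersized is merged with its predecessor.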
1636 
1637 
1640  void getBestMapping(){
1641  //create the sub group.
1642  RCP<Comm<int> > subComm = this->create_subCommunicator();
1643  //calculate cost.
1644  double myCost = this->proc_task_comm->getCommunicationCostMetric();
1645  //std::cout << "me:" << this->comm->getRank() << " myCost:" << myCost << std::endl;
1646  double localCost[2], globalCost[2];
1647 
1648  localCost[0] = myCost;
1649  localCost[1] = double(subComm->getRank());
1650 
1651  globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
1652  Teuchos::Zoltan2_ReduceBestMapping<int,double> reduceBest;
1653  reduceAll<int, double>(*subComm, reduceBest,
1654  2, localCost, globalCost);
1655 
1656  int sender = int(globalCost[1]);
1657 
1658  /*
1659  if ( this->comm->getRank() == 0){
1660  std::cout << "me:" << localCost[1] <<
1661  " localcost:" << localCost[0]<<
1662  " bestcost:" << globalCost[0] <<
1663  " Sender:" << sender <<
1664  " procDim" << proc_task_comm->proc_coord_dim <<
1665  " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
1666  }
1667  */
1668  //cout << "me:" << localCost[1] << " localcost:" << localCost[0]<< " bestcost:" << globalCost[0] << endl;
1669  //cout << "me:" << localCost[1] << " proc:" << globalCost[1] << endl;
1670  broadcast(*subComm, sender, this->ntasks, this->task_to_proc.getRawPtr());
1671  broadcast(*subComm, sender, this->nprocs, this->proc_to_task_xadj.getRawPtr());
1672  broadcast(*subComm, sender, this->ntasks, this->proc_to_task_adj.getRawPtr());
1673  }
1674 
1675 
1676 
1677  //write mapping to gnuPlot code to visualize.
1678  void writeMapping(){
1679  std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);
1680 
1681  int mindim = MINOF(proc_task_comm->proc_coord_dim, proc_task_comm->task_coord_dim);
1682  std::string ss = "";
1683  for(part_t i = 0; i < this->nprocs; ++i){
1684 
1685  std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
1686  if (i == 0){
1687  gnuPlotCode << "plot \"" << procFile << "\"\n";
1688  }
1689  else {
1690  gnuPlotCode << "replot \"" << procFile << "\"\n";
1691  }
1692 
1693  std::ofstream inpFile(procFile.c_str(), std::ofstream::out);
1694 
1695  std::string gnuPlotArrow = "set arrow from ";
1696  for(int j = 0; j < mindim; ++j){
1697  if (j == mindim - 1){
1698  inpFile << proc_task_comm->proc_coords[j][i];
1699  gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);
1700 
1701  }
1702  else {
1703  inpFile << proc_task_comm->proc_coords[j][i] << " ";
1704  gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]) +",";
1705  }
1706  }
1707  gnuPlotArrow += " to ";
1708 
1709 
1710  inpFile << std::endl;
1711  ArrayView<part_t> a = this->getAssignedTasksForProc(i);
1712  for(int k = 0; k < a.size(); ++k){
1713  int j = a[k];
1714  //cout << "i:" << i << " j:"
1715  std::string gnuPlotArrow2 = gnuPlotArrow;
1716  for(int z = 0; z < mindim; ++z){
1717  if(z == mindim - 1){
1718 
1719  //cout << "z:" << z << " j:" << j << " " << proc_task_comm->task_coords[z][j] << endl;
1720  inpFile << proc_task_comm->task_coords[z][j];
1721  gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
1722  }
1723  else{
1724  inpFile << proc_task_comm->task_coords[z][j] << " ";
1725  gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]) +",";
1726  }
1727  }
1728  ss += gnuPlotArrow2 + "\n";
1729  inpFile << std::endl;
1730  }
1731  inpFile.close();
1732  }
1733  gnuPlotCode << ss;
1734  gnuPlotCode << "\nreplot\n pause -1 \n";
1735  gnuPlotCode.close();
1736 
1737  }
1738 
1739 
1740  //write mapping to gnuPlot code to visualize.
1741  void writeMapping2(int myRank){
1742  std::string rankStr = Teuchos::toString<int>(myRank);
1743  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
1744  std::string outF = gnuPlots + rankStr+ extentionS;
1745  std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
1746 
1747  CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> *tmpproc_task_comm =
1748  static_cast <CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> * > (this->proc_task_comm.getRawPtr());
1749  //int mindim = MINOF(tmpproc_task_comm->proc_coord_dim, tmpproc_task_comm->task_coord_dim);
1750  int mindim = tmpproc_task_comm->proc_coord_dim;
1751  if (mindim != 3) {
1752  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
1753  return;
1754  }
1755  std::string ss = "";
1756  std::string procs = "";
1757 
1758  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
1759  for(part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank){
1760  ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
1761  if (a.size() == 0){
1762  continue;
1763  }
1764 
1765  std::string gnuPlotArrow = "set arrow from ";
1766  for(int j = 0; j < mindim; ++j){
1767  if (j == mindim - 1){
1768  gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);
1769  procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);
1770 
1771  }
1772  else {
1773  gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]) +",";
1774  procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank])+ " ";
1775  }
1776  }
1777  procs += "\n";
1778 
1779  gnuPlotArrow += " to ";
1780 
1781 
1782  for(int k = 0; k < a.size(); ++k){
1783  int origin_task = a[k];
1784 
1785  for (int nind = task_communication_xadj[origin_task]; nind < task_communication_xadj[origin_task + 1]; ++nind){
1786  int neighbor_task = task_communication_adj[nind];
1787 
1788  bool differentnode = false;
1789 
1790  int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
1791 
1792  for(int j = 0; j < mindim; ++j){
1793  if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) != int(tmpproc_task_comm->proc_coords[j][neighbor_rank])){
1794  differentnode = true; break;
1795  }
1796  }
1797  std::tuple<int,int,int, int, int, int> foo(
1798  int(tmpproc_task_comm->proc_coords[0][origin_rank]),
1799  int(tmpproc_task_comm->proc_coords[1][origin_rank]),
1800  int(tmpproc_task_comm->proc_coords[2][origin_rank]),
1801  int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
1802  int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
1803  int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
1804 
1805 
1806  if (differentnode && my_arrows.find(foo) == my_arrows.end()){
1807  my_arrows.insert(foo);
1808 
1809  std::string gnuPlotArrow2 = "";
1810  for(int j = 0; j < mindim; ++j){
1811  if(j == mindim - 1){
1812  gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]);
1813  }
1814  else{
1815  gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]) +",";
1816  }
1817  }
1818  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
1819  }
1820  }
1821  }
1822 
1823  }
1824 
1825 
1826  std::ofstream procFile("procPlot.plot", std::ofstream::out);
1827  procFile << procs << "\n";
1828  procFile.close();
1829 
1830  //gnuPlotCode << ss;
1831  if(mindim == 2){
1832  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
1833  } else {
1834  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
1835  }
1836 
1837  gnuPlotCode << ss << "\nreplot\n pause -1 \n";
1838  gnuPlotCode.close();
1839  }
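 // For illustration: a writeMapping2 run over a 3-D machine yields a gnuplot
 // script shaped roughly like the sketch below (coordinates are made up).
 // procPlot.plot holds one machine coordinate per line; each arrow connects
 // two nodes whose tasks communicate.
 //
 //   splot "procPlot.plot" with points pointsize 3
 //   set arrow from 0,0,0 to 1,0,0 nohead
 //   set arrow from 1,2,0 to 1,2,1 nohead
 //
 //   replot
 //    pause -1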
1840 
1841 
1842 // KDD Need to provide access to algorithm for getPartBoxes
1843 #ifdef gnuPlot
1844  void writeGnuPlot(
1845  const Teuchos::Comm<int> *comm_,
1846  const Zoltan2::PartitioningSolution<Adapter> *soln_,
1847  int coordDim,
1848  tcoord_t **partCenters
1849  ){
1850  std::string file = "gggnuPlot";
1851  std::string exten = ".plot";
1852  std::ofstream mm("2d.txt");
1853  file += Teuchos::toString<int>(comm_->getRank()) + exten;
1854  std::ofstream ff(file.c_str());
1855  //ff.seekg(0, ff.end);
1856  std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> > outPartBoxes = ((Zoltan2::PartitioningSolution<Adapter> *)soln_)->getPartBoxesView();
1857 
1858  for (part_t i = 0; i < this->ntasks;++i){
1859  outPartBoxes[i].writeGnuPlot(ff, mm);
1860  }
1861  if (coordDim == 2){
1862  ff << "plot \"2d.txt\"" << std::endl;
1863  //ff << "\n pause -1" << endl;
1864  }
1865  else {
1866  ff << "splot \"2d.txt\"" << std::endl;
1867  //ff << "\n pause -1" << endl;
1868  }
1869  mm.close();
1870 
1871  ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1" << std::endl;
1872  for (part_t i = 0; i < this->ntasks;++i){
1873  part_t pb = task_communication_xadj[i];
1874  part_t pe = task_communication_xadj[i+1];
1875  for (part_t p = pb; p < pe; ++p){
1876  part_t n = task_communication_adj[p];
1877 
1878  //cout << "i:" << i << " n:" << n << endl;
1879  std::string arrowline = "set arrow from ";
1880  for (int j = 0; j < coordDim - 1; ++j){
1881  arrowline += Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
1882  }
1883  arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim -1][n]) + " to ";
1884 
1885 
1886  for (int j = 0; j < coordDim - 1; ++j){
1887  arrowline += Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
1888  }
1889  arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim -1][i]) + " as 5\n";
1890 
1891  //cout << "arrow:" << arrowline << endl;
1892  ff << arrowline;
1893  }
1894  }
1895 
1896  ff << "replot\n pause -1" << std::endl;
1897  ff.close();
1898  }
1899 #endif // gnuPlot
1900 
1901 public:
1902 
1903  void getProcTask(part_t* &proc_to_task_xadj_, part_t* &proc_to_task_adj_){
1904  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
1905  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
1906  }
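 // A minimal usage sketch for getProcTask; `ctm`, `nprocs` and `ntasks` are
 // hypothetical names for an existing mapper and its input sizes:
 //
 //   part_t *xadj, *adj;
 //   ctm->getProcTask(xadj, adj);   // views, not copies; do not free
 //   for (part_t p = 0; p < nprocs; ++p)
 //     for (part_t t = xadj[p]; t < xadj[p + 1]; ++t)
 //       std::cout << "rank " << p << " runs task " << adj[t] << std::endl;
 //
 // The raw pointers stay owned by the mapper (ArrayRCP::getRawPtr), so they
 // are only valid while the mapper object is alive.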
1907 
1908  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
1909 
1910  // Mapping was already computed in the constructor; we need to store it
1911  // in the solution.
1912  mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
1913 
1914  // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
1915  // KDDKDD is not needed once we use MappingSolution to answer queries
1916  // KDDKDD instead of this algorithm.
1917  // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
1918  // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
1919  }
1920 
1921 
1922  virtual ~CoordinateTaskMapper(){
1923  //freeArray<part_t>(proc_to_task_xadj);
1924  //freeArray<part_t>(proc_to_task_adj);
1925  //freeArray<part_t>(task_to_proc);
1926  if(this->isOwnerofModel){
1927  delete this->proc_task_comm;
1928  }
1929  }
1930 
1931  void create_local_task_to_rank(
1932  const lno_t num_local_coords,
1933  const part_t *local_coord_parts,
1934  const ArrayRCP<part_t> task_to_proc_){
1935  local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
1936 
1937  for (lno_t i = 0; i < num_local_coords; ++i){
1938  part_t local_coord_part = local_coord_parts[i];
1939  part_t rank_index = task_to_proc_[local_coord_part];
1940  local_task_to_rank[i] = rank_index;
1941  }
1942  }
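 // A small worked example with made-up values: if the local coordinates
 // belong to parts {2, 0, 2}, and task_to_proc_ maps part 0 -> rank 5 and
 // part 2 -> rank 3, then local_task_to_rank becomes {3, 5, 3}; each local
 // coordinate is tagged with the rank that its part was assigned to.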
1943 
1944 
1945 
1945  /*! \brief Constructor.
1946  * When this constructor is called, in order to calculate the communication
1947  * metric, the task adjacency graph is created based on the coordinate model
1948  * input and its partitioning. If the communication graph is already
1949  * calculated, use the other constructors.
1950  * \param comm_ is the communication object.
1951  * \param machine_ is the machineRepresentation object, storing the coordinates of the machine nodes.
1952  * \param input_adapter_ is the input adapter.
1953  * \param soln_ is the solution object, holding the assignment of points to parts.
1954  * \param envConst is the environment object.
1955  */
1956  CoordinateTaskMapper(
1957  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
1958  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
1959  const Teuchos::RCP <const Adapter> input_adapter_,
1960  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> > soln_,
1961  const Teuchos::RCP <const Environment> envConst,
1962  bool is_input_adapter_distributed = true,
1963  int num_ranks_per_node = 1,
1964  bool divide_to_prime_first = false, bool reduce_best_mapping = true):
1965  PartitionMapping<Adapter>(comm_, machine_, input_adapter_, soln_, envConst),
1966  proc_to_task_xadj(0),
1967  proc_to_task_adj(0),
1968  task_to_proc(0),
1969  isOwnerofModel(true),
1970  proc_task_comm(0),
1971  task_communication_xadj(0),
1972  task_communication_adj(0),
1973  task_communication_edge_weight(0){
1974  using namespace Teuchos;
1975  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
1976 
1977  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
1978  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
1979 
1980  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
1981  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
1982  if (!is_input_adapter_distributed){
1983  ia_comm = Teuchos::createSerialComm<int>();
1984  }
1985 
1986  RCP<const Environment> envConst_ = envConst;
1987 
1988  RCP<const ctm_base_adapter_t> baseInputAdapter_(
1989  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
1990 
1991  modelFlag_t coordFlags_, graphFlags_;
1992 
1993  //create coordinate model
1994  //since this is coordinate task mapper,
1995  //the adapter has to have the coordinates
1996  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
1997  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
1998 
1999  //if the adapter has also graph model, we will use graph model
2000  //to calculate the cost mapping.
2001  BaseAdapterType inputType_ = input_adapter_->adapterType();
2002  if (inputType_ == MatrixAdapterType ||
2003  inputType_ == GraphAdapterType ||
2004  inputType_ == MeshAdapterType)
2005  {
2006  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2007  baseInputAdapter_, envConst_, ia_comm,
2008  graphFlags_));
2009  }
2010 
2011  if (!machine_->hasMachineCoordinates()) {
2012  throw std::runtime_error("Existing machine does not provide coordinates "
2013  "for coordinate task mapping");
2014  }
2015 
2016  //if mapping type is 0 then it is coordinate mapping
2017  int procDim = machine_->getMachineDim();
2018  this->nprocs = machine_->getNumRanks();
2019 
2020  //get processor coordinates.
2021  pcoord_t **procCoordinates = NULL;
2022  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2023  throw std::runtime_error("Existing machine does not implement "
2024  "getAllMachineCoordinatesView");
2025  }
2026 
2027  //get the machine extent.
2028  //if we have the machine extent and the machine has wrap-around links,
2029  //we would like to shift the coordinates so that
2030  //the largest gap falls on the wrap-around link.
2031  std::vector<int> machine_extent_vec(procDim);
2032  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2033  int *machine_extent = &(machine_extent_vec[0]);
2034  bool *machine_extent_wrap_around = new bool[procDim];
2035  for (int i = 0; i < procDim; ++i) machine_extent_wrap_around[i] = false;
2036  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2037 
2038 
2039 
2040  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2041  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2042  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2043  // MD: Yes, I ADDED BELOW:
2044  if (machine_->getMachineExtent(machine_extent)) {
2045  procCoordinates =
2046  this->shiftMachineCoordinates(
2047  procDim,
2048  machine_extent,
2049  machine_extent_wrap_around,
2050  this->nprocs,
2051  procCoordinates);
2052  }
2053 
2054 
2055  //get the tasks information, such as coordinate dimension,
2056  //number of parts.
2057  int coordDim = coordinateModel_->getCoordinateDim();
2058 
2059 
2060  this->ntasks = soln_->getActualGlobalNumberOfParts();
2061  if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks){
2062  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2063  }
2064  this->solution_parts = soln_->getPartListView();
2065 
2066  //we need to calculate the center of parts.
2067  tcoord_t **partCenters = NULL;
2068  partCenters = allocMemory<tcoord_t *>(coordDim);
2069  for (int i = 0; i < coordDim; ++i){
2070  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2071  }
2072 
2073  typedef typename Adapter::scalar_t t_scalar_t;
2074 
2075 
2076  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2077 
2078  //get centers for the parts.
2079  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2080  envConst.getRawPtr(),
2081  ia_comm.getRawPtr(),
2082  coordinateModel_.getRawPtr(),
2083  this->solution_parts,
2084  //soln_->getPartListView();
2085  //this->soln.getRawPtr(),
2086  coordDim,
2087  ntasks,
2088  partCenters);
2089 
2090  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2091 
2092 
2093  //create the part graph
2094  if (graph_model_.getRawPtr() != NULL){
2095  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2096  envConst.getRawPtr(),
2097  ia_comm.getRawPtr(),
2098  graph_model_.getRawPtr(),
2099  this->ntasks,
2100  this->solution_parts,
2101  //soln_->getPartListView(),
2102  //this->soln.getRawPtr(),
2103  task_communication_xadj,
2104  task_communication_adj,
2105  task_communication_edge_weight
2106  );
2107  }
2108 
2109  //create coordinate communication model.
2110  this->proc_task_comm =
2111  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
2112  procDim,
2113  procCoordinates,
2114  coordDim,
2115  partCenters,
2116  this->nprocs,
2117  this->ntasks,
2118  machine_extent,
2119  machine_extent_wrap_around,
2120  machine_.getRawPtr());
2121 
2122  int myRank = comm_->getRank();
2123  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2124  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2125 
2126 
2127  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2128  this->doMapping(myRank, comm_);
2129  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2130 
2131 
2132  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2133 
2134  /*soln_->getCommunicationGraph(task_communication_xadj,
2135  task_communication_adj);
2136  */
2137 
2138  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2139  #ifdef gnuPlot1
2140  if (comm_->getRank() == 0){
2141 
2142  part_t taskCommCount = task_communication_xadj.size() - 1; //xadj has ntasks+1 entries
2143  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
2144  part_t maxN = task_communication_xadj[0];
2145  for (part_t i = 1; i <= taskCommCount; ++i){
2146  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
2147  if (maxN < nc) maxN = nc;
2148  }
2149  std::cout << " maxNeighbor:" << maxN << std::endl;
2150  }
2151 
2152  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2153  #endif
2154 
2155  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2156 
2157  if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2158  this->proc_task_comm->calculateCommunicationCost(
2159  task_to_proc.getRawPtr(),
2160  task_communication_xadj.getRawPtr(),
2161  task_communication_adj.getRawPtr(),
2162  task_communication_edge_weight.getRawPtr()
2163  );
2164  }
2165 
2166  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2167 
2168  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2169 
2170  //processors are divided into groups of size procDim! * coordDim!
2171  //each processor in the group obtains a mapping with a different rotation,
2172  //and the best one is broadcast to all processors.
2173  this->getBestMapping();
2174  create_local_task_to_rank(
2175  coordinateModel_->getLocalNumCoordinates(),
2176  this->solution_parts,
2177  this->task_to_proc);
2178  /*
2179  {
2180  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
2181  this->proc_task_comm->calculateCommunicationCost(
2182  task_to_proc.getRawPtr(),
2183  task_communication_xadj.getRawPtr(),
2184  task_communication_adj.getRawPtr(),
2185  task_communication_edge_weight.getRawPtr()
2186  );
2187  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2188  }
2189  */
2190 
2191 
2192  #ifdef gnuPlot
2193  this->writeMapping2(comm_->getRank());
2194  #endif
2195 
2196  delete []machine_extent_wrap_around;
2197  if (machine_->getMachineExtent(machine_extent)){
2198  for (int i = 0; i < procDim; ++i){
2199  delete [] procCoordinates[i];
2200  }
2201  delete [] procCoordinates;
2202  }
2203 
2204  for (int i = 0; i < coordDim; ++i){
2205  freeArray<tcoord_t>(partCenters[i]);
2206  }
2207  freeArray<tcoord_t *>(partCenters);
2208 
2209  }
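 // A construction sketch for this constructor; the adapter, solution and
 // machine names are illustrative and assume a partitioning problem has
 // already been solved:
 //
 //   Teuchos::RCP<const MachineRepresentation<pcoord_t, part_t> > machine = ...;
 //   CoordinateTaskMapper<my_adapter_t, part_t> mapper(
 //       comm, machine, input_adapter, solution, env,
 //       true,   // is_input_adapter_distributed
 //       1,      // num_ranks_per_node
 //       false,  // divide_to_prime_first
 //       true);  // reduce_best_mapping
 //   part_t owner = mapper.getAssignedProcForTask(some_part);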
2210 
2211 
2212  /*! \brief Constructor. Instead of a Solution, this version takes the
2213  * number of parts and the part assignment array directly.
2214  * When this constructor is called, in order to calculate the communication
2215  * metric, the task adjacency graph is created based on the coordinate model
2216  * input and its partitioning. If the communication graph is already
2217  * calculated, use the other constructors.
2218  * \param comm_ is the communication object.
2219  * \param machine_ is the machineRepresentation object.
2220  * \param input_adapter_ is the input adapter.
2221  */
2222  CoordinateTaskMapper(
2223  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2224  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
2225  const Teuchos::RCP <const Adapter> input_adapter_,
2226  const part_t num_parts_,
2227  const part_t *result_parts,
2228  const Teuchos::RCP <const Environment> envConst,
2229  bool is_input_adapter_distributed = true,
2230  int num_ranks_per_node = 1,
2231  bool divide_to_prime_first = false, bool reduce_best_mapping = true):
2232  PartitionMapping<Adapter>(comm_, machine_, input_adapter_, num_parts_, result_parts, envConst),
2233  proc_to_task_xadj(0),
2234  proc_to_task_adj(0),
2235  task_to_proc(0),
2236  isOwnerofModel(true),
2237  proc_task_comm(0),
2238  task_communication_xadj(0),
2239  task_communication_adj(0),
2240  task_communication_edge_weight(0){
2241  using namespace Teuchos;
2242  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2243 
2244  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2245  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2246 
2247  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2248  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2249  if (!is_input_adapter_distributed){
2250  ia_comm = Teuchos::createSerialComm<int>();
2251  }
2252  RCP<const Environment> envConst_ = envConst;
2253 
2254  RCP<const ctm_base_adapter_t> baseInputAdapter_(
2255  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
2256 
2257  modelFlag_t coordFlags_, graphFlags_;
2258 
2259  //create coordinate model
2260  //since this is coordinate task mapper,
2261  //the adapter has to have the coordinates
2262  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2263  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2264 
2265  //if the adapter has also graph model, we will use graph model
2266  //to calculate the cost mapping.
2267  BaseAdapterType inputType_ = input_adapter_->adapterType();
2268  if (inputType_ == MatrixAdapterType ||
2269  inputType_ == GraphAdapterType ||
2270  inputType_ == MeshAdapterType)
2271  {
2272  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2273  baseInputAdapter_, envConst_, ia_comm,
2274  graphFlags_));
2275  }
2276 
2277  if (!machine_->hasMachineCoordinates()) {
2278  throw std::runtime_error("Existing machine does not provide coordinates "
2279  "for coordinate task mapping");
2280  }
2281 
2282  //if mapping type is 0 then it is coordinate mapping
2283  int procDim = machine_->getMachineDim();
2284  this->nprocs = machine_->getNumRanks();
2285 
2286  //get processor coordinates.
2287  pcoord_t **procCoordinates = NULL;
2288  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2289  throw std::runtime_error("Existing machine does not implement "
2290  "getAllMachineCoordinatesView");
2291  }
2292 
2293  //get the machine extent.
2294  //if we have the machine extent and the machine has wrap-around links,
2295  //we would like to shift the coordinates so that
2296  //the largest gap falls on the wrap-around link.
2297  std::vector<int> machine_extent_vec(procDim);
2298  //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2299  int *machine_extent = &(machine_extent_vec[0]);
2300  bool *machine_extent_wrap_around = new bool[procDim];
2301  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2302 
2303 
2304 
2305  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2306  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2307  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2308  // MD: Yes, I ADDED BELOW:
2309  if (machine_->getMachineExtent(machine_extent)) {
2310  procCoordinates =
2311  this->shiftMachineCoordinates(
2312  procDim,
2313  machine_extent,
2314  machine_extent_wrap_around,
2315  this->nprocs,
2316  procCoordinates);
2317  }
2318 
2319 
2320  //get the tasks information, such as coordinate dimension,
2321  //number of parts.
2322  int coordDim = coordinateModel_->getCoordinateDim();
2323 
2324 
2325  this->ntasks = num_parts_;
2326  this->solution_parts = result_parts;
2327 
2328  //we need to calculate the center of parts.
2329  tcoord_t **partCenters = NULL;
2330  partCenters = allocMemory<tcoord_t *>(coordDim);
2331  for (int i = 0; i < coordDim; ++i){
2332  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2333  }
2334 
2335  typedef typename Adapter::scalar_t t_scalar_t;
2336 
2337 
2338  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2339 
2340  //get centers for the parts.
2341  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2342  envConst.getRawPtr(),
2343  ia_comm.getRawPtr(),
2344  coordinateModel_.getRawPtr(),
2345  this->solution_parts,
2346  //soln_->getPartListView();
2347  //this->soln.getRawPtr(),
2348  coordDim,
2349  ntasks,
2350  partCenters);
2351 
2352  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2353 
2354  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2355  //create the part graph
2356  if (graph_model_.getRawPtr() != NULL){
2357  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2358  envConst.getRawPtr(),
2359  ia_comm.getRawPtr(),
2360  graph_model_.getRawPtr(),
2361  this->ntasks,
2362  this->solution_parts,
2363  //soln_->getPartListView(),
2364  //this->soln.getRawPtr(),
2365  task_communication_xadj,
2366  task_communication_adj,
2367  task_communication_edge_weight
2368  );
2369  }
2370  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2371 
2372 
2373  envConst->timerStart(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2374  //create coordinate communication model.
2375  this->proc_task_comm =
2376  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
2377  procDim,
2378  procCoordinates,
2379  coordDim,
2380  partCenters,
2381  this->nprocs,
2382  this->ntasks,
2383  machine_extent,
2384  machine_extent_wrap_around,
2385  machine_.getRawPtr());
2386 
2387  envConst->timerStop(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2388 
2389 
2390  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2391  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2392 
2393  int myRank = comm_->getRank();
2394 
2395 
2396  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2397  this->doMapping(myRank, comm_);
2398  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2399 
2400 
2401  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2402 
2403  /*soln_->getCommunicationGraph(task_communication_xadj,
2404  task_communication_adj);
2405  */
2406 
2407  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2408  #ifdef gnuPlot1
2409  if (comm_->getRank() == 0){
2410 
2411  part_t taskCommCount = task_communication_xadj.size() - 1; //xadj has ntasks+1 entries
2412  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
2413  part_t maxN = task_communication_xadj[0];
2414  for (part_t i = 1; i <= taskCommCount; ++i){
2415  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
2416  if (maxN < nc) maxN = nc;
2417  }
2418  std::cout << " maxNeighbor:" << maxN << std::endl;
2419  }
2420 
2421  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2422  #endif
2423 
2424  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2425 
2426  if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2427  this->proc_task_comm->calculateCommunicationCost(
2428  task_to_proc.getRawPtr(),
2429  task_communication_xadj.getRawPtr(),
2430  task_communication_adj.getRawPtr(),
2431  task_communication_edge_weight.getRawPtr()
2432  );
2433  }
2434 
2435  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2436 
2437  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2438 
2439  //processors are divided into groups of size procDim! * coordDim!
2440  //each processor in the group obtains a mapping with a different rotation,
2441  //and the best one is broadcast to all processors.
2442  this->getBestMapping();
2443 
2444  create_local_task_to_rank(
2445  coordinateModel_->getLocalNumCoordinates(),
2446  this->solution_parts,
2447  this->task_to_proc);
2448  /*
2449 
2450  {
2451  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
2452  this->proc_task_comm->calculateCommunicationCost(
2453  task_to_proc.getRawPtr(),
2454  task_communication_xadj.getRawPtr(),
2455  task_communication_adj.getRawPtr(),
2456  task_communication_edge_weight.getRawPtr()
2457  );
2458  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2459  }
2460  */
2461 
2462 
2463 
2464  #ifdef gnuPlot
2465  this->writeMapping2(comm_->getRank());
2466  #endif
2467 
2468  delete []machine_extent_wrap_around;
2469  if (machine_->getMachineExtent(machine_extent)){
2470  for (int i = 0; i < procDim; ++i){
2471  delete [] procCoordinates[i];
2472  }
2473  delete [] procCoordinates;
2474  }
2475 
2476  for (int i = 0; i < coordDim; ++i){
2477  freeArray<tcoord_t>(partCenters[i]);
2478  }
2479  freeArray<tcoord_t *>(partCenters);
2480 
2481  }
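 // Sketch for this variant (hypothetical names), which takes the part
 // assignment directly instead of a PartitioningSolution; result_parts[i]
 // is the part of the i-th local coordinate:
 //
 //   CoordinateTaskMapper<my_adapter_t, part_t> mapper(
 //       comm, machine, input_adapter, num_parts, result_parts, env);
 //
 // This is useful when the partition was computed outside of Zoltan2's
 // PartitioningSolution.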
2482 
2483  /*! \brief Constructor
2484  * The mapping constructor which will also perform the mapping operation.
2485  * The result mapping can be obtained by
2486  * --getAssignedProcForTask function: returns the assigned processor id for the given task,
2487  * --getPartsForProc function: returns the assigned tasks with the number of tasks.
2488  *
2489  * -task_comm_xadj, task_comm_adj, task_communication_edge_weight_ can be provided as NULL.
2490  * In this case all processors will calculate the same mapping.
2491  * -If task_comm_xadj and task_comm_adj are provided, the algorithm will perform rotations,
2492  * processors will calculate different mappings, and the best one will be reduced.
2493  * -If task_communication_edge_weight_ is provided together with task_comm_xadj and
2494  * task_comm_adj, it will be used when the cost is calculated.
2495  */
2529  CoordinateTaskMapper(
2530  const Environment *env_const_,
2531  const Teuchos::Comm<int> *problemComm,
2532  int proc_dim,
2533  int num_processors,
2534  pcoord_t **machine_coords,
2535 
2536  int task_dim,
2537  part_t num_tasks,
2538  tcoord_t **task_coords,
2539  ArrayRCP<part_t>task_comm_xadj,
2540  ArrayRCP<part_t>task_comm_adj,
2541  pcoord_t *task_communication_edge_weight_,
2542  int recursion_depth,
2543  part_t *part_no_array,
2544  const part_t *machine_dimensions,
2545  int num_ranks_per_node = 1,
2546  bool divide_to_prime_first = false, bool reduce_best_mapping = true
2547  ): PartitionMapping<Adapter>(
2548  Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2549  Teuchos::rcpFromRef<const Environment>(*env_const_)),
2550  proc_to_task_xadj(0),
2551  proc_to_task_adj(0),
2552  task_to_proc(0),
2553  isOwnerofModel(true),
2554  proc_task_comm(0),
2555  task_communication_xadj(task_comm_xadj),
2556  task_communication_adj(task_comm_adj){
2557 
2558  //if mapping type is 0 then it is coordinate mapping
2559  pcoord_t ** virtual_machine_coordinates = machine_coords;
2560  bool *wrap_arounds = new bool [proc_dim];
2561  for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2562 
2563  if (machine_dimensions){
2564  virtual_machine_coordinates =
2565  this->shiftMachineCoordinates(
2566  proc_dim,
2567  machine_dimensions,
2568  wrap_arounds,
2569  num_processors,
2570  machine_coords);
2571  }
2572 
2573  this->nprocs = num_processors;
2574 
2575  int coordDim = task_dim;
2576  this->ntasks = num_tasks;
2577 
2578  //alloc memory for part centers.
2579  tcoord_t **partCenters = task_coords;
2580 
2581  //create coordinate communication model.
2582  this->proc_task_comm =
2583  new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
2584  proc_dim,
2585  virtual_machine_coordinates,
2586  coordDim,
2587  partCenters,
2588  this->nprocs,
2589  this->ntasks, NULL, NULL
2590  );
2591 
2592  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2593  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2594 
2595  this->proc_task_comm->setPartArraySize(recursion_depth);
2596  this->proc_task_comm->setPartArray(part_no_array);
2597 
2598  int myRank = problemComm->getRank();
2599 
2600  this->doMapping(myRank, this->comm);
2601 #ifdef gnuPlot
2602  this->writeMapping2(myRank);
2603 #endif
2604 
2605  if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2606  this->proc_task_comm->calculateCommunicationCost(
2607  task_to_proc.getRawPtr(),
2608  task_communication_xadj.getRawPtr(),
2609  task_communication_adj.getRawPtr(),
2610  task_communication_edge_weight_
2611  );
2612 
2613 
2614  this->getBestMapping();
2615 
2616 /*
2617  if (myRank == 0){
2618  this->proc_task_comm->calculateCommunicationCost(
2619  task_to_proc.getRawPtr(),
2620  task_communication_xadj.getRawPtr(),
2621  task_communication_adj.getRawPtr(),
2622  task_communication_edge_weight_
2623  );
2624  cout << "me: " << problemComm->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << endl;
2625  }
2626 */
2627 
2628  }
2629  delete [] wrap_arounds;
2630 
2631  if (machine_dimensions){
2632  for (int i = 0; i < proc_dim; ++i){
2633  delete [] virtual_machine_coordinates[i];
2634  }
2635  delete [] virtual_machine_coordinates;
2636  }
2637 #ifdef gnuPlot
2638  if(problemComm->getRank() == 0)
2639  this->writeMapping2(-1);
2640 #endif
2641  }
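 // For raw-array inputs, the free function coordinateTaskMapperInterface
 // below wraps this constructor; see the call sketch that follows it.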
2642 
2643 
2644  /*
2645  double getCommunicationCostMetric(){
2646  return this->proc_task_comm->getCommCost();
2647  }
2648  */
2649 
2650  /*! \brief Returns the number of parts to be assigned to this process.
2651  */
2652  virtual size_t getLocalNumberOfParts() const{
2653  return 0;
2654  }
2655 
2656  /*! \brief Using the machine dimensions provided, create virtual machine
2657  * coordinates by assigning the largest gap along each dimension to the
2658  * wrap-around link.
2659  * \param machine_dim: the number of dimensions in the machine network.
2660  * \param machine_dimensions: the extent of the machine network along each dimension.
2661  * \param numProcs: the number of allocated processors.
2662  * \param mCoords: the machine coordinates.
2663  */
2664  pcoord_t **shiftMachineCoordinates(
2665  int machine_dim,
2666  const part_t *machine_dimensions,
2667  bool *machine_extent_wrap_around,
2668  part_t numProcs,
2669  pcoord_t **mCoords){
2670  pcoord_t **result_machine_coords = NULL;
2671  result_machine_coords = new pcoord_t*[machine_dim];
2672  for (int i = 0; i < machine_dim; ++i){
2673  result_machine_coords[i] = new pcoord_t [numProcs];
2674  }
2675 
2676  for (int i = 0; i < machine_dim; ++i){
2677  part_t numMachinesAlongDim = machine_dimensions[i];
2678  part_t *machineCounts= new part_t[numMachinesAlongDim];
2679  memset(machineCounts, 0, sizeof(part_t) *numMachinesAlongDim);
2680 
2681  int *filledCoordinates= new int[numMachinesAlongDim];
2682 
2683  pcoord_t *coords = mCoords[i];
2684  for(part_t j = 0; j < numProcs; ++j){
2685  part_t mc = (part_t) coords[j];
2686  ++machineCounts[mc];
2687  }
2688 
2689  part_t filledCoordinateCount = 0;
2690  for(part_t j = 0; j < numMachinesAlongDim; ++j){
2691  if (machineCounts[j] > 0){
2692  filledCoordinates[filledCoordinateCount++] = j;
2693  }
2694  }
2695 
2696  part_t firstProcCoord = filledCoordinates[0];
2697  part_t firstProcCount = machineCounts[firstProcCoord];
2698 
2699  part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
2700  part_t lastProcCount = machineCounts[lastProcCoord];
2701 
2702  part_t firstLastGap = numMachinesAlongDim - lastProcCoord + firstProcCoord;
2703  part_t firstLastGapProc = lastProcCount + firstProcCount;
2704 
2705  part_t leftSideProcCoord = firstProcCoord;
2706  part_t leftSideProcCount = firstProcCount;
2707  part_t biggestGap = 0;
2708  part_t biggestGapProc = numProcs;
2709 
2710  part_t shiftBorderCoordinate = -1;
2711  for(part_t j = 1; j < filledCoordinateCount; ++j){
2712  part_t rightSideProcCoord= filledCoordinates[j];
2713  part_t rightSideProcCount = machineCounts[rightSideProcCoord];
2714 
2715  part_t gap = rightSideProcCoord - leftSideProcCoord;
2716  part_t gapProc = rightSideProcCount + leftSideProcCount;
2717 
2718  /* Pick the largest gap in this dimension. Break ties by preferring the gap
2719  with fewer processes on either side. An easy extension would be
2720  to weight the gap by the number of processes. */
2721  if (gap > biggestGap || (gap == biggestGap && biggestGapProc > gapProc)){
2722  shiftBorderCoordinate = rightSideProcCoord;
2723  biggestGapProc = gapProc;
2724  biggestGap = gap;
2725  }
2726  leftSideProcCoord = rightSideProcCoord;
2727  leftSideProcCount = rightSideProcCount;
2728  }
2729 
2730 
2731  if (!(biggestGap > firstLastGap || (biggestGap == firstLastGap && biggestGapProc < firstLastGapProc))){
2732  shiftBorderCoordinate = -1;
2733  }
2734 
2735  for(part_t j = 0; j < numProcs; ++j){
2736 
2737  if (machine_extent_wrap_around[i] && coords[j] < shiftBorderCoordinate){
2738  result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
2739 
2740  }
2741  else {
2742  result_machine_coords[i][j] = coords[j];
2743  }
2744  //cout << "I:" << i << "j:" << j << " coord:" << coords[j] << " now:" << result_machine_coords[i][j] << endl;
2745  }
2746  delete [] machineCounts;
2747  delete [] filledCoordinates;
2748  }
2749 
2750  return result_machine_coords;
2751 
2752  }
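 // A worked example of the shift with made-up numbers: one dimension has
 // extent 8 and the allocated ranks sit at coordinates {0, 1, 6, 7}. The
 // largest interior gap (between 1 and 6, size 5) beats the first-last gap
 // (8 - 7 + 0 = 1), so shiftBorderCoordinate becomes 6, and with wrap-around
 // enabled every coordinate below 6 is shifted by +8: {8, 9, 6, 7}. The
 // ranks are now contiguous (6..9) and the wrap-around link carries the gap.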
2753 
2754  /*! \brief getProcsForPart function: returns the assigned processors
2755  * for the given task, together with their count.
2756  */
2760  virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const{
2761  numProcs = 1;
2762  procs = this->task_to_proc.getRawPtr() + taskId;
2763  }
2764  /*! \brief getAssignedProcForTask function: returns the assigned
2765  * processor id for the given task.
2766  */
2767  virtual part_t getAssignedProcForTask(part_t taskId){
2768  return this->task_to_proc[taskId];
2769  }
2770  /*! \brief getPartsForProc function: returns the assigned tasks
2771  * for the given processor, together with their count.
2772  */
2776  virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const{
2777 
2778  part_t task_begin = this->proc_to_task_xadj[procId];
2779  part_t taskend = this->proc_to_task_xadj[procId+1];
2780  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
2781  numParts = taskend - task_begin;
2782  }
2783 
2784  ArrayView<part_t> getAssignedTasksForProc(part_t procId){
2785  part_t task_begin = this->proc_to_task_xadj[procId];
2786  part_t taskend = this->proc_to_task_xadj[procId+1];
2787 
2788  /*
2789  cout << "part_t:" << procId << " taskCount:" << taskend - task_begin << endl;
2790  for(part_t i = task_begin; i < taskend; ++i){
2791  cout << "part_t:" << procId << " task:" << proc_to_task_adj[i] << endl;
2792  }
2793  */
2794  if (taskend - task_begin > 0){
2795  ArrayView <part_t> assignedParts(this->proc_to_task_adj.getRawPtr() + task_begin, taskend - task_begin);
2796  return assignedParts;
2797  }
2798  else {
2799  ArrayView <part_t> assignedParts;
2800  return assignedParts;
2801  }
2802  }
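 // Query sketch (illustrative): once the mapping is computed, the result can
 // be inspected task-by-task or rank-by-rank:
 //
 //   part_t owner = mapper.getAssignedProcForTask(t);            // task -> rank
 //   ArrayView<part_t> mine = mapper.getAssignedTasksForProc(p); // rank -> tasks
 //   for (int k = 0; k < mine.size(); ++k)
 //     std::cout << "task " << mine[k] << " -> rank " << p << std::endl;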
2803 
2804 };
2805 
2806 
2807 
2808  /*! \brief The interface function that calls CoordinateTaskMapper and
2809  * performs the mapping operation. The resulting mapping is written to the
2810  * proc_to_task_xadj and proc_to_task_adj output arrays.
2811  */
2876 template <typename part_t, typename pcoord_t, typename tcoord_t>
2877 void coordinateTaskMapperInterface(
2878  RCP<const Teuchos::Comm<int> > problemComm,
2879  int proc_dim,
2880  int num_processors,
2881  pcoord_t **machine_coords,
2882  int task_dim,
2883  part_t num_tasks,
2884  tcoord_t **task_coords,
2885  part_t *task_comm_xadj,
2886  part_t *task_comm_adj,
2887  pcoord_t *task_communication_edge_weight_, /*float-like, same size as task_communication_adj_; the weight of the corresponding edge.*/
2888  part_t *proc_to_task_xadj, /*output*/
2889  part_t *proc_to_task_adj, /*output*/
2890  int recursion_depth,
2891  part_t *part_no_array,
2892  const part_t *machine_dimensions,
2893  int num_ranks_per_node = 1,
2894  bool divide_to_prime_first = false
2895 )
2896 {
2897 
2898  const Environment *envConst_ = new Environment(problemComm);
2899 
2900  // mfh 03 Mar 2015: It's OK to omit the Node template
2901  // parameter in Tpetra, if you're just going to use the
2902  // default Node.
2903  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
2904 
2905  Teuchos::ArrayRCP<part_t> task_communication_xadj(task_comm_xadj, 0, num_tasks+1, false);
2906 
2907  Teuchos::ArrayRCP<part_t> task_communication_adj;
2908  if (task_comm_xadj){
2909  Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(task_comm_adj, 0, task_comm_xadj[num_tasks], false);
2910  task_communication_adj = tmp_task_communication_adj;
2911  }
2912 
2913 
2914  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
2915  new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
2916  envConst_,
2917  problemComm.getRawPtr(),
2918  proc_dim,
2919  num_processors,
2920  machine_coords,//machine_coords_,
2921 
2922  task_dim,
2923  num_tasks,
2924  task_coords,
2925 
2926  task_communication_xadj,
2927  task_communication_adj,
2928  task_communication_edge_weight_,
2929  recursion_depth,
2930  part_no_array,
2931  machine_dimensions,
2932  num_ranks_per_node,
2933  divide_to_prime_first
2934  );
2935 
2936 
2937  part_t* proc_to_task_xadj_;
2938  part_t* proc_to_task_adj_;
2939 
2940  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
2941 
2942  for (part_t i = 0; i <= num_processors; ++i){
2943  proc_to_task_xadj[i] = proc_to_task_xadj_[i];
2944  }
2945  for (part_t i = 0; i < num_tasks; ++i){
2946  proc_to_task_adj[i] = proc_to_task_adj_[i];
2947  }
2948  delete ctm;
2949  delete envConst_;
2950 
2951 }
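 // A call sketch for coordinateTaskMapperInterface; all arrays are
 // caller-owned and the names are illustrative. proc_to_task_xadj must have
 // num_processors+1 entries and proc_to_task_adj num_tasks entries:
 //
 //   std::vector<part_t> p2t_xadj(num_processors + 1), p2t_adj(num_tasks);
 //   Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
 //       problemComm,
 //       proc_dim, num_processors, machine_coords,
 //       task_dim, num_tasks, task_coords,
 //       task_comm_xadj, task_comm_adj,
 //       NULL,                              // edge weights (may be omitted)
 //       p2t_xadj.data(), p2t_adj.data(),
 //       recursion_depth,
 //       NULL,                              // part_no_array
 //       NULL);                             // machine_dimensions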
2952 
2953 template <typename proc_coord_t, typename v_lno_t>
2954 inline void visualize_mapping(int myRank,
2955  const int machine_coord_dim, const int num_ranks, proc_coord_t **machine_coords,
2956  const v_lno_t num_tasks, const v_lno_t *task_communication_xadj, const v_lno_t *task_communication_adj, const int *task_to_rank){
2957 
2958  std::string rankStr = Teuchos::toString<int>(myRank);
2959  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
2960  std::string outF = gnuPlots + rankStr+ extentionS;
2961  std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
2962 
2963  if (machine_coord_dim != 3) {
2964  std::cerr << "Mapping write is only supported for 3-dimensional machine coordinates" << std::endl;
2965  return;
2966  }
2967  std::string ss = "";
2968  std::string procs = "";
2969 
2970  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2971  for(v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task){
2972  int origin_rank = task_to_rank[origin_task];
2973  std::string gnuPlotArrow = "set arrow from ";
2974 
2975  for(int j = 0; j < machine_coord_dim; ++j){
2976  if (j == machine_coord_dim - 1){
2977  gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
2978  procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
2979 
2980  }
2981  else {
2982  gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]) +",";
2983  procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])+ " ";
2984  }
2985  }
2986  procs += "\n";
2987 
2988  gnuPlotArrow += " to ";
2989 
2990 
2991  for (int nind = task_communication_xadj[origin_task]; nind < task_communication_xadj[origin_task + 1]; ++nind){
2992  int neighbor_task = task_communication_adj[nind];
2993 
2994  bool differentnode = false;
2995  int neighbor_rank = task_to_rank[neighbor_task];
2996 
2997  for(int j = 0; j < machine_coord_dim; ++j){
2998  if (int(machine_coords[j][origin_rank]) != int(machine_coords[j][neighbor_rank])){
2999  differentnode = true; break;
3000  }
3001  }
3002  std::tuple<int,int,int, int, int, int> foo(
3003  (int)(machine_coords[0][origin_rank]),
3004  (int)(machine_coords[1][origin_rank]),
3005  (int)(machine_coords[2][origin_rank]),
3006  (int)(machine_coords[0][neighbor_rank]),
3007  (int)(machine_coords[1][neighbor_rank]),
3008  (int)(machine_coords[2][neighbor_rank]));
3009 
3010 
3011  if (differentnode && my_arrows.find(foo) == my_arrows.end()){
3012  my_arrows.insert(foo);
3013 
3014  std::string gnuPlotArrow2 = "";
3015  for(int j = 0; j < machine_coord_dim; ++j){
3016  if(j == machine_coord_dim - 1){
3017  gnuPlotArrow2 += Teuchos::toString<proc_coord_t>(machine_coords[j][neighbor_rank]);
3018  }
3019  else{
3020  gnuPlotArrow2 += Teuchos::toString<proc_coord_t>(machine_coords[j][neighbor_rank]) +",";
3021  }
3022  }
3023  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3024  }
3025  }
3026  }
3027 
3028  std::ofstream procFile("procPlot.plot", std::ofstream::out);
3029  procFile << procs << "\n";
3030  procFile.close();
3031 
3032  //gnuPlotCode << ss;
3033  if(machine_coord_dim == 2){
3034  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3035  } else {
3036  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3037  }
3038 
3039  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3040  gnuPlotCode.close();
3041 }
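 // Usage sketch: visualize_mapping writes gnuPlot<rank>.plot and
 // procPlot.plot into the working directory; the result can be viewed with,
 // e.g.,
 //
 //   gnuplot gnuPlot0.plot
 //
 // Only 3-D machine coordinates are supported; other dimensions return early
 // with an error message.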
3042 
3043 }// namespace Zoltan2
3044 
3045 #endif