Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_AlgHybridD1.hpp
Go to the documentation of this file.
1 #ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
2 #define _ZOLTAN2_ALGHYBRIDD1_HPP_
3 
4 #include <vector>
5 #include <unordered_map>
6 #include <iostream>
7 #include <fstream>
8 #include <queue>
9 #ifdef _WIN32
10 #include <time.h>
11 #else
12 #include <sys/time.h>
13 #endif
14 #include <algorithm>
15 
16 #include "Zoltan2_Algorithm.hpp"
17 #include "Zoltan2_GraphModel.hpp"
19 #include "Zoltan2_Util.hpp"
20 #include "Zoltan2_TPLTraits.hpp"
21 #include "Zoltan2_AlltoAll.hpp"
22 
23 #include "Tpetra_Core.hpp"
24 #include "Teuchos_RCP.hpp"
25 #include "Tpetra_Import.hpp"
26 #include "Tpetra_FEMultiVector.hpp"
27 
28 #include "KokkosKernels_Handle.hpp"
29 #include "KokkosKernels_IOUtils.hpp"
30 #include "KokkosGraph_Distance1Color.hpp"
31 #include "KokkosGraph_Distance1ColorHandle.hpp"
32 #include <stdlib.h>
33 
37 
38 namespace Zoltan2 {
39 
40 template <typename Adapter>
41 class AlgDistance1 : public Algorithm<Adapter>
42 {
43  public:
44 
45  using lno_t = typename Adapter::lno_t;
46  using gno_t = typename Adapter::gno_t;
47  using offset_t = typename Adapter::offset_t;
48  using scalar_t = typename Adapter::scalar_t;
50  using map_t = Tpetra::Map<lno_t, gno_t>;
51  using femv_scalar_t = int;
52  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53  using device_type = typename femv_t::device_type;
54  using execution_space = typename device_type::execution_space;
55  using memory_space = typename device_type::memory_space;
56  using host_exec = typename femv_t::host_view_type::device_type::execution_space;
57  using host_mem = typename femv_t::host_view_type::device_type::memory_space;
58  double timer() {
59  struct timeval tp;
60  gettimeofday(&tp, NULL);
61  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
62  }
63 
64  private:
65 
66  void buildModel(modelFlag_t &flags);
67 
68  //function to invoke KokkosKernels distance-1 coloring
69  //
70  //OUTPUT ARG:
71  //
72  // femv: FEMultiVector containing dual-view of colors.
73  // After this call each vertex will have a locally-valid color.
74  //
75  //INPUT ARGS:
76  //
77  // nVtx: number of local owned vertices
78  //
79  // adjs_view: CSR adjacencies for the local graph
80  //
81  // offset_view: CSR offsets for indexing adjs_view
82  //
83  // vertex_list: list of local IDs of vertices that
84  // need to be recolored
85  //
86  // vertex_list_size: 0 means no vertex list given,
87  // otherwise it simply is the size
88  // of the vertex_list
89  //
90  // recolor: switches between VB_BIT and EB KokkosKernels
91  // algorithms
92  //
93  template <class ExecutionSpace, typename MemorySpace>
94  void colorInterior(const size_t nVtx,
95  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
96  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
97  Teuchos::RCP<femv_t> femv,
98  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
99  size_t vertex_list_size = 0,
100  bool recolor=false){
101 
102  //set up types to be used by KokkosKernels
103  using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
104  <offset_t, lno_t, lno_t, ExecutionSpace, MemorySpace,
105  MemorySpace>;
106  using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
107  using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
108 
109  KernelHandle kh;
110 
111  //pick which KokkosKernels algorithm to use.
112  //VBBIT is more efficient for inputs with max degree < ~6000
113  //EB is more efficient for inputs with max degree > ~6000
114  if(recolor){
115  kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
116  } else {
117  kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
118  }
119 
120  //vertex_list_size indicates whether we have provided a list of vertices to recolor.
121  //Only makes a difference if the algorithm to be used is VBBIT.
122  if(vertex_list_size != 0){
123  kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
124  }
125 
126  kh.set_verbose(verbose);
127 
128  //set the initial coloring of the kh.get_graph_coloring_handle() to be
129  //the data view from the femv.
130  auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
131  auto sv = subview(femvColors, Kokkos::ALL, 0);
132  kh.get_graph_coloring_handle()->set_vertex_colors(sv);
133  kh.get_graph_coloring_handle()->set_tictoc(verbose);
134  KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
135  (&kh, nVtx, nVtx, offset_view, adjs_view);
136  //numColors = kh.get_graph_coloring_handle()->get_num_colors();
137 
138  if(verbose){
139  std::cout<<"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<"\n\n";
140  }
141  }
142 
143  public:
144  //contains all distance-1 conflict detection logic
145  //
146  //OUTPUT ARGS:
147  //
148  // femv_colors: This function uncolors vertices that have
149  // distance-1 conflicts, so these colors will
150  // change if there are any conflicts present
151  //
152  //INPUT ARGS:
153  //
154  // n_local: number of locally owned vertices
155  //
156  // n_ghosts: number of ghosts on this process
157  //
158  // dist_offsets: CSR offsets of the local graph
159  //
160  // dist_adjs: CSR adjacencies of the local graph
161  //
162  // verts_to_send_view: Used to construct a list of verts to send on device.
163  //
164  // verts_to_send_size_atomic: atomic version of the verts_to_send_size view
165  // Used to construct a list on device,
166  // the atomic is necessary for correctness.
167  //
168  // recoloringSize: holds the total amount of recoloring that will be done
169  // locally. Does not need to be atomic.
170  //
171  // rand: view that holds the tie-breaking random numbers indexed by LID.
172  //
173  // gid: view that holds GIDs, for tie breaking in the case that rand
174  // numbers are the same for two vertices.
175  //
176  // ghost_degrees: view that holds degrees only for ghost vertices.
177  // LID n_local corresponds to ghost_degrees(0).
178  //
179  // recolor_degrees: if true, we factor degrees into the conflict detection
180  // if false, we resolve using only consistent random numbers.
181  //
182  template <class ExecutionSpace, typename MemorySpace>
183  void detectConflicts(const size_t n_local, const size_t n_ghosts,
184  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
185  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
186  Kokkos::View<int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
187  Kokkos::View<lno_t*,
188  Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
189  Kokkos::View<size_t*,
190  Kokkos::Device<ExecutionSpace, MemorySpace>,
191  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
192  Kokkos::View<gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
193  Kokkos::View<int*,
194  Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
195  Kokkos::View<gno_t*,
196  Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
197  Kokkos::View<gno_t*,
198  Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
199  bool recolor_degrees){
200  gno_t local_recoloring_size;
201  Kokkos::parallel_reduce("Conflict Detection",
202  Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
203  KOKKOS_LAMBDA(const int& i,
204  gno_t& recoloring_size){
205  lno_t lid = i+n_local;
206  int currColor = femv_colors(lid);
207  int currDegree = ghost_degrees(i);
208  for(offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
209  int nborColor = femv_colors(dist_adjs(j));
210  int nborDegree = 0;
211  if((size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
212  else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
213  if(currColor == nborColor ){
214  if(currDegree < nborDegree && recolor_degrees){
215  femv_colors(lid) = 0;
216  recoloring_size++;
217  }else if (nborDegree < currDegree && recolor_degrees){
218  femv_colors(dist_adjs(j)) = 0;
219  recoloring_size++;
220  }else if(rand(lid) > rand(dist_adjs(j))){
221  femv_colors(lid) = 0;
222  recoloring_size++;
223  break;
224  }else if (rand(dist_adjs(j)) > rand(lid)){
225  femv_colors(dist_adjs(j)) = 0;
226  recoloring_size++;
227  } else {
228  if (gid(lid) >= gid(dist_adjs(j))){
229  femv_colors(lid) = 0;
230  recoloring_size++;
231  break;
232  } else {
233  femv_colors(dist_adjs(j)) = 0;
234  recoloring_size++;
235  }
236  }
237  }
238  }
239  },local_recoloring_size);
240  Kokkos::deep_copy(recoloringSize, local_recoloring_size);
241  Kokkos::fence();
242  Kokkos::parallel_for("Rebuild verts_to_send_view",
243  Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
244  KOKKOS_LAMBDA(const int& i){
245  if(femv_colors(i) == 0){
246  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
247  }
248  });
249  Kokkos::fence();
250  }
251 
252  private:
253  //Communicates owned vertex colors to ghost copies.
254  //
255  //RETURN VALUE:
256  //
257  // returns the time it took for the communication call to complete
258  //
259  //OUTPUT ARGS:
260  //
261  // femv: FEMultivector that holds the dual-view of the colors
262  // After this call, ghost colors will be up-to-date.
263  //
264  // recv: returns the size of the recv buffer
265  //
266  // send: returns the size of the send buffer
267  //
268  //INPUT ARGS:
269  //
270  // mapOwnedPlusGhosts: a Tpetra map that translates between
271  // LID and GID for any vertex on this process.
272  //
273  // nVtx: the number of locally owned vertices.
274  //
275  // verts_to_send: hostmirror of verts to send. This function sends
276  // all vertices in this list to their ghost copies.
277  //
278  // verts_to_send_size: hostmirror of verts_to_send_size, holds the
279  // size of verts_to_send.
280  //
281  // procs_to_send: map that translates LID into a list of processes that
282  // have a ghost copy of the vertex.
283  //
284  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
285  size_t nVtx,
286  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send,
287  typename Kokkos::View<size_t*, device_type>::HostMirror& verts_to_send_size,
288  Teuchos::RCP<femv_t> femv,
289  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
290  gno_t& recv, gno_t& send){
291  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
292  auto colors = subview(femvColors, Kokkos::ALL, 0);
293  int nprocs = comm->getSize();
294 
295  std::vector<int> sendcnts(comm->getSize(), 0);
296  std::vector<gno_t> sdispls(comm->getSize()+1, 0);
297  for(size_t i = 0; i < verts_to_send_size(0); i++){
298  for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
299  sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
300  }
301  }
302 
303  sdispls[0] = 0;
304  gno_t sendsize = 0;
305  std::vector<int> sentcount(nprocs, 0);
306 
307  for(int i = 1; i < comm->getSize()+1; i++){
308  sdispls[i] = sdispls[i-1] + sendcnts[i-1];
309  sendsize += sendcnts[i-1];
310  }
311  send = sendsize;
312  std::vector<gno_t> sendbuf(sendsize,0);
313 
314  for(size_t i = 0; i < verts_to_send_size(0); i++){
315  std::vector<int> procs = procs_to_send.at(verts_to_send(i));
316  for(size_t j = 0; j < procs.size(); j++){
317  size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
318  sentcount[procs[j]] += 2;
319  sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
320  sendbuf[idx] = colors(verts_to_send(i));
321  }
322  }
323 
324  Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
325  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
326  Teuchos::ArrayRCP<gno_t> recvbuf;
327  std::vector<int> recvcnts(comm->getSize(), 0);
328  Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
329 
330  //if we're reporting timings, remove the computation imbalance from the comm timer.
331  if(timing) comm->barrier();
332  double comm_total = 0.0;
333  double comm_temp = timer();
334 
335  AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
336  comm_total += timer() - comm_temp;
337 
338  gno_t recvsize = 0;
339 
340  for(int i = 0; i < recvcnts_view.size(); i++){
341  recvsize += recvcnts_view[i];
342  }
343  recv = recvsize;
344  for(int i = 0; i < recvsize; i+=2){
345  size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
346  if(lid<nVtx && verbose) std::cout<<comm->getRank()<<": received a locally owned vertex, somehow\n";
347  colors(lid) = recvbuf[i+1];
348  }
349 
350  return comm_total;
351  }
352 
353  RCP<const base_adapter_t> adapter;
354  RCP<GraphModel<base_adapter_t> > model;
355  RCP<Teuchos::ParameterList> pl;
356  RCP<Environment> env;
357  RCP<const Teuchos::Comm<int> > comm;
358  bool verbose;
359  bool timing;
360  public:
361  //constructor for the hybrid distributed distance-1 algorithm
363  const RCP<const base_adapter_t> &adapter_,
364  const RCP<Teuchos::ParameterList> &pl_,
365  const RCP<Environment> &env_,
366  const RCP<const Teuchos::Comm<int> > &comm_)
367  : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
368  verbose = pl->get<bool>("verbose",false);
369  timing = pl->get<bool>("timing", false);
370  if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
371  modelFlag_t flags;
372  flags.reset();
373  buildModel(flags);
374  if(verbose) std::cout<<comm->getRank()<<": done constructing coloring class\n";
375  }
376 
377 
378  //Main entry point for graph coloring
379  void color( const RCP<ColoringSolution<Adapter> > &solution ) {
380  if(verbose) std::cout<<comm->getRank()<<": inside coloring function\n";
381  //this will color the global graph in a manner similar to Zoltan
382 
383  //get vertex GIDs in a locally indexed array
384  if(verbose) std::cout<<comm->getRank()<<": getting owned vtxIDs\n";
385  ArrayView<const gno_t> vtxIDs;
386  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
387  size_t nVtx = model->getVertexList(vtxIDs, vwgts);
388  //we do not use weights at this point
389  if(verbose) std::cout<<comm->getRank()<<": getting edge list\n";
390  //get edge information from the model
391  ArrayView<const gno_t> adjs;
392  ArrayView<const offset_t> offsets;
393  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
394  model->getEdgeList(adjs, offsets, ewgts);
395  //again, weights are not used
396 
397  RCP<const map_t> mapOwned;
398  RCP<const map_t> mapWithCopies;
399 
400  std::vector<gno_t> finalGIDs;
401  std::vector<offset_t> finalOffset_vec;
402  std::vector<lno_t> finalAdjs_vec;
403 
404  std::vector<lno_t> reorderToLocal;
405  for(size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
406  if(verbose) std::cout<<comm->getRank()<<": Setting up local datastructures\n";
407  //Set up a typical local mapping here.
408  std::unordered_map<gno_t,lno_t> globalToLocal;
409  std::vector<gno_t> ownedPlusGhosts;
410  for (gno_t i = 0; i < vtxIDs.size(); i++){
411  if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative GID\n";
412  globalToLocal[vtxIDs[i]] = i;
413  ownedPlusGhosts.push_back(vtxIDs[i]);
414  }
415  gno_t nghosts = 0;
416  for (int i = 0; i < adjs.size(); i++){
417  if(globalToLocal.count(adjs[i]) == 0){
418  //new unique ghost found
419  if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative adjacency\n";
420  ownedPlusGhosts.push_back(adjs[i]);
421  globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
422  nghosts++;
423 
424  }
425  }
426  if(verbose) std::cout<<comm->getRank()<<": vec.max_size() = "<<finalAdjs_vec.max_size()<<", adjs.size() = "<<adjs.size()<<"\n";
427  finalAdjs_vec.resize(adjs.size());
428  for(size_t i = 0; i < finalAdjs_vec.size();i++){
429  finalAdjs_vec[i] = globalToLocal[adjs[i]];
430  }
431  for(int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
432  finalGIDs = ownedPlusGhosts;
433 
434 
435  if(verbose) std::cout<<comm->getRank()<<": creating Tpetra Maps\n";
436  Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
437  <Tpetra::global_size_t>::invalid();
438  mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
439 
440  dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
441  mapWithCopies = rcp(new map_t(dummy,
442  Teuchos::arrayViewFromVector(ownedPlusGhosts),
443  0, comm));
444 
445  //create the FEMultiVector for the distributed communication.
446  //We also use the views from this datastructure as arguments to
447  //KokkosKernels coloring functions.
448  if(verbose)std::cout<<comm->getRank()<<": creating FEMultiVector\n";
449  typedef Tpetra::Import<lno_t, gno_t> import_t;
450  Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
451  mapWithCopies));
452  Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
453  importer, 1, true));
454  //Get color array to fill
455  ArrayRCP<int> colors = solution->getColorsRCP();
456  for(size_t i=0; i<nVtx; i++){
457  colors[i] = 0;
458  }
459 
460  //Create random numbers seeded on global IDs so that we don't
461  //need to communicate for consistency. These numbers determine
462  //which vertex gets recolored in the event of a conflict.
463  //taken directly from the Zoltan coloring implementation
464  std::vector<int> rand(finalGIDs.size());
465  for(size_t i = 0; i < finalGIDs.size(); i++){
466  std::srand(finalGIDs[i]);
467  rand[i] = std::rand();
468  }
469 
470  //find out who owns the ghosts on this process
471  std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
472  std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
473  for(size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
474  ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
475  ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
476  mapOwned->getRemoteIndexList(ghostGIDs, owners);
477 
478  //create const views of the CSR representation of the local graph
479  ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
480  ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
481  ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
482  ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
483 
484  //find out which remote processes have a ghost copy of a local owned vertex.
485  Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
486  Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
487 
488  //create a quick lookup from LID -> remote processes with a ghost copy
489  std::unordered_map<lno_t, std::vector<int>> procs_to_send;
490  for(int i = 0; i < exportLIDs.size(); i++){
491  procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
492  }
493 
494  // call coloring function
495  hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
496 
497  //copy colors to the output array.
498  auto femvdata = femv->getData(0);
499  for(int i = 0; i < colors.size(); i++){
500  colors[reorderToLocal[i]] = femvdata[i];
501  }
502 
503  }
504 
505  //This function contains all coloring logic.
506  //
507  //OUTPUT ARGS:
508  //
509  // femv: FEMultiVector with a dual-view of the vertex colors.
510  // The colors will be a valid distance-1 coloring after this
511  // function returns.
512  //
513  //INPUT ARGS:
514  //
515  // nVtx: number of locally owned vertices
516  //
517  // adjs: CSR adjacencies for the local graph
518  //
519  // offsets: CSR offsets into adjs for the local graph
520  //
521  // gids: global IDs for every vertex on this process.
522  // indexed by local ID
523  //
524  // rand: random number generated for every vertex on
525  // this process. Indexed by local ID
526  //
527  // owners: owners of each ghost vertex.
528  // owners[i] = the owning process for vertex with GID gids[i+nVtx]
529  //
530  // mapOwnedPlusGhosts: Tpetra map that translates between LIDs and GIDs
531  //
532  // procs_to_send: maps from LIDs to Process IDs.
533  // procs_to_send[LID] gives a list of processes that have
534  // a ghost copy of LID.
535  //
536  void hybridGMB(const size_t nVtx,
537  const Teuchos::ArrayView<const lno_t>& adjs,
538  const Teuchos::ArrayView<const offset_t>& offsets,
539  const Teuchos::RCP<femv_t>& femv,
540  const Teuchos::ArrayView<const gno_t>& gids,
541  const Teuchos::ArrayView<const int>& rand,
542  const Teuchos::ArrayView<const int>& owners,
543  RCP<const map_t> mapOwnedPlusGhosts,
544  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
545  if(verbose) std::cout<<comm->getRank()<<": inside coloring algorithm\n";
546 
547  //initialize timers
548  double total_time = 0.0;
549  double interior_time = 0.0;
550  double comm_time = 0.0;
551  double comp_time = 0.0;
552  double recoloring_time = 0.0;
553  double conflict_detection = 0.0;
554 
555  const int numStatisticRecordingRounds = 100;
556 
557  //Put together local and remote degrees
558  //1. Communicate remote GIDs to owning processes
559  std::vector<int> deg_send_cnts(comm->getSize(),0);
560  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
561  for(int i = 0; i < owners.size(); i++){
562  deg_send_cnts[owners[i]]++;
563  }
564  deg_sdispls[0] = 0;
565  gno_t deg_sendsize = 0;
566  std::vector<int> deg_sentcount(comm->getSize(),0);
567  for(int i = 1; i < comm->getSize()+1; i++){
568  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
569  deg_sendsize += deg_send_cnts[i-1];
570  }
571  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
572  for(int i = 0; i < owners.size(); i++){
573  size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
574  deg_sentcount[owners[i]]++;
575  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
576  }
577  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
578  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
579  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
580  std::vector<int> deg_recvcnts(comm->getSize(),0);
581  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
582  AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
583 
584  //2. replace GID with local degree
585  for(int i = 0; i < deg_recvbuf.size(); i++){
586  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
587  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
588  }
589  //3. send modified buffer back
590  ArrayRCP<gno_t> ghost_degrees;
591  AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
592  //determine max degree locally and globally
593  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
594  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
595  for(int i = 0; i < ghost_degrees.size(); i++){
596  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
597  ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
598  }
599  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
600 
601  offset_t local_max_degree = 0;
602  offset_t global_max_degree = 0;
603  for(size_t i = 0; i < nVtx; i++){
604  offset_t curr_degree = offsets[i+1] - offsets[i];
605  if(curr_degree > local_max_degree){
606  local_max_degree = curr_degree;
607  }
608  }
609  Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
610  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
611  if(verbose)std::cout<<comm->getRank()<<": creating Kokkos Views\n";
612 
613  Kokkos::View<offset_t*, device_type> dist_degrees("Owned+Ghost degree view",rand.size());
614  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees);
615  //set degree counts for ghosts
616  for(int i = 0; i < adjs.size(); i++){
617  if((size_t)adjs[i] < nVtx) continue;
618  dist_degrees_host(adjs[i])++;
619  }
620  //set degree counts for owned verts
621  for(int i = 0; i < offsets.size()-1; i++){
622  dist_degrees_host(i) = offsets[i+1] - offsets[i];
623  }
624 
625  Kokkos::View<offset_t*, device_type> dist_offsets("Owned+Ghost Offset view", rand.size()+1);
626  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets);
627 
628  //set offsets and total # of adjacencies
629  dist_offsets_host(0) = 0;
630  uint64_t total_adjs = 0;
631  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
632  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
633  total_adjs+= dist_degrees_host(i-1);
634  }
635 
636  Kokkos::View<lno_t*, device_type> dist_adjs("Owned+Ghost adjacency view", total_adjs);
637  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs);
638  //now, use the degree view as a counter
639  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
640  dist_degrees_host(i) = 0;
641  }
642  for(int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
643  if(comm->getSize() > 1){
644  for(size_t i = 0; i < nVtx; i++){
645  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
646  //if the adjacency is a ghost
647  if( (size_t)adjs[j] >= nVtx){
648  //add the symmetric edge to its adjacency list (already accounted for by offsets)
649  dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
650  dist_degrees_host(adjs[j])++;
651  }
652  }
653  }
654  }
655 
656  if(verbose) std::cout<<comm->getRank()<<": copying host mirrors to device views\n";
657  //copy the host data to device memory
658  Kokkos::deep_copy(dist_degrees, dist_degrees_host); //may be unnecessary
659  Kokkos::deep_copy(dist_offsets, dist_offsets_host);
660  Kokkos::deep_copy(dist_adjs, dist_adjs_host);
661  if(verbose) std::cout<<comm->getRank()<<": done copying to device\n";
662 
663  //counter in UVM memory for how many vertices need recoloring.
664  Kokkos::View<gno_t*, device_type> recoloringSize("Recoloring Queue Size",1);
665  typename Kokkos::View<gno_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
666  recoloringSize_host(0) = 0;
667  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
668 
669  //device copy of the random tie-breakers
670  Kokkos::View<int*,device_type> rand_dev("randVec",rand.size());
671  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
672  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
673  rand_host(i) = rand[i];
674  }
675 
676  //device copy of global IDs
677  Kokkos::View<gno_t*, device_type> gid_dev("GIDs",gids.size());
678  typename Kokkos::View<gno_t*,device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
679  for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
680  gid_host(i) = gids[i];
681  }
682 
683  //copy host views to device memory
684  Kokkos::deep_copy(rand_dev,rand_host);
685  Kokkos::deep_copy(gid_dev, gid_host);
686 
687  if(verbose)std::cout<<comm->getRank()<<": done creating recoloring datastructures\n";
688  //count boundary size to allocate list of vertices to recolor.
689  offset_t boundary_size = 0;
690  for(size_t i = 0; i < nVtx; i++){
691  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
692  if((size_t)adjs[j] >= nVtx) {
693  boundary_size++;
694  break;
695  }
696  }
697  }
698  if(verbose)std::cout<<comm->getRank()<<": creating send views\n";
699 
700  //list of vertices to send to remote processes
701  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send",boundary_size);
702  Kokkos::parallel_for("init verts_to_send_view",
703  Kokkos::RangePolicy<execution_space, int>(0,boundary_size),
704  KOKKOS_LAMBDA(const int& i){
705  verts_to_send_view(i) = -1;
706  });
707 
708  //size information for the list of vertices to send. Also includes an atomic copy
709  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
710  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
711  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
712  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
713  //initialize the device view with a value of zero
714  verts_to_send_size_host(0) = 0;
715  deep_copy(verts_to_send_size, verts_to_send_size_host);
716 
717  if(verbose)std::cout<<comm->getRank()<<": Done creating send views, initializing...\n";
718  if(verbose)std::cout<<comm->getRank()<<": boundary_size = "<<boundary_size<<" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<"\n";
719  //initially the verts to send include all boundary vertices.
720  Kokkos::parallel_for("Initialize verts_to_send",
721  Kokkos::RangePolicy<execution_space, int>(0,nVtx),
722  KOKKOS_LAMBDA(const int&i){
723  for(offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
724  if((size_t)dist_adjs(j) >= nVtx){
725  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
726  break;
727  }
728  }
729  });
730  Kokkos::fence();
731 
732 
733  //used to replace the incorrectly recolored ghosts
734  Kokkos::View<int*, device_type> ghost_colors("ghost color backups", rand.size()-nVtx);
735  if(verbose)std::cout<<comm->getRank()<<": Done initializing\n";
736  gno_t sentPerRound[numStatisticRecordingRounds];
737  gno_t recvPerRound[numStatisticRecordingRounds];
738 
739  if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
740  //initialize interior and total timers, barrier to prevent any imbalance from setup.
741  //Only use a barrier if timing is happening.
742  if(timing) comm->barrier();
743  interior_time = timer();
744  total_time = timer();
745  //call the KokkosKernels coloring function with the Tpetra default spaces.
746  bool use_vbbit = (global_max_degree < 6000);
747  this->colorInterior<execution_space,memory_space>
748  (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
749  if(timing){
750  interior_time = timer() - interior_time;
751  comp_time = interior_time;
752  }
753  if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
754  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
755 
756  //if there is more than a single process, check distributed conflicts and recolor
757  if(comm->getSize() > 1){
758 
759  if(verbose)std::cout<<comm->getRank()<<": going to communicate\n";
760 
761  //communicate
762  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
763  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
764  gno_t recv, sent;
765  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
766  nVtx,
767  verts_to_send_host,
768  verts_to_send_size_host,
769  femv,
770  procs_to_send,
771  recv,
772  sent);
773  sentPerRound[0] = sent;
774  recvPerRound[0] = recv;
775  if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
776  verts_to_send_size_host(0) = 0;
777  deep_copy(verts_to_send_size, verts_to_send_size_host);
778  //set the old ghost colors
779  //get the colors from the femv
780  Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
781  femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite); // Partial write
782  Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
783  Kokkos::parallel_for("get colors from femv",
784  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
785  KOKKOS_LAMBDA(const int& i){
786  ghost_colors(i) = femv_colors(i+nVtx);
787  });
788  Kokkos::fence();
789  //detect conflicts on the device, uncolor conflicts consistently.
790  double temp = timer();
791  detectConflicts<execution_space, memory_space>(nVtx,
792  rand.size()-nVtx,
793  dist_offsets,
794  dist_adjs,
795  femv_colors,
796  verts_to_send_view,
797  verts_to_send_size_atomic,
798  recoloringSize,
799  rand_dev,
800  gid_dev,
801  ghost_degrees_dev,
802  recolor_degrees);
803  deep_copy(recoloringSize_host, recoloringSize);
804  conflict_detection += timer() - temp;
805  comp_time += conflict_detection;
806  }
807  //end the initial recoloring
808  if(verbose)std::cout<<comm->getRank()<<": done initial recoloring, begin recoloring loop\n";
809  double totalPerRound[numStatisticRecordingRounds];
810  double commPerRound[numStatisticRecordingRounds];
811  double compPerRound[numStatisticRecordingRounds];
812  double recoloringPerRound[numStatisticRecordingRounds];
813  double conflictDetectionPerRound[numStatisticRecordingRounds];
814  double serialRecoloringPerRound[numStatisticRecordingRounds];
815  int vertsPerRound[numStatisticRecordingRounds];
816  bool done = false; //We're only done when all processors are done
817  if(comm->getSize() == 1) done = true;
818  totalPerRound[0] = interior_time + comm_time + conflict_detection;
819  recoloringPerRound[0] = 0;
820  commPerRound[0] = comm_time;
821  compPerRound[0] = interior_time + conflict_detection;
822  conflictDetectionPerRound[0] = conflict_detection;
823  recoloringPerRound[0] = 0;
824  vertsPerRound[0] = 0;
825  int distributedRounds = 1; //this is the same across all processors
826  int serial_threshold = this->pl->template get<int>("serial_threshold",0);
827 
828  Kokkos::View<lno_t*, device_type> verts_to_recolor("verts_to_recolor", boundary_size);
829  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
830  //while the queue is not empty
831  while(recoloringSize_host(0) > 0 || !done){
832  if(recoloringSize_host(0) < serial_threshold) break;
833  //get a subview of the colors:
834  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
835  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
836  //color everything in the recoloring queue, put everything on conflict queue
837  if(distributedRounds < numStatisticRecordingRounds) {
838  vertsPerRound[distributedRounds] = recoloringSize_host(0);
839  }
840 
841  //copying the send view to the recolor view is necessary because
842  //KokkosKernels can change the view passed in, and we need the send view
843  //intact for communication.
844  Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
845 
846  double recolor_temp = timer();
847  //use KokkosKernels to recolor the conflicting vertices.
848  deep_copy(verts_to_send_size_host, verts_to_send_size);
849  if(verts_to_send_size_host(0) > 0){
850  this->colorInterior<execution_space,
851  memory_space>(femv_colors.size(),
852  dist_adjs,dist_offsets,
853  femv,
854  verts_to_recolor,
855  verts_to_send_size_host(0),
856  use_vbbit);
857  }
858 
859  if(distributedRounds < numStatisticRecordingRounds){
860  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
861  recoloring_time += recoloringPerRound[distributedRounds];
862  comp_time += recoloringPerRound[distributedRounds];
863  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
864  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
865  } else if(timing) {
866  double recolor_round_time = timer() - recolor_temp;
867  recoloring_time += recolor_round_time;
868  comp_time += recolor_round_time;
869  }
870 
871  //reset the recoloringSize device host and device views
872  //to zero
873  recoloringSize_host(0) = 0;
874  Kokkos::deep_copy(recoloringSize,recoloringSize_host);
875 
876  Kokkos::parallel_for("set femv colors",
877  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
878  KOKKOS_LAMBDA(const int& i){
879  femv_colors(i+nVtx) = ghost_colors(i);
880  });
881  Kokkos::fence();
882  //communicate
883  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
884  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
885  gno_t sent,recv;
886  // Reset device views
887  femvColors = decltype(femvColors)();
888  femv_colors = decltype(femv_colors)();
889 
890  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
891  nVtx,
892  verts_to_send_host,
893  verts_to_send_size_host,
894  femv,
895  procs_to_send,
896  recv,
897  sent);
898  comm_time += curr_comm_time;
899  if(distributedRounds < numStatisticRecordingRounds){
900  commPerRound[distributedRounds] = curr_comm_time;
901  sentPerRound[distributedRounds] = sent;
902  recvPerRound[distributedRounds] = recv;
903  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
904  }
905  //detect conflicts in parallel. For a detected conflict,
906  //reset the vertex-to-be-recolored's color to 0, in order to
907  //allow KokkosKernels to recolor correctly.
908 
909  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
910  femv_colors = subview(femvColors, Kokkos::ALL, 0);
911  Kokkos::parallel_for("get femv colors 2",
912  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
913  KOKKOS_LAMBDA(const int& i){
914  ghost_colors(i) = femv_colors(i+nVtx);
915  });
916  Kokkos::fence();
917  verts_to_send_size_host(0) = 0;
918  deep_copy(verts_to_send_size, verts_to_send_size_host);
919  double detection_temp = timer();
920  detectConflicts<execution_space, memory_space>(nVtx,
921  rand.size()-nVtx,
922  dist_offsets,
923  dist_adjs,
924  femv_colors,
925  verts_to_send_view,
926  verts_to_send_size_atomic,
927  recoloringSize,
928  rand_dev,
929  gid_dev,
930  ghost_degrees_dev,
931  recolor_degrees);
932 
933  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
934 
935  if(distributedRounds < numStatisticRecordingRounds){
936  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
937  conflict_detection += conflictDetectionPerRound[distributedRounds];
938  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
939  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
940  comp_time += conflictDetectionPerRound[distributedRounds];
941  } else if(timing){
942  double conflict_detection_round_time = timer()- detection_temp;
943  conflict_detection += conflict_detection_round_time;
944  comp_time += conflict_detection_round_time;
945  }
946  //do a reduction to determine if we're done
947  int globalDone = 0;
948  int localDone = recoloringSize_host(0);
949  Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
950  //We're only allowed to stop once everyone has no work to do.
951  //collectives will hang if one process exits.
952  distributedRounds++;
953  done = !globalDone;
954  }
955 
956  if(recoloringSize_host(0) > 0 || !done){
957  ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
958  deep_copy(ghost_colors_host, ghost_colors);
959  deep_copy(verts_to_send_host, verts_to_send_view);
960  deep_copy(verts_to_send_size_host, verts_to_send_size);
961  }
962 
963 
964  //finish the local coloring in serial on the host
965  while(recoloringSize_host(0) > 0 || !done){
966  //Use non-templated call to get the Host view
967  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
968  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
969  /*Kokkos::View<int*, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>>femv_colors_host = create_mirror(femv_colors);
970  Kokkos::deep_copy(femv_colors_host, femv_colors);*/
971  if(distributedRounds < 100){
972  vertsPerRound[distributedRounds] = recoloringSize_host(0);
973  }
974 
975  double recolor_temp = timer();
976  //use KokkosKernels to recolor the conflicting vertices
977  if(verts_to_send_size_host(0) > 0){
978  this->colorInterior<host_exec,
979  host_mem>
980  (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0), true);
981  }
982 
983  if(distributedRounds < numStatisticRecordingRounds){
984  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
985  recoloring_time += recoloringPerRound[distributedRounds];
986  comp_time += recoloringPerRound[distributedRounds];
987  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
988  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
989  } else if(timing){
990  double recolor_serial_round_time = timer() - recolor_temp;
991  recoloring_time += recolor_serial_round_time;
992  comp_time += recolor_serial_round_time;
993  }
994 
995  recoloringSize_host(0) = 0;
996 
997  for(size_t i = 0; i < rand.size() -nVtx; i++){
998  femv_colors(i+nVtx) = ghost_colors_host(i);
999  }
1000 
1001  gno_t sent,recv;
1002  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
1003  nVtx,
1004  verts_to_send_host,
1005  verts_to_send_size_host,
1006  femv,
1007  procs_to_send,
1008  recv,
1009  sent);
1010  comm_time += curr_comm_time;
1011 
1012  if(distributedRounds < numStatisticRecordingRounds){
1013  commPerRound[distributedRounds] = curr_comm_time;
1014  sentPerRound[distributedRounds] = sent;
1015  recvPerRound[distributedRounds] = recv;
1016  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1017  }
1018  for(size_t i = 0; i < rand.size()-nVtx; i++){
1019  ghost_colors_host(i) = femv_colors(i+nVtx);
1020  }
1021 
1022  verts_to_send_size_host(0) = 0;
1023  double detection_temp = timer();
1024  detectConflicts<host_exec, host_mem>(nVtx,
1025  rand.size()-nVtx,
1026  dist_offsets_host,
1027  dist_adjs_host,
1028  femv_colors,
1029  verts_to_send_host,
1030  verts_to_send_size_host,
1031  recoloringSize_host,
1032  rand_host,
1033  gid_host,
1034  ghost_degrees_host,
1035  recolor_degrees);
1036  if(distributedRounds < numStatisticRecordingRounds){
1037  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1038  conflict_detection += conflictDetectionPerRound[distributedRounds];
1039  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1040  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1041  comp_time += conflictDetectionPerRound[distributedRounds];
1042  } else if(timing){
1043  double conflict_detection_serial_round_time = timer() - detection_temp;
1044  conflict_detection += conflict_detection_serial_round_time;
1045  comp_time += conflict_detection_serial_round_time;
1046  }
1047  //do a reduction to determine if we're done
1048  int globalDone = 0;
1049  int localDone = recoloringSize_host(0);
1050  Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1051  distributedRounds++;
1052  done = !globalDone;
1053  }
1054  total_time = timer() - total_time;
1055 
1056  //only take time to compute statistics if the user wants them
1057  if(verbose){
1058  std::cout<<comm->getRank()<<": done recoloring loop, computing statistics\n";
1059  int localBoundaryVertices = 0;
1060  for(size_t i = 0; i < nVtx; i++){
1061  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1062  if((size_t)adjs[j] >= nVtx){
1063  localBoundaryVertices++;
1064  break;
1065  }
1066  }
1067  }
1068  //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
1069  //if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1070  int totalBoundarySize = 0;
1071  int totalVertsPerRound[numStatisticRecordingRounds];
1072  double finalTotalPerRound[numStatisticRecordingRounds];
1073  double maxRecoloringPerRound[numStatisticRecordingRounds];
1074  double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1075  double minRecoloringPerRound[numStatisticRecordingRounds];
1076  double finalCommPerRound[numStatisticRecordingRounds];
1077  double finalCompPerRound[numStatisticRecordingRounds];
1078  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1079  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1080  gno_t finalSentPerRound[numStatisticRecordingRounds];
1081  for(int i = 0; i < numStatisticRecordingRounds; i++) {
1082  totalVertsPerRound[i] = 0;
1083  finalTotalPerRound[i] = 0.0;
1084  maxRecoloringPerRound[i] = 0.0;
1085  minRecoloringPerRound[i] = 0.0;
1086  finalCommPerRound[i] = 0.0;
1087  finalCompPerRound[i] = 0.0;
1088  finalConflictDetectionPerRound[i] = 0.0;
1089  finalSentPerRound[i] = 0;
1090  finalRecvPerRound[i] = 0;
1091  }
1092  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1093  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1094  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1095  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1096  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1097  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1098  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1099  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1100  Teuchos::reduceAll<int,double>(*comm,
1101  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1102  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1103  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1104 
1105  std::cout << "Rank " << comm->getRank()
1106  << ": boundary size: " << localBoundaryVertices << std::endl;
1107  if(comm->getRank()==0)
1108  std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1109  for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1110  std::cout << "Rank " << comm->getRank()
1111  << ": recolor " << vertsPerRound[i]
1112  << " vertices in round " << i << std::endl;
1113  if(comm->getRank()==0) {
1114  std::cout << "recolored " << totalVertsPerRound[i]
1115  << " vertices in round " << i << std::endl;
1116  std::cout << "total time in round " << i
1117  << ": " << finalTotalPerRound[i] << std::endl;;
1118  std::cout << "recoloring time in round " << i
1119  << ": " << maxRecoloringPerRound[i] << std::endl;
1120  std::cout << "serial recoloring time in round " << i
1121  << ": " << finalSerialRecoloringPerRound[i] << std::endl;
1122  std::cout << "min recoloring time in round " << i
1123  << ": " << minRecoloringPerRound[i] << std::endl;
1124  std::cout << "conflict detection time in round " << i
1125  << ": " << finalConflictDetectionPerRound[i] << std::endl;
1126  std::cout << "comm time in round " << i
1127  << ": " << finalCommPerRound[i] << std::endl;
1128  std::cout << "total sent in round " << i
1129  << ": " << finalSentPerRound[i] << std::endl;
1130  std::cout << "total recv in round " << i
1131  << ": " << finalRecvPerRound[i] << std::endl;
1132  std::cout << "comp time in round " << i
1133  << ": " << finalCompPerRound[i] << std::endl;
1134  }
1135  }
1136  } else if(timing){
1137  double global_total_time = 0.0;
1138  double global_recoloring_time=0.0;
1139  double global_min_recoloring_time=0.0;
1140  double global_conflict_detection=0.0;
1141  double global_comm_time=0.0;
1142  double global_comp_time=0.0;
1143  double global_interior_time = 0.0;
1144  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1145  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1146  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1147  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1148  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1149  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1150  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1151  comm->barrier();
1152  fflush(stdout);
1153  if(comm->getRank()==0){
1154  std::cout << "Total Time: " << global_total_time << std::endl;
1155  std::cout << "Interior Time: " << global_interior_time << std::endl;
1156  std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1157  std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1158  std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1159  std::cout << "Comm Time: " << global_comm_time << std::endl;
1160  std::cout << "Comp Time: " << global_comp_time << std::endl;
1161  }
1162  }
1163  if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
1164  }
1165 };
1166 
1167 template <typename Adapter>
1168 void AlgDistance1<Adapter>::buildModel(modelFlag_t &flags){
1169  flags.set(REMOVE_SELF_EDGES);
1170 
1171  this->env->debug(DETAILED_STATUS, " building graph model");
1172  if(verbose) std::cout<<comm->getRank()<<": starting to construct graph model\n";
1173  this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1174  this->comm, flags));
1175  if(verbose) std::cout<<comm->getRank()<<": done constructing graph model\n";
1176  this->env->debug(DETAILED_STATUS, " graph model built");
1177 }
1178 
1179 
1180 }//end namespace Zoltan2
1181 #endif
typename Adapter::lno_t lno_t
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
typename device_type::memory_space memory_space
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
typename Adapter::offset_t offset_t
sub-steps, each method&#39;s entry and exit
typename femv_t::device_type device_type
algorithm requires no self edges
typename femv_t::host_view_type::device_type::memory_space host_mem
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
typename Adapter::gno_t gno_t
Tpetra::Map< lno_t, gno_t > map_t
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
typename device_type::execution_space execution_space
typename Adapter::base_adapter_t base_adapter_t
typename femv_t::host_view_type::device_type::execution_space host_exec
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS&#39;s idx_t...
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> ghost_degrees, bool recolor_degrees)
Defines the ColoringSolution class.
Tpetra::global_size_t global_size_t
typename Adapter::scalar_t scalar_t
Defines the GraphModel interface.
A gathering of useful namespace methods.
AlltoAll communication methods.
The class containing coloring solution.