Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_AlgHybridD1.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Zoltan2: A package of combinatorial algorithms for scientific computing
4 //
5 // Copyright 2012 NTESS and the Zoltan2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
11 #define _ZOLTAN2_ALGHYBRIDD1_HPP_
12 
13 #include <vector>
14 #include <unordered_map>
15 #include <iostream>
16 #include <fstream>
17 #include <queue>
18 #ifdef _WIN32
19 #include <time.h>
20 #else
21 #include <sys/time.h>
22 #endif
23 #include <algorithm>
24 
25 #include "Zoltan2_Algorithm.hpp"
26 #include "Zoltan2_GraphModel.hpp"
28 #include "Zoltan2_Util.hpp"
29 #include "Zoltan2_TPLTraits.hpp"
30 #include "Zoltan2_AlltoAll.hpp"
31 
32 #include "Tpetra_Core.hpp"
33 #include "Teuchos_RCP.hpp"
34 #include "Tpetra_Import.hpp"
35 #include "Tpetra_FEMultiVector.hpp"
36 
37 #include "KokkosKernels_Handle.hpp"
38 #include "KokkosKernels_IOUtils.hpp"
39 #include "KokkosGraph_Distance1Color.hpp"
40 #include "KokkosGraph_Distance1ColorHandle.hpp"
41 #include <stdlib.h>
42 
46 
47 namespace Zoltan2 {
48 
49 template <typename Adapter>
50 class AlgDistance1 : public Algorithm<Adapter>
51 {
52  public:
53 
54  using lno_t = typename Adapter::lno_t;
55  using gno_t = typename Adapter::gno_t;
56  using offset_t = typename Adapter::offset_t;
57  using scalar_t = typename Adapter::scalar_t;
59  using map_t = Tpetra::Map<lno_t, gno_t>;
60  using femv_scalar_t = int;
61  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
62  using device_type = typename femv_t::device_type;
63  using execution_space = typename device_type::execution_space;
64  using memory_space = typename device_type::memory_space;
65  using host_exec = typename femv_t::host_view_type::device_type::execution_space;
66  using host_mem = typename femv_t::host_view_type::device_type::memory_space;
67  double timer() {
68  struct timeval tp;
69  gettimeofday(&tp, NULL);
70  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
71  }
72 
73  private:
74 
75  void buildModel(modelFlag_t &flags);
76 
77  //function to invoke KokkosKernels distance-1 coloring
78  //
79  //OUTPUT ARG:
80  //
81  // femv: FEMultiVector containing dual-view of colors.
82  // After this call each vertex will have a locally-valid color.
83  //
84  //INPUT ARGS:
85  //
86  // nVtx: number of local owned vertices
87  //
88  // adjs_view: CSR adjacencies for the local graph
89  //
90  // offset_view: CSR offsets for indexing adjs_view
91  //
92  // vertex_list: list of local IDs of vertices that
93  // need to be recolored
94  //
95  // vertex_list_size: 0 means no vertex list given,
96  // otherwise it simply is the size
97  // of the vertex_list
98  //
99  // recolor: switches between VB_BIT and EB KokkosKernels
100  // algorithms
101  //
102  template <class ExecutionSpace, typename MemorySpace>
103  void colorInterior(const size_t nVtx,
104  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
105  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
106  Teuchos::RCP<femv_t> femv,
107  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
108  size_t vertex_list_size = 0,
109  bool recolor=false){
110 
111  //set up types to be used by KokkosKernels
112  using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
113  <offset_t, lno_t, lno_t, ExecutionSpace, MemorySpace,
114  MemorySpace>;
115  using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
116  using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
117 
118  KernelHandle kh;
119 
120  //pick which KokkosKernels algorithm to use.
121  //VBBIT is more efficient for inputs with max degree < ~6000
122  //EB is more efficient for inputs with max degree > ~6000
123  if(recolor){
124  kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
125  } else {
126  kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
127  }
128 
129  //vertex_list_size indicates whether we have provided a list of vertices to recolor.
130  //Only makes a difference if the algorithm to be used is VBBIT.
131  if(vertex_list_size != 0){
132  kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
133  }
134 
135  kh.set_verbose(verbose);
136 
137  //set the initial coloring of the kh.get_graph_coloring_handle() to be
138  //the data view from the femv.
139  auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
140  auto sv = subview(femvColors, Kokkos::ALL, 0);
141  kh.get_graph_coloring_handle()->set_vertex_colors(sv);
142  kh.get_graph_coloring_handle()->set_tictoc(verbose);
143  KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
144  (&kh, nVtx, nVtx, offset_view, adjs_view);
145  //numColors = kh.get_graph_coloring_handle()->get_num_colors();
146 
147  if(verbose){
148  std::cout<<"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<"\n\n";
149  }
150  }
151 
152  public:
153  //contains all distance-1 conflict detection logic
154  //
155  //OUTPUT ARGS:
156  //
157  // femv_colors: This function uncolors vertices that have
158  // distance-1 conflicts, so these colors will
159  // change if there are any conflicts present
160  //
161  //INPUT ARGS:
162  //
163  // n_local: number of locally owned vertices
164  //
165  // n_ghosts: number of ghosts on this process
166  //
167  // dist_offsets: CSR offsets of the local graph
168  //
169  // dist_adjs: CSR adjacencies of the local graph
170  //
171  // verts_to_send_view: Used to construct a list of verts to send on device.
172  //
173  // verts_to_send_size_atomic: atomic version of the verts_to_send_size view
174  // Used to construct a list on device,
175  // the atomic is necessary for correctness.
176  //
177  // recoloringSize: holds the total amount of recoloring that will be done
178  // locally. Does not need to be atomic.
179  //
180  // rand: view that holds the tie-breaking random numbers indexed by LID.
181  //
182  // gid: view that holds GIDs, for tie breaking in the case that rand
183  // numbers are the same for two vertices.
184  //
185  // ghost_degrees: view that holds degrees only for ghost vertices.
186  // LID n_local corresponds to ghost_degrees(0).
187  //
188  // recolor_degrees: if true, we factor degrees into the conflict detection
189  // if false, we resolve using only consistent random numbers.
190  //
191  template <class ExecutionSpace, typename MemorySpace>
192  void detectConflicts(const size_t n_local, const size_t n_ghosts,
193  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
194  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
195  Kokkos::View<int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
196  Kokkos::View<lno_t*,
197  Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
198  Kokkos::View<size_t*,
199  Kokkos::Device<ExecutionSpace, MemorySpace>,
200  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
201  Kokkos::View<gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
202  Kokkos::View<int*,
203  Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
204  Kokkos::View<gno_t*,
205  Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
206  Kokkos::View<gno_t*,
207  Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
208  bool recolor_degrees){
209  gno_t local_recoloring_size;
210  Kokkos::parallel_reduce("Conflict Detection",
211  Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
212  KOKKOS_LAMBDA(const int& i,
213  gno_t& recoloring_size){
214  lno_t lid = i+n_local;
215  int currColor = femv_colors(lid);
216  int currDegree = ghost_degrees(i);
217  for(offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
218  int nborColor = femv_colors(dist_adjs(j));
219  int nborDegree = 0;
220  if((size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
221  else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
222  if(currColor == nborColor ){
223  if(currDegree < nborDegree && recolor_degrees){
224  femv_colors(lid) = 0;
225  recoloring_size++;
226  }else if (nborDegree < currDegree && recolor_degrees){
227  femv_colors(dist_adjs(j)) = 0;
228  recoloring_size++;
229  }else if(rand(lid) > rand(dist_adjs(j))){
230  femv_colors(lid) = 0;
231  recoloring_size++;
232  break;
233  }else if (rand(dist_adjs(j)) > rand(lid)){
234  femv_colors(dist_adjs(j)) = 0;
235  recoloring_size++;
236  } else {
237  if (gid(lid) >= gid(dist_adjs(j))){
238  femv_colors(lid) = 0;
239  recoloring_size++;
240  break;
241  } else {
242  femv_colors(dist_adjs(j)) = 0;
243  recoloring_size++;
244  }
245  }
246  }
247  }
248  },local_recoloring_size);
249  Kokkos::deep_copy(recoloringSize, local_recoloring_size);
250  Kokkos::fence();
251  Kokkos::parallel_for("Rebuild verts_to_send_view",
252  Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
253  KOKKOS_LAMBDA(const int& i){
254  if(femv_colors(i) == 0){
255  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
256  }
257  });
258  Kokkos::fence();
259  }
260 
261  private:
262  //Communicates owned vertex colors to ghost copies.
263  //
264  //RETURN VALUE:
265  //
266  // returns the time it took for the communication call to complete
267  //
268  //OUTPUT ARGS:
269  //
270  // femv: FEMultivector that holds the dual-view of the colors
271  // After this call, ghost colors will be up-to-date.
272  //
273  // recv: returns the size of the recv buffer
274  //
275  // send: returns the size of the send buffer
276  //
277  //INPUT ARGS:
278  //
279  // mapOwnedPlusGhosts: a Tpetra map that translates between
280  // LID and GID for any vertex on this process.
281  //
282  // nVtx: the number of locally owned vertices.
283  //
284  // verts_to_send: hostmirror of verts to send. This function sends
285  // all vertices in this list to their ghost copies.
286  //
287  // verts_to_send_size: hostmirror of verts_to_send_size, holds the
288  // size of verts_to_send.
289  //
290  // procs_to_send: map that translates LID into a list of processes that
291  // have a ghost copy of the vertex.
292  //
293  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
294  size_t nVtx,
295  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send,
296  typename Kokkos::View<size_t*, device_type>::HostMirror& verts_to_send_size,
297  Teuchos::RCP<femv_t> femv,
298  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
299  gno_t& recv, gno_t& send){
300  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
301  auto colors = subview(femvColors, Kokkos::ALL, 0);
302  int nprocs = comm->getSize();
303 
304  std::vector<int> sendcnts(comm->getSize(), 0);
305  std::vector<gno_t> sdispls(comm->getSize()+1, 0);
306  for(size_t i = 0; i < verts_to_send_size(0); i++){
307  for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
308  sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
309  }
310  }
311 
312  sdispls[0] = 0;
313  gno_t sendsize = 0;
314  std::vector<int> sentcount(nprocs, 0);
315 
316  for(int i = 1; i < comm->getSize()+1; i++){
317  sdispls[i] = sdispls[i-1] + sendcnts[i-1];
318  sendsize += sendcnts[i-1];
319  }
320  send = sendsize;
321  std::vector<gno_t> sendbuf(sendsize,0);
322 
323  for(size_t i = 0; i < verts_to_send_size(0); i++){
324  std::vector<int> procs = procs_to_send.at(verts_to_send(i));
325  for(size_t j = 0; j < procs.size(); j++){
326  size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
327  sentcount[procs[j]] += 2;
328  sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
329  sendbuf[idx] = colors(verts_to_send(i));
330  }
331  }
332 
333  Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
334  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
335  Teuchos::ArrayRCP<gno_t> recvbuf;
336  std::vector<int> recvcnts(comm->getSize(), 0);
337  Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
338 
339  //if we're reporting timings, remove the computation imbalance from the comm timer.
340  if(timing) comm->barrier();
341  double comm_total = 0.0;
342  double comm_temp = timer();
343 
344  AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
345  comm_total += timer() - comm_temp;
346 
347  gno_t recvsize = 0;
348 
349  for(int i = 0; i < recvcnts_view.size(); i++){
350  recvsize += recvcnts_view[i];
351  }
352  recv = recvsize;
353  for(int i = 0; i < recvsize; i+=2){
354  size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
355  if(lid<nVtx && verbose) std::cout<<comm->getRank()<<": received a locally owned vertex, somehow\n";
356  colors(lid) = recvbuf[i+1];
357  }
358 
359  return comm_total;
360  }
361 
362  RCP<const base_adapter_t> adapter;
363  RCP<GraphModel<base_adapter_t> > model;
364  RCP<Teuchos::ParameterList> pl;
365  RCP<Environment> env;
366  RCP<const Teuchos::Comm<int> > comm;
367  bool verbose;
368  bool timing;
369  public:
370  //constructor for the hybrid distributed distance-1 algorithm
372  const RCP<const base_adapter_t> &adapter_,
373  const RCP<Teuchos::ParameterList> &pl_,
374  const RCP<Environment> &env_,
375  const RCP<const Teuchos::Comm<int> > &comm_)
376  : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
377  verbose = pl->get<bool>("verbose",false);
378  timing = pl->get<bool>("timing", false);
379  if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
380  modelFlag_t flags;
381  flags.reset();
382  buildModel(flags);
383  if(verbose) std::cout<<comm->getRank()<<": done constructing coloring class\n";
384  }
385 
386 
387  //Main entry point for graph coloring
388  void color( const RCP<ColoringSolution<Adapter> > &solution ) {
389  if(verbose) std::cout<<comm->getRank()<<": inside coloring function\n";
390  //this will color the global graph in a manner similar to Zoltan
391 
392  //get vertex GIDs in a locally indexed array
393  if(verbose) std::cout<<comm->getRank()<<": getting owned vtxIDs\n";
394  ArrayView<const gno_t> vtxIDs;
395  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
396  size_t nVtx = model->getVertexList(vtxIDs, vwgts);
397  //we do not use weights at this point
398  if(verbose) std::cout<<comm->getRank()<<": getting edge list\n";
399  //get edge information from the model
400  ArrayView<const gno_t> adjs;
401  ArrayView<const offset_t> offsets;
402  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
403  model->getEdgeList(adjs, offsets, ewgts);
404  //again, weights are not used
405 
406  RCP<const map_t> mapOwned;
407  RCP<const map_t> mapWithCopies;
408 
409  std::vector<gno_t> finalGIDs;
410  std::vector<offset_t> finalOffset_vec;
411  std::vector<lno_t> finalAdjs_vec;
412 
413  std::vector<lno_t> reorderToLocal;
414  for(size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
415  if(verbose) std::cout<<comm->getRank()<<": Setting up local datastructures\n";
416  //Set up a typical local mapping here.
417  std::unordered_map<gno_t,lno_t> globalToLocal;
418  std::vector<gno_t> ownedPlusGhosts;
419  for (gno_t i = 0; i < vtxIDs.size(); i++){
420  if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative GID\n";
421  globalToLocal[vtxIDs[i]] = i;
422  ownedPlusGhosts.push_back(vtxIDs[i]);
423  }
424  gno_t nghosts = 0;
425  for (int i = 0; i < adjs.size(); i++){
426  if(globalToLocal.count(adjs[i]) == 0){
427  //new unique ghost found
428  if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative adjacency\n";
429  ownedPlusGhosts.push_back(adjs[i]);
430  globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
431  nghosts++;
432 
433  }
434  }
435  if(verbose) std::cout<<comm->getRank()<<": vec.max_size() = "<<finalAdjs_vec.max_size()<<", adjs.size() = "<<adjs.size()<<"\n";
436  finalAdjs_vec.resize(adjs.size());
437  for(size_t i = 0; i < finalAdjs_vec.size();i++){
438  finalAdjs_vec[i] = globalToLocal[adjs[i]];
439  }
440  for(int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
441  finalGIDs = ownedPlusGhosts;
442 
443 
444  if(verbose) std::cout<<comm->getRank()<<": creating Tpetra Maps\n";
445  Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
446  <Tpetra::global_size_t>::invalid();
447  mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
448 
449  dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
450  mapWithCopies = rcp(new map_t(dummy,
451  Teuchos::arrayViewFromVector(ownedPlusGhosts),
452  0, comm));
453 
454  //create the FEMultiVector for the distributed communication.
455  //We also use the views from this datastructure as arguments to
456  //KokkosKernels coloring functions.
457  if(verbose)std::cout<<comm->getRank()<<": creating FEMultiVector\n";
458  typedef Tpetra::Import<lno_t, gno_t> import_t;
459  Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
460  mapWithCopies));
461  Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
462  importer, 1, true));
463  //Get color array to fill
464  ArrayRCP<int> colors = solution->getColorsRCP();
465  for(size_t i=0; i<nVtx; i++){
466  colors[i] = 0;
467  }
468 
469  //Create random numbers seeded on global IDs so that we don't
470  //need to communicate for consistency. These numbers determine
471  //which vertex gets recolored in the event of a conflict.
472  //taken directly from the Zoltan coloring implementation
473  std::vector<int> rand(finalGIDs.size());
474  for(size_t i = 0; i < finalGIDs.size(); i++){
475  std::srand(finalGIDs[i]);
476  rand[i] = std::rand();
477  }
478 
479  //find out who owns the ghosts on this process
480  std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
481  std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
482  for(size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
483  ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
484  ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
485  mapOwned->getRemoteIndexList(ghostGIDs, owners);
486 
487  //create const views of the CSR representation of the local graph
488  ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
489  ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
490  ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
491  ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
492 
493  //find out which remote processes have a ghost copy of a local owned vertex.
494  Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
495  Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
496 
497  //create a quick lookup from LID -> remote processes with a ghost copy
498  std::unordered_map<lno_t, std::vector<int>> procs_to_send;
499  for(int i = 0; i < exportLIDs.size(); i++){
500  procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
501  }
502 
503  // call coloring function
504  hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
505 
506  //copy colors to the output array.
507  auto femvdata = femv->getData(0);
508  for(int i = 0; i < colors.size(); i++){
509  colors[reorderToLocal[i]] = femvdata[i];
510  }
511 
512  }
513 
514  //This function contains all coloring logic.
515  //
516  //OUTPUT ARGS:
517  //
518  // femv: FEMultiVector with a dual-view of the vertex colors.
519  // The colors will be a valid distance-1 coloring after this
520  // function returns.
521  //
522  //INPUT ARGS:
523  //
524  // nVtx: number of locally owned vertices
525  //
526  // adjs: CSR adjacencies for the local graph
527  //
528  // offsets: CSR offsets into adjs for the local graph
529  //
530  // gids: global IDs for every vertex on this process.
531  // indexed by local ID
532  //
533  // rand: random number generated for every vertex on
534  // this process. Indexed by local ID
535  //
536  // owners: owners of each ghost vertex.
537  // owners[i] = the owning process for vertex with GID gids[i+nVtx]
538  //
539  // mapOwnedPlusGhosts: Tpetra map that translates between LIDs and GIDs
540  //
541  // procs_to_send: maps from LIDs to Process IDs.
542  // procs_to_send[LID] gives a list of processes that have
543  // a ghost copy of LID.
544  //
545  void hybridGMB(const size_t nVtx,
546  const Teuchos::ArrayView<const lno_t>& adjs,
547  const Teuchos::ArrayView<const offset_t>& offsets,
548  const Teuchos::RCP<femv_t>& femv,
549  const Teuchos::ArrayView<const gno_t>& gids,
550  const Teuchos::ArrayView<const int>& rand,
551  const Teuchos::ArrayView<const int>& owners,
552  RCP<const map_t> mapOwnedPlusGhosts,
553  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
554  if(verbose) std::cout<<comm->getRank()<<": inside coloring algorithm\n";
555 
556  //initialize timers
557  double total_time = 0.0;
558  double interior_time = 0.0;
559  double comm_time = 0.0;
560  double comp_time = 0.0;
561  double recoloring_time = 0.0;
562  double conflict_detection = 0.0;
563 
564  const int numStatisticRecordingRounds = 100;
565 
566  //Put together local and remote degrees
567  //1. Communicate remote GIDs to owning processes
568  std::vector<int> deg_send_cnts(comm->getSize(),0);
569  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
570  for(int i = 0; i < owners.size(); i++){
571  deg_send_cnts[owners[i]]++;
572  }
573  deg_sdispls[0] = 0;
574  gno_t deg_sendsize = 0;
575  std::vector<int> deg_sentcount(comm->getSize(),0);
576  for(int i = 1; i < comm->getSize()+1; i++){
577  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
578  deg_sendsize += deg_send_cnts[i-1];
579  }
580  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
581  for(int i = 0; i < owners.size(); i++){
582  size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
583  deg_sentcount[owners[i]]++;
584  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
585  }
586  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
587  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
588  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
589  std::vector<int> deg_recvcnts(comm->getSize(),0);
590  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
591  AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
592 
593  //2. replace GID with local degree
594  for(int i = 0; i < deg_recvbuf.size(); i++){
595  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
596  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
597  }
598  //3. send modified buffer back
599  ArrayRCP<gno_t> ghost_degrees;
600  AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
601  //determine max degree locally and globally
602  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
603  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
604  for(int i = 0; i < ghost_degrees.size(); i++){
605  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
606  ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
607  }
608  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
609 
610  offset_t local_max_degree = 0;
611  offset_t global_max_degree = 0;
612  for(size_t i = 0; i < nVtx; i++){
613  offset_t curr_degree = offsets[i+1] - offsets[i];
614  if(curr_degree > local_max_degree){
615  local_max_degree = curr_degree;
616  }
617  }
618  Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
619  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
620  if(verbose)std::cout<<comm->getRank()<<": creating Kokkos Views\n";
621 
622  Kokkos::View<offset_t*, device_type> dist_degrees("Owned+Ghost degree view",rand.size());
623  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees);
624  //set degree counts for ghosts
625  for(int i = 0; i < adjs.size(); i++){
626  if((size_t)adjs[i] < nVtx) continue;
627  dist_degrees_host(adjs[i])++;
628  }
629  //set degree counts for owned verts
630  for(int i = 0; i < offsets.size()-1; i++){
631  dist_degrees_host(i) = offsets[i+1] - offsets[i];
632  }
633 
634  Kokkos::View<offset_t*, device_type> dist_offsets("Owned+Ghost Offset view", rand.size()+1);
635  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets);
636 
637  //set offsets and total # of adjacencies
638  dist_offsets_host(0) = 0;
639  uint64_t total_adjs = 0;
640  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
641  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
642  total_adjs+= dist_degrees_host(i-1);
643  }
644 
645  Kokkos::View<lno_t*, device_type> dist_adjs("Owned+Ghost adjacency view", total_adjs);
646  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs);
647  //now, use the degree view as a counter
648  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
649  dist_degrees_host(i) = 0;
650  }
651  for(int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
652  if(comm->getSize() > 1){
653  for(size_t i = 0; i < nVtx; i++){
654  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
655  //if the adjacency is a ghost
656  if( (size_t)adjs[j] >= nVtx){
657  //add the symmetric edge to its adjacency list (already accounted for by offsets)
658  dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
659  dist_degrees_host(adjs[j])++;
660  }
661  }
662  }
663  }
664 
665  if(verbose) std::cout<<comm->getRank()<<": copying host mirrors to device views\n";
666  //copy the host data to device memory
667  Kokkos::deep_copy(dist_degrees, dist_degrees_host); //may be unnecessary
668  Kokkos::deep_copy(dist_offsets, dist_offsets_host);
669  Kokkos::deep_copy(dist_adjs, dist_adjs_host);
670  if(verbose) std::cout<<comm->getRank()<<": done copying to device\n";
671 
672  //counter in UVM memory for how many vertices need recoloring.
673  Kokkos::View<gno_t*, device_type> recoloringSize("Recoloring Queue Size",1);
674  typename Kokkos::View<gno_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
675  recoloringSize_host(0) = 0;
676  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
677 
678  //device copy of the random tie-breakers
679  Kokkos::View<int*,device_type> rand_dev("randVec",rand.size());
680  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
681  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
682  rand_host(i) = rand[i];
683  }
684 
685  //device copy of global IDs
686  Kokkos::View<gno_t*, device_type> gid_dev("GIDs",gids.size());
687  typename Kokkos::View<gno_t*,device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
688  for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
689  gid_host(i) = gids[i];
690  }
691 
692  //copy host views to device memory
693  Kokkos::deep_copy(rand_dev,rand_host);
694  Kokkos::deep_copy(gid_dev, gid_host);
695 
696  if(verbose)std::cout<<comm->getRank()<<": done creating recoloring datastructures\n";
697  //count boundary size to allocate list of vertices to recolor.
698  offset_t boundary_size = 0;
699  for(size_t i = 0; i < nVtx; i++){
700  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
701  if((size_t)adjs[j] >= nVtx) {
702  boundary_size++;
703  break;
704  }
705  }
706  }
707  if(verbose)std::cout<<comm->getRank()<<": creating send views\n";
708 
709  //list of vertices to send to remote processes
710  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send",boundary_size);
711  Kokkos::parallel_for("init verts_to_send_view",
712  Kokkos::RangePolicy<execution_space, int>(0,boundary_size),
713  KOKKOS_LAMBDA(const int& i){
714  verts_to_send_view(i) = -1;
715  });
716 
717  //size information for the list of vertices to send. Also includes an atomic copy
718  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
719  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
720  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
721  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
722  //initialize the device view with a value of zero
723  verts_to_send_size_host(0) = 0;
724  deep_copy(verts_to_send_size, verts_to_send_size_host);
725 
726  if(verbose)std::cout<<comm->getRank()<<": Done creating send views, initializing...\n";
727  if(verbose)std::cout<<comm->getRank()<<": boundary_size = "<<boundary_size<<" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<"\n";
728  //initially the verts to send include all boundary vertices.
729  Kokkos::parallel_for("Initialize verts_to_send",
730  Kokkos::RangePolicy<execution_space, int>(0,nVtx),
731  KOKKOS_LAMBDA(const int&i){
732  for(offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
733  if((size_t)dist_adjs(j) >= nVtx){
734  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
735  break;
736  }
737  }
738  });
739  Kokkos::fence();
740 
741 
742  //used to replace the incorrectly recolored ghosts
743  Kokkos::View<int*, device_type> ghost_colors("ghost color backups", rand.size()-nVtx);
744  if(verbose)std::cout<<comm->getRank()<<": Done initializing\n";
745  gno_t sentPerRound[numStatisticRecordingRounds];
746  gno_t recvPerRound[numStatisticRecordingRounds];
747 
748  if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
749  //initialize interior and total timers, barrier to prevent any imbalance from setup.
750  //Only use a barrier if timing is happening.
751  if(timing) comm->barrier();
752  interior_time = timer();
753  total_time = timer();
754  //call the KokkosKernels coloring function with the Tpetra default spaces.
755  bool use_vbbit = (global_max_degree < 6000);
756  this->colorInterior<execution_space,memory_space>
757  (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
758  if(timing){
759  interior_time = timer() - interior_time;
760  comp_time = interior_time;
761  }
762  if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
763  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
764 
765  //if there is more than a single process, check distributed conflicts and recolor
766  if(comm->getSize() > 1){
767 
768  if(verbose)std::cout<<comm->getRank()<<": going to communicate\n";
769 
770  //communicate
771  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
772  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
773  gno_t recv, sent;
774  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
775  nVtx,
776  verts_to_send_host,
777  verts_to_send_size_host,
778  femv,
779  procs_to_send,
780  recv,
781  sent);
782  sentPerRound[0] = sent;
783  recvPerRound[0] = recv;
784  if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
785  verts_to_send_size_host(0) = 0;
786  deep_copy(verts_to_send_size, verts_to_send_size_host);
787  //set the old ghost colors
788  //get the colors from the femv
789  Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
790  femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite); // Partial write
791  Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
792  Kokkos::parallel_for("get colors from femv",
793  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
794  KOKKOS_LAMBDA(const int& i){
795  ghost_colors(i) = femv_colors(i+nVtx);
796  });
797  Kokkos::fence();
798  //detect conflicts on the device, uncolor conflicts consistently.
799  double temp = timer();
800  detectConflicts<execution_space, memory_space>(nVtx,
801  rand.size()-nVtx,
802  dist_offsets,
803  dist_adjs,
804  femv_colors,
805  verts_to_send_view,
806  verts_to_send_size_atomic,
807  recoloringSize,
808  rand_dev,
809  gid_dev,
810  ghost_degrees_dev,
811  recolor_degrees);
812  deep_copy(recoloringSize_host, recoloringSize);
813  conflict_detection += timer() - temp;
814  comp_time += conflict_detection;
815  }
816  //end the initial recoloring
817  if(verbose)std::cout<<comm->getRank()<<": done initial recoloring, begin recoloring loop\n";
818  double totalPerRound[numStatisticRecordingRounds];
819  double commPerRound[numStatisticRecordingRounds];
820  double compPerRound[numStatisticRecordingRounds];
821  double recoloringPerRound[numStatisticRecordingRounds];
822  double conflictDetectionPerRound[numStatisticRecordingRounds];
823  double serialRecoloringPerRound[numStatisticRecordingRounds];
824  int vertsPerRound[numStatisticRecordingRounds];
825  bool done = false; //We're only done when all processors are done
826  if(comm->getSize() == 1) done = true;
827  totalPerRound[0] = interior_time + comm_time + conflict_detection;
828  recoloringPerRound[0] = 0;
829  commPerRound[0] = comm_time;
830  compPerRound[0] = interior_time + conflict_detection;
831  conflictDetectionPerRound[0] = conflict_detection;
832  recoloringPerRound[0] = 0;
833  vertsPerRound[0] = 0;
834  int distributedRounds = 1; //this is the same across all processors
835  int serial_threshold = this->pl->template get<int>("serial_threshold",0);
836 
837  Kokkos::View<lno_t*, device_type> verts_to_recolor("verts_to_recolor", boundary_size);
838  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
839  //while the queue is not empty
840  while(recoloringSize_host(0) > 0 || !done){
841  if(recoloringSize_host(0) < serial_threshold) break;
842  //get a subview of the colors:
843  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
844  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
845  //color everything in the recoloring queue, put everything on conflict queue
846  if(distributedRounds < numStatisticRecordingRounds) {
847  vertsPerRound[distributedRounds] = recoloringSize_host(0);
848  }
849 
850  //copying the send view to the recolor view is necessary because
851  //KokkosKernels can change the view passed in, and we need the send view
852  //intact for communication.
853  Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
854 
855  double recolor_temp = timer();
856  //use KokkosKernels to recolor the conflicting vertices.
857  deep_copy(verts_to_send_size_host, verts_to_send_size);
858  if(verts_to_send_size_host(0) > 0){
859  this->colorInterior<execution_space,
860  memory_space>(femv_colors.size(),
861  dist_adjs,dist_offsets,
862  femv,
863  verts_to_recolor,
864  verts_to_send_size_host(0),
865  use_vbbit);
866  }
867 
868  if(distributedRounds < numStatisticRecordingRounds){
869  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
870  recoloring_time += recoloringPerRound[distributedRounds];
871  comp_time += recoloringPerRound[distributedRounds];
872  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
873  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
874  } else if(timing) {
875  double recolor_round_time = timer() - recolor_temp;
876  recoloring_time += recolor_round_time;
877  comp_time += recolor_round_time;
878  }
879 
880  //reset the recoloringSize device host and device views
881  //to zero
882  recoloringSize_host(0) = 0;
883  Kokkos::deep_copy(recoloringSize,recoloringSize_host);
884 
885  Kokkos::parallel_for("set femv colors",
886  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
887  KOKKOS_LAMBDA(const int& i){
888  femv_colors(i+nVtx) = ghost_colors(i);
889  });
890  Kokkos::fence();
891  //communicate
892  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
893  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
894  gno_t sent,recv;
895  // Reset device views
896  femvColors = decltype(femvColors)();
897  femv_colors = decltype(femv_colors)();
898 
899  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
900  nVtx,
901  verts_to_send_host,
902  verts_to_send_size_host,
903  femv,
904  procs_to_send,
905  recv,
906  sent);
907  comm_time += curr_comm_time;
908  if(distributedRounds < numStatisticRecordingRounds){
909  commPerRound[distributedRounds] = curr_comm_time;
910  sentPerRound[distributedRounds] = sent;
911  recvPerRound[distributedRounds] = recv;
912  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
913  }
914  //detect conflicts in parallel. For a detected conflict,
915  //reset the vertex-to-be-recolored's color to 0, in order to
916  //allow KokkosKernels to recolor correctly.
917 
918  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
919  femv_colors = subview(femvColors, Kokkos::ALL, 0);
920  Kokkos::parallel_for("get femv colors 2",
921  Kokkos::RangePolicy<execution_space, int>(0,rand.size()-nVtx),
922  KOKKOS_LAMBDA(const int& i){
923  ghost_colors(i) = femv_colors(i+nVtx);
924  });
925  Kokkos::fence();
926  verts_to_send_size_host(0) = 0;
927  deep_copy(verts_to_send_size, verts_to_send_size_host);
928  double detection_temp = timer();
929  detectConflicts<execution_space, memory_space>(nVtx,
930  rand.size()-nVtx,
931  dist_offsets,
932  dist_adjs,
933  femv_colors,
934  verts_to_send_view,
935  verts_to_send_size_atomic,
936  recoloringSize,
937  rand_dev,
938  gid_dev,
939  ghost_degrees_dev,
940  recolor_degrees);
941 
942  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
943 
944  if(distributedRounds < numStatisticRecordingRounds){
945  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
946  conflict_detection += conflictDetectionPerRound[distributedRounds];
947  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
948  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
949  comp_time += conflictDetectionPerRound[distributedRounds];
950  } else if(timing){
951  double conflict_detection_round_time = timer()- detection_temp;
952  conflict_detection += conflict_detection_round_time;
953  comp_time += conflict_detection_round_time;
954  }
955  //do a reduction to determine if we're done
956  int globalDone = 0;
957  int localDone = recoloringSize_host(0);
958  Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
959  //We're only allowed to stop once everyone has no work to do.
960  //collectives will hang if one process exits.
961  distributedRounds++;
962  done = !globalDone;
963  }
964 
965  if(recoloringSize_host(0) > 0 || !done){
966  ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
967  deep_copy(ghost_colors_host, ghost_colors);
968  deep_copy(verts_to_send_host, verts_to_send_view);
969  deep_copy(verts_to_send_size_host, verts_to_send_size);
970  }
971 
972 
973  //finish the local coloring in serial on the host
974  while(recoloringSize_host(0) > 0 || !done){
975  //Use non-templated call to get the Host view
976  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
977  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
978  /*Kokkos::View<int*, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>>femv_colors_host = create_mirror(femv_colors);
979  Kokkos::deep_copy(femv_colors_host, femv_colors);*/
980  if(distributedRounds < 100){
981  vertsPerRound[distributedRounds] = recoloringSize_host(0);
982  }
983 
984  double recolor_temp = timer();
985  //use KokkosKernels to recolor the conflicting vertices
986  if(verts_to_send_size_host(0) > 0){
987  this->colorInterior<host_exec,
988  host_mem>
989  (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0), true);
990  }
991 
992  if(distributedRounds < numStatisticRecordingRounds){
993  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
994  recoloring_time += recoloringPerRound[distributedRounds];
995  comp_time += recoloringPerRound[distributedRounds];
996  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
997  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
998  } else if(timing){
999  double recolor_serial_round_time = timer() - recolor_temp;
1000  recoloring_time += recolor_serial_round_time;
1001  comp_time += recolor_serial_round_time;
1002  }
1003 
1004  recoloringSize_host(0) = 0;
1005 
1006  for(size_t i = 0; i < rand.size() -nVtx; i++){
1007  femv_colors(i+nVtx) = ghost_colors_host(i);
1008  }
1009 
1010  gno_t sent,recv;
1011  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
1012  nVtx,
1013  verts_to_send_host,
1014  verts_to_send_size_host,
1015  femv,
1016  procs_to_send,
1017  recv,
1018  sent);
1019  comm_time += curr_comm_time;
1020 
1021  if(distributedRounds < numStatisticRecordingRounds){
1022  commPerRound[distributedRounds] = curr_comm_time;
1023  sentPerRound[distributedRounds] = sent;
1024  recvPerRound[distributedRounds] = recv;
1025  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1026  }
1027  for(size_t i = 0; i < rand.size()-nVtx; i++){
1028  ghost_colors_host(i) = femv_colors(i+nVtx);
1029  }
1030 
1031  verts_to_send_size_host(0) = 0;
1032  double detection_temp = timer();
1033  detectConflicts<host_exec, host_mem>(nVtx,
1034  rand.size()-nVtx,
1035  dist_offsets_host,
1036  dist_adjs_host,
1037  femv_colors,
1038  verts_to_send_host,
1039  verts_to_send_size_host,
1040  recoloringSize_host,
1041  rand_host,
1042  gid_host,
1043  ghost_degrees_host,
1044  recolor_degrees);
1045  if(distributedRounds < numStatisticRecordingRounds){
1046  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1047  conflict_detection += conflictDetectionPerRound[distributedRounds];
1048  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1049  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1050  comp_time += conflictDetectionPerRound[distributedRounds];
1051  } else if(timing){
1052  double conflict_detection_serial_round_time = timer() - detection_temp;
1053  conflict_detection += conflict_detection_serial_round_time;
1054  comp_time += conflict_detection_serial_round_time;
1055  }
1056  //do a reduction to determine if we're done
1057  int globalDone = 0;
1058  int localDone = recoloringSize_host(0);
1059  Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1060  distributedRounds++;
1061  done = !globalDone;
1062  }
1063  total_time = timer() - total_time;
1064 
1065  //only take time to compute statistics if the user wants them
1066  if(verbose){
1067  std::cout<<comm->getRank()<<": done recoloring loop, computing statistics\n";
1068  int localBoundaryVertices = 0;
1069  for(size_t i = 0; i < nVtx; i++){
1070  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1071  if((size_t)adjs[j] >= nVtx){
1072  localBoundaryVertices++;
1073  break;
1074  }
1075  }
1076  }
1077  //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
1078  //if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1079  int totalBoundarySize = 0;
1080  int totalVertsPerRound[numStatisticRecordingRounds];
1081  double finalTotalPerRound[numStatisticRecordingRounds];
1082  double maxRecoloringPerRound[numStatisticRecordingRounds];
1083  double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1084  double minRecoloringPerRound[numStatisticRecordingRounds];
1085  double finalCommPerRound[numStatisticRecordingRounds];
1086  double finalCompPerRound[numStatisticRecordingRounds];
1087  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1088  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1089  gno_t finalSentPerRound[numStatisticRecordingRounds];
1090  for(int i = 0; i < numStatisticRecordingRounds; i++) {
1091  totalVertsPerRound[i] = 0;
1092  finalTotalPerRound[i] = 0.0;
1093  maxRecoloringPerRound[i] = 0.0;
1094  minRecoloringPerRound[i] = 0.0;
1095  finalCommPerRound[i] = 0.0;
1096  finalCompPerRound[i] = 0.0;
1097  finalConflictDetectionPerRound[i] = 0.0;
1098  finalSentPerRound[i] = 0;
1099  finalRecvPerRound[i] = 0;
1100  }
1101  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1102  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1103  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1104  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1105  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1106  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1107  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1108  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1109  Teuchos::reduceAll<int,double>(*comm,
1110  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1111  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1112  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1113 
1114  std::cout << "Rank " << comm->getRank()
1115  << ": boundary size: " << localBoundaryVertices << std::endl;
1116  if(comm->getRank()==0)
1117  std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1118  for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1119  std::cout << "Rank " << comm->getRank()
1120  << ": recolor " << vertsPerRound[i]
1121  << " vertices in round " << i << std::endl;
1122  if(comm->getRank()==0) {
1123  std::cout << "recolored " << totalVertsPerRound[i]
1124  << " vertices in round " << i << std::endl;
1125  std::cout << "total time in round " << i
1126  << ": " << finalTotalPerRound[i] << std::endl;;
1127  std::cout << "recoloring time in round " << i
1128  << ": " << maxRecoloringPerRound[i] << std::endl;
1129  std::cout << "serial recoloring time in round " << i
1130  << ": " << finalSerialRecoloringPerRound[i] << std::endl;
1131  std::cout << "min recoloring time in round " << i
1132  << ": " << minRecoloringPerRound[i] << std::endl;
1133  std::cout << "conflict detection time in round " << i
1134  << ": " << finalConflictDetectionPerRound[i] << std::endl;
1135  std::cout << "comm time in round " << i
1136  << ": " << finalCommPerRound[i] << std::endl;
1137  std::cout << "total sent in round " << i
1138  << ": " << finalSentPerRound[i] << std::endl;
1139  std::cout << "total recv in round " << i
1140  << ": " << finalRecvPerRound[i] << std::endl;
1141  std::cout << "comp time in round " << i
1142  << ": " << finalCompPerRound[i] << std::endl;
1143  }
1144  }
1145  } else if(timing){
1146  double global_total_time = 0.0;
1147  double global_recoloring_time=0.0;
1148  double global_min_recoloring_time=0.0;
1149  double global_conflict_detection=0.0;
1150  double global_comm_time=0.0;
1151  double global_comp_time=0.0;
1152  double global_interior_time = 0.0;
1153  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1154  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1155  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1156  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1157  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1158  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1159  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1160  comm->barrier();
1161  fflush(stdout);
1162  if(comm->getRank()==0){
1163  std::cout << "Total Time: " << global_total_time << std::endl;
1164  std::cout << "Interior Time: " << global_interior_time << std::endl;
1165  std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1166  std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1167  std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1168  std::cout << "Comm Time: " << global_comm_time << std::endl;
1169  std::cout << "Comp Time: " << global_comp_time << std::endl;
1170  }
1171  }
1172  if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
1173  }
1174 };
1175 
1176 template <typename Adapter>
1177 void AlgDistance1<Adapter>::buildModel(modelFlag_t &flags){
1178  flags.set(REMOVE_SELF_EDGES);
1179 
1180  this->env->debug(DETAILED_STATUS, " building graph model");
1181  if(verbose) std::cout<<comm->getRank()<<": starting to construct graph model\n";
1182  this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1183  this->comm, flags));
1184  if(verbose) std::cout<<comm->getRank()<<": done constructing graph model\n";
1185  this->env->debug(DETAILED_STATUS, " graph model built");
1186 }
1187 
1188 
1189 }//end namespace Zoltan2
1190 #endif
typename Adapter::lno_t lno_t
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
typename device_type::memory_space memory_space
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:27
typename Adapter::offset_t offset_t
sub-steps, each method&#39;s entry and exit
typename femv_t::device_type device_type
algorithm requires no self edges
typename femv_t::host_view_type::device_type::memory_space host_mem
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
typename Adapter::gno_t gno_t
Tpetra::Map< lno_t, gno_t > map_t
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:26
typename device_type::execution_space execution_space
typename Adapter::base_adapter_t base_adapter_t
typename femv_t::host_view_type::device_type::execution_space host_exec
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS&#39;s idx_t...
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> ghost_degrees, bool recolor_degrees)
Defines the ColoringSolution class.
Tpetra::global_size_t global_size_t
typename Adapter::scalar_t scalar_t
Defines the GraphModel interface.
A gathering of useful namespace methods.
AlltoAll communication methods.
The class containing coloring solution.