Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_AlgHybrid2GL.hpp
Go to the documentation of this file.
1 #ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
2 #define _ZOLTAN2_2GHOSTLAYER_HPP_
3 
4 #include <vector>
5 #include <unordered_map>
6 #include <iostream>
7 #include <queue>
8 #ifdef _WIN32
9 #include <time.h>
10 #else
11 #include <sys/time.h>
12 #endif
13 
14 #include "Zoltan2_Algorithm.hpp"
15 #include "Zoltan2_GraphModel.hpp"
17 #include "Zoltan2_Util.hpp"
18 #include "Zoltan2_TPLTraits.hpp"
19 #include "Zoltan2_AlltoAll.hpp"
20 
21 
22 #include "Tpetra_Core.hpp"
23 #include "Teuchos_RCP.hpp"
24 #include "Tpetra_Import.hpp"
25 #include "Tpetra_FEMultiVector.hpp"
26 
27 #include "KokkosKernels_Handle.hpp"
28 #include "KokkosKernels_IOUtils.hpp"
29 #include "KokkosGraph_Distance1Color.hpp"
30 #include "KokkosGraph_Distance1ColorHandle.hpp"
31 
35 // for methods that use two layers of ghosts.
36 
37 
38 namespace Zoltan2{
39 
40 template <typename Adapter>
41 class AlgTwoGhostLayer : public Algorithm<Adapter> {
42 
43  public:
44 
45  using lno_t = typename Adapter::lno_t;
46  using gno_t = typename Adapter::gno_t;
47  using offset_t = typename Adapter::offset_t;
48  using scalar_t = typename Adapter::scalar_t;
50  using map_t = Tpetra::Map<lno_t,gno_t>;
51  using femv_scalar_t = int;
52  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53  using device_type = typename femv_t::device_type;
54  using execution_space = typename device_type::execution_space;
55  using memory_space = typename device_type::memory_space;
56  using host_exec = typename femv_t::host_view_type::device_type::execution_space;
57  using host_mem = typename femv_t::host_view_type::device_type::memory_space;
58 
59  double timer(){
60  struct timeval tp;
61  gettimeofday(&tp, NULL);
62  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
63  }
64  private:
65 
66  //Entry point for parallel local coloring
67  // nVtx is the number of vertices owned by the current process
68  //
69  // adjs_view is the adjacencies, indexed by offset_view
70  //
71  // offset_view is the CSR row map, used to index the adjs_view
72  //
73  // femv is the FEMultiVector that holds the colors for the vertices
74  // the colors will change in this function.
75  //
76  // vertex_list is a list of vertices to recolor
77  //
78  // vertex_list_size is the size of the list of vertices to recolor
79  // vertex_list_size = 0 means recolor all uncolored vertices
80  //
81  // recolor decides which KokkosKernels algorithm to use.
82  //
83  virtual void colorInterior(const size_t nVtx,
84  Kokkos::View<lno_t*, device_type > adjs_view,
85  Kokkos::View<offset_t*,device_type > offset_view,
86  Teuchos::RCP<femv_t> femv,
87  Kokkos::View<lno_t*, device_type> vertex_list,
88  size_t vertex_list_size = 0,
89  bool recolor=false) = 0;
90 
91  //Entry point for serial local coloring
92  virtual void colorInterior_serial(const size_t nVtx,
93  typename Kokkos::View<lno_t*, device_type >::HostMirror adjs_view,
94  typename Kokkos::View<offset_t*,device_type >::HostMirror offset_view,
95  Teuchos::RCP<femv_t> femv,
96  typename Kokkos::View<lno_t*, device_type>::HostMirror vertex_list,
97  size_t vertex_list_size = 0,
98  bool recolor=false) = 0;
99  public:
100  //Entry point for parallel conflict detection
101  //INPUT ARGS
102  // n_local is the number of vertices owned by the current process
103  //
104  // dist_offsets_dev is the device view that holds the CSR offsets
105  // It holds offsets for all vertices, owned and ghosts.
106  // It is organized with owned first, and then ghost layers:
107  // -----------------------------------------------
108  // | owned | first | second |
109  // | vtx | ghost | ghost |
110  // | offsets | layer | layer |
111  // -----------------------------------------------
112  // The allocated length is equal to the sum of owned
113  // and ghost vertices + 1. Any vertex with LID >= n_local
114  // is a ghost.
115  //
116  // dist_adjs_dev is the device view that holds CSR adjacencies
117  //
118  // boundary_verts_view holds the local IDs of vertices in the boundary.
119  // By definition, boundary vertices are owned vertices
120  // that have a ghost in their neighborhood.
121  // (distance-1 coloring = 1-hop neighborhood)
122  // (distance-2 coloring = 2-hop neighborhood)
123  // Note: for D1-2GL we do not use this argument.
124  // It is possible to detect all D1 conflicts by only
125  // checking the ghost vertices, without constructing
126  // the boundary.
127  //
128  // rand is a device view that holds random numbers generated from GIDs,
129  // they are consistently generated across processes
130  //
131  // gid is a device view that holds the GIDs of each vertex on this process.
132  // It is indexable by local ID. It stores both owned and ghost
133  // vertices, and LIDs are ordered so that the structure looks like:
134  // -----------------------------------------
135  // | | first | second | owned LIDs are smallest
136  // | owned | ghost | ghost | ghost LIDs increase based
137  // | vtx | layer | layer | on layer (1 < 2)
138  // -----------------------------------------
139  // The allocated size is dist_offsets_dev.extent(0)-1.
140  //
141  //
142  // ghost_degrees is a device view that holds the degrees of ghost vertices only.
143  // A ghost with local ID n_local will be the first entry in this view.
144  // So, ghost_degrees[i] is the degree of the vtx with
145  // GID = gid[n_local+i].
146  //
147  // recolor_degrees is a boolean that determines whether or not we factor in vertex
148  // degrees on recoloring
149  //
150  //OUTPUT ARGS
151  // femv_colors is the device color view.
152  // After this function call, conflicts between vertices will
153  // be resolved via setting a vertex's color to 0. The vertex
154  // to be uncolored is determined by rules that are consistent
155  // across multiple processes.
156  //
157  // verts_to_recolor_view is a device view that holds the list
158  // of vertices to recolor. Any vertex
159  // uncolored by this function will appear in this
160  // view after the function returns.
161  //
162  // verts_to_recolor_size_atomic is an atomic device view that holds the
163  // size of verts_to_recolor_view.
164  // Effectively it represents how many
165  // vertices need to be recolored after
166  // conflict detection.
167  // It differs in size from the allocated size of
168  // verts_to_recolor_view.
169  // Atomicity is required for building
170  // verts_to_recolor_view in parallel, which
171  // is the reason this is a view instead of
172  // just an integer type.
173  //
174  // verts_to_send_view is a device view that holds the list of vertices
175  // that will need to be sent to remotes after recoloring
176  // Note: Only locally owned vertices should ever be in
177  // this view. A process cannot color ghosts correctly.
178  //
179  // verts_to_send_size_atomic is an atomic device view that holds the size of
180  // verts_to_send_view. It differs in size from the
181  // allocated size of verts_to_send_view.
182  // Atomicity is required for building
183  // verts_to_send_view in parallel,
184  // which is the reason this is a view instead of
185  // just an integer type.
186  //
187  // recoloringSize is a device view that stores the total number of
188  // vertices that were uncolored by the conflict detection.
189  //
190  virtual void detectConflicts(const size_t n_local,
191  Kokkos::View<offset_t*, device_type > dist_offsets_dev,
192  Kokkos::View<lno_t*, device_type > dist_adjs_dev,
193  Kokkos::View<int*,device_type > femv_colors,
194  Kokkos::View<lno_t*, device_type > boundary_verts_view,
195  Kokkos::View<lno_t*,
196  device_type > verts_to_recolor_view,
197  Kokkos::View<int*,
198  device_type,
199  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
200  Kokkos::View<lno_t*,
201  device_type > verts_to_send_view,
202  Kokkos::View<size_t*,
203  device_type,
204  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
205  Kokkos::View<size_t*, device_type> recoloringSize,
206  Kokkos::View<int*,
207  device_type> rand,
208  Kokkos::View<gno_t*,
209  device_type> gid,
210  Kokkos::View<gno_t*,
211  device_type> ghost_degrees,
212  bool recolor_degrees) = 0;
213 
214  //Entry point for serial conflict detection
215  virtual void detectConflicts_serial(const size_t n_local,
216  typename Kokkos::View<offset_t*, device_type >::HostMirror dist_offsets_host,
217  typename Kokkos::View<lno_t*, device_type >::HostMirror dist_adjs_host,
218  typename Kokkos::View<int*,device_type >::HostMirror femv_colors,
219  typename Kokkos::View<lno_t*, device_type >::HostMirror boundary_verts_view,
220  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_recolor_view,
221  typename Kokkos::View<int*,device_type>::HostMirror verts_to_recolor_size_atomic,
222  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send_view,
223  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_atomic,
224  typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize,
225  typename Kokkos::View<int*, device_type>::HostMirror rand,
226  typename Kokkos::View<gno_t*,device_type>::HostMirror gid,
227  typename Kokkos::View<gno_t*,device_type>::HostMirror ghost_degrees,
228  bool recolor_degrees) = 0;
229  //Entry point for the construction of the boundary
230  //INPUT ARGS
231  // n_local is the number of vertices owned by the current process
232  //
233  // dist_offsets_dev is the device view that holds the CSR offsets
234  //
235  // dist_adjs_dev is the device view that holds CSR adjacencies
236  //
237  // dist_offsets_host is the hostmirror that holds the CSR offsets
238  //
239  // dist_adjs_host is the hostmirror that holds the CSR adjacencies
240  //
241  //OUTPUT ARGS
242  // boundary_verts is an unallocated device view that will hold the
243  // list of boundary vertices.
244  //
245  // verts_to_send_view will hold the list of vertices to send
246  //
247  // verts_to_send_size_atomic will hold the number of vertices to
248  // send currently held in verts_to_send_view.
249  // verts_to_send_size_atomic differs
250  // from the allocated size of verts_to_send_view
251  // Atomicity is required for building
252  // verts_to_send_view in parallel, which is
253  // the reason this is a view instead of an
254  // integer type.
255  //
256  virtual void constructBoundary(const size_t n_local,
257  Kokkos::View<offset_t*, device_type> dist_offsets_dev,
258  Kokkos::View<lno_t*, device_type> dist_adjs_dev,
259  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host,
260  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host,
261  Kokkos::View<lno_t*, device_type>& boundary_verts,
262  Kokkos::View<lno_t*,
263  device_type > verts_to_send_view,
264  Kokkos::View<size_t*,
265  device_type,
266  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
267 
268  protected:
269  RCP<const base_adapter_t> adapter;
270  RCP<Teuchos::ParameterList> pl;
271  RCP<Environment> env;
272  RCP<const Teuchos::Comm<int> > comm;
273  bool verbose;
274  bool timing;
275 
276  private:
277  //This function constructs a CSR with complete adjacency information for
278  //first-layer ghost vertices. This CSR can contain edges from:
279  // first-layer ghosts to owned;
280  // first-layer ghosts to first-layer ghosts;
281  // first-layer ghosts to second-layer ghosts;
282  //1) Each process sends a list of ghost GIDs back to their owning process
283  // to request full adjacency information for that vertex.
284  //
285  //2) Initially each process sends back degree information to the requesting
286  // process.
287  //
288  //3) Then, each process can construct send buffers and receive schedules for
289  // the adjacency information for each requested GID it received,
290  // in addition to building the 2GL offsets and relabeling ghost LIDs
291  // to make the final CSR construction easier.
292  //
293  // 3a) We can construct the 2GL offsets here because each process has
294  // already received the degree information for each first-layer
295  // ghost vertex.
296  //
297  // 3b) The original ghost LIDs may not correspond to the order in which
298  // we will receive the adjacency information. Instead of reordering
299  // the received adjacencies, we relabel the GIDs of first-layer
300  // ghosts with new LIDs that correspond to the order in which we
301  // receive the adjacency information. Only first-layer ghost LIDs
302  // are changed.
303  //
304  //4) Due to limitations on the size of an MPI message, we split sending and
305  // receiving into rounds to accommodate arbitrarily large graphs.
306  //
307  //5) As send rounds progress, we construct the 2GL adjacency structure.
308  // Because we relabel ghost LIDs, we do not need to reorder the
309  // data after we receive it.
310  //
311  //
312  //
313  //OUTPUT ARGS
314  // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
315  // vertices. The only hard constraint on the ordering is
316  // that owned vertices come before ghost vertices.
317  // This function can re-assign the LIDs of ghost vertices
318  // in order to make the final construction of the 2GL
319  // CSR easier. After this function call, some ghost
320  // LIDs may be changed, but they will still be greater
321  // than owned LIDs. No owned LIDs will be changed.
322  //
323  // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
324  // contains complete adjacency information for the first-layer
325  // ghosts. These adjacencies can hold both owned vertices and
326  // second-layer ghosts, as well as first-layer ghosts.
327  //
328  // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
329  // That is, a vertex with GID = gid[n_local] will be the first
330  // vertex with information in this offset structure.
331  //
332  //
333  //INPUT ARGS
334  // owners holds the owning proc for a given vertex, indexed by local ID.
335  //
336  // adjs is the CSR adjacencies of the local graph with a single ghost layer.
337  //
338  // offsets is the CSR offsets of the local graph with a single ghost layer.
339  //
340  // mapOwned translates from Owned GID to LID. We only need this translation
341  // for owned vertices.
342  //
343  //TODO: This function uses many vectors of size comm->getSize();
344  // consider changes to increase memory scalability.
345  void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
346  const std::vector<int>& owners,
347  ArrayView<const gno_t> adjs,
348  ArrayView<const offset_t> offsets,
349  RCP<const map_t> mapOwned,
350  std::vector< gno_t>& adjs_2GL,
351  std::vector< offset_t>& offsets_2GL) {
352 
353  //first, we send ghost GIDs back to owners to receive the
354  //degrees of each first-layer ghost.
355  std::vector<int> sendcounts(comm->getSize(),0);
356  std::vector<size_t> sdispls(comm->getSize()+1,0);
357  //loop through owners, count how many vertices we'll send to each processor
358  //We send each ghost GID back to its owning process.
359  if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
360  for(size_t i = 0; i < owners.size(); i++){
361  if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
362  }
363  //construct sdispls (for building sendbuf), and sum the total sendcount
364  if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
365  size_t sendcount = 0;
366  for(int i = 1; i < comm->getSize()+1; i++){
367  sdispls[i] = sdispls[i-1] + sendcounts[i-1];
368  sendcount += sendcounts[i-1];
369  }
370 
371  if(verbose) std::cout<<comm->getRank()<<": building idx\n";
372  std::vector<gno_t> idx(comm->getSize(),0);
373  for(int i = 0; i < comm->getSize(); i++){
374  idx[i]=sdispls[i];
375  }
376  //construct sendbuf to send GIDs to owning processes
377  if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";
378 
379  std::vector<gno_t> sendbuf(sendcount,0);
380  for(size_t i = offsets.size()-1; i < owners.size(); i++){
381  if(owners[i] != comm->getRank() && owners[i] != -1){
382  sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
383  }
384  }
385 
386  //communicate GIDs to owners
387  if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
388  Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
389  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
390  Teuchos::ArrayRCP<gno_t> recvbuf;
391  std::vector<int> recvcounts(comm->getSize(),0);
392  Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
393  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
394 
395  if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
396  //replace entries in recvGIDs with their degrees
397 
398  if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
399  gno_t recvcounttotal = 0;
400  std::vector<int> rdispls(comm->getSize()+1,0);
401  for(size_t i = 1; i<recvcounts.size()+1; i++){
402  rdispls[i] = rdispls[i-1] + recvcounts[i-1];
403  recvcounttotal += recvcounts[i-1];
404  }
405  //send back the degrees to the requesting processes,
406  //build the adjacency counts
407  std::vector<offset_t> sendDegrees(recvcounttotal,0);
408  gno_t adj_len = 0;
409  std::vector<int> adjsendcounts(comm->getSize(),0);
410  if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
411  for(int i = 0; i < comm->getSize(); i++){
412  adjsendcounts[i] = 0;
413  for(int j = rdispls[i]; j < rdispls[i+1]; j++){
414  lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
415  offset_t degree = offsets[lid+1] - offsets[lid];
416  sendDegrees[j] = degree;
417  adj_len +=degree;
418  adjsendcounts[i] += degree;
419  }
420  }
421  //communicate the degrees back to the requesting processes
422  if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
423  Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
424  Teuchos::ArrayRCP<offset_t> recvDegrees;
425  std::vector<int> recvDegreesCount(comm->getSize(),0);
426  Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
427  Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);
428 
429  //calculate number of rounds of AlltoAllv's that are necessary on this process
430 
431  if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
432  int rounds = 1;
433  for(int i = 0; i < comm->getSize(); i++){
434  if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
435  rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
436  }
437  }
438 
439  //see what the max number of rounds will be globally
440  int max_rounds = 0;
441  Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);
442 
443  if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
444  //compute send and receive schedules to and from each process
445  std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
446  std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
447 
448  if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
449  //fill out the send schedules
450  for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
451  int curr_round = 0;
452  for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
453  if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
454  curr_round++;
455  }
456  per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
457  per_proc_round_vtx_sums[curr_round][proc_to_send]++;
458  }
459  }
460 
461  if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";
462 
463  //A 3D vector to hold the GIDs so we can see how this process will receive
464  //the vertices in each round, from each process. This way we can reorder things
465  //locally so that the data we receive will automatically construct the CSR correctly
466  //without any other computation. We do the reordering before we start receiving.
467  std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
468  max_rounds+1,std::vector<std::vector<gno_t>>(
469  comm->getSize(),std::vector<gno_t>(0)));
470  for(int i = 0; i < max_rounds; i++){
471  for(int j = 0; j < comm->getSize(); j++){
472  recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
473  }
474  }
475 
476  if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
477  for(int i = 0; i < comm->getSize(); i++){
478  int curr_round = 0;
479  size_t curr_idx = 0;
480  for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
481  if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
482  curr_round++;
483  curr_idx = 0;
484  }
485  recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
486  }
487  }
488 
489  if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
490  //reorder the GIDs and degrees locally:
491  // - The way we previously stored GIDs and degrees had no relation
492  // to the order that we'll be receiving the adjacency data.
493  // For the CSR to be complete (and usable), we reorder the LIDs of
494  // ghosts so that they are in the order we will receive the adjacency
495  // data.
496  //
497  // - For graphs that are not large enough to trigger sending in multiple
498  // rounds of communication, the LIDs of the ghosts will be ordered
499  // by owning process. If multiple rounds are used, the LIDs of
500  // ghosts will be ordered by rounds in addition to owning process.
501  //
502  // - final_gid_vec and final_degree_vec hold the reorganized gids and
503  // degrees, the final re-mapping happens further down
504  std::vector<gno_t> final_gid_vec(sendcount, 0);
505  std::vector<offset_t> final_degree_vec(sendcount,0);
506  gno_t reorder_idx = 0;
507  for(int i = 0; i < max_rounds; i++){
508  for(int j = 0; j < comm->getSize(); j++){
509  for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
510  final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
511  final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
512  }
513  }
514  }
515 
516  //check to see if the reorganization has happened correctly
517  if(verbose){
518  //do a quick check to see if we ended up reorganizing anything
519  bool reorganized = false;
520  for(size_t i = 0; i < sendcount; i++){
521  if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
522  }
523 
524  //if we have more than a single round of communication, we need to reorganize.
525  //this alerts of unnecessary reorganization, and a failure to perform necessary
526  //reorganization.
527  if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorgainze GIDs, but probably should have\n";
528  if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
529  }
530  //remap the GIDs so we receive the adjacencies in the same order as the current processes LIDs
531  // originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
532  // remote processes. Here we make it so that the GID->LID mapping does correspond to the
533  // order we have received degree info and will receive adjacencies. ( LID n_local
534  // corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
535  // second, etc.)
536  // That way, we don't need to reorder the adjacencies after we receive them.
537  //
538  // This next loop is the final mapping step.
539 
540  for (size_t i = 0; i < sendcount; i++){
541  ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
542  }
543 
544  //status report
545  if(verbose) {
546  std::cout<<comm->getRank()<<": done remapping\n";
547  std::cout<<comm->getRank()<<": building ghost offsets\n";
548  }
549  //start building the second ghost layer's offsets
550  std::vector<offset_t> ghost_offsets(sendcount+1,0);
551  for(size_t i = 1; i < sendcount+1; i++){
552  ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
553  }
554 
555 
556  if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
557  //set up counters to keep track of where we are in the sending order
558  std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
559  for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
560  for(int round = 0; round < max_rounds; round++){
561  //send buffers
562  std::vector<gno_t> send_adj;
563  std::vector<int> send_adj_counts(comm->getSize(),0);
564  if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
565  //construct the adjacencies to send for this round
566  for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
567  uint64_t curr_adj_sum = 0;
568  //keep going through adjacencies to send to this process until we're done
569  while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
570  lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);
571 
572  //if the next adjacency would push us over the MPI message size max,
573  //stop for this round
574  if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
575  break;
576  }
577 
578  //add the adjacencies to the send buffer
579  curr_adj_sum += (offsets[lid+1] - offsets[lid]);
580  for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
581  send_adj.push_back(adjs[j]);
582  }
583  }
584  //update the send counts for this round
585  send_adj_counts[curr_proc] = curr_adj_sum;
586  }
587  if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
588  //do the sending...
589  Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
590  Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
591  Teuchos::ArrayRCP<gno_t> ghost_adjs;
592  std::vector<int> adjrecvcounts(comm->getSize(),0);
593  Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
594  Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
595  //Because of the previous reordering, these adjacencies are
596  //in the correct order as they arrive on the process.
597  for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
598  adjs_2GL.push_back(ghost_adjs[i]);
599  }
600  }
601  if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
602  //put the offsets we computed into the output argument.
603  for(size_t i = 0; i < sendcount+1; i++){
604  offsets_2GL.push_back(ghost_offsets[i]);
605  }
606  if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
607  }
608 
609  //Communicates owned vertex colors to remote ghost copies.
610  //
611  //returns: the amount of time the communication call took.
612  //
613  //OUTPUT ARGS:
614  // colors: the owned vertices' colors are not changed,
615  // ghost vertex colors are updated from their owners.
616  //
617  // total_sent: reports the total size of the send buffer
618  //
619  // total_recv: reports the total size of the recv buffer
620  //
621  //INPUT ARGS:
622  //
623  // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
624  //
625  // nVtx: the number of owned vertices on this process
626  //
627  // verts_to_send: hostmirror of the list of vertices to send.
628  // This list will only contain local vertices
629  // that are ghosted on a remote process.
630  //
631  // verts_to_send_size: hostmirror of the size of verts_to_send
632  //
633  // procs_to_send: map that takes a local ID and gives a vector of
634  // process IDs which have a ghost copy of that vertex.
635  //
636  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
637  size_t nVtx,
638  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send,
639  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size,
640  Teuchos::RCP<femv_t> femv,
641  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
642  gno_t& total_sent, gno_t& total_recvd){
643 
644  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
645  auto colors = subview(femvColors, Kokkos::ALL, 0);
646  //create vectors to hold send information
647  int nprocs = comm->getSize();
648  std::vector<int> sendcnts(comm->getSize(), 0);
649  std::vector<gno_t> sdispls(comm->getSize()+1, 0);
650 
651  //calculate how much data we're sending to each process
652  for(size_t i = 0; i < verts_to_send_size(0); i++){
653  for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
654  sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
655  }
656  }
657  //calculate sendsize and sdispls
658  sdispls[0] = 0;
659  gno_t sendsize = 0;
660  std::vector<int> sentcount(nprocs, 0);
661 
662  for(int i = 1; i < comm->getSize()+1; i++){
663  sdispls[i] = sdispls[i-1] + sendcnts[i-1];
664  sendsize += sendcnts[i-1];
665  }
666  total_sent = sendsize;
667  std::vector<gno_t> sendbuf(sendsize,0);
668  //construct sendbuf, send each owned vertex's GID
669  //and its color to the processes that have a
670  //ghost copy of that vertex. If a vertex is not ghosted,
671  //it does not get sent anywhere.
672  for(size_t i = 0; i < verts_to_send_size(0); i++){
673  std::vector<int> procs = procs_to_send.at(verts_to_send(i));
674  for(size_t j = 0; j < procs.size(); j++){
675  size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
676  sentcount[procs[j]] += 2;
677  sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
678  sendbuf[idx] = colors(verts_to_send(i));
679  }
680  }
681 
682  Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
683  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
684  Teuchos::ArrayRCP<gno_t> recvbuf;
685  std::vector<int> recvcnts(comm->getSize(), 0);
686  Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
687 
688  //if we're reporting times, remove the computation imbalance from the comm timer
689  if(timing) comm->barrier();
690  double comm_total = 0.0;
691  double comm_temp = timer();
692 
693  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
694  comm_total += timer() - comm_temp;
695 
696  //compute total recvsize for updating local ghost colors
697  gno_t recvsize = 0;
698  for(int i = 0; i < recvcnts_view.size(); i++){
699  recvsize += recvcnts_view[i];
700  }
701  total_recvd = recvsize;
702  //update the local ghost copies with the color we just received.
703  for(int i = 0; i < recvsize; i+=2){
704  size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
705  colors(lid) = recvbuf[i+1];
706  }
707 
708  return comm_total;
709  }
710 
711  public:
712  //constructor
714  const RCP<const base_adapter_t> &adapter_,
715  const RCP<Teuchos::ParameterList> &pl_,
716  const RCP<Environment> &env_,
717  const RCP<const Teuchos::Comm<int> > &comm_)
718  : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
719  verbose = pl->get<bool>("verbose",false);
720  timing = pl->get<bool>("timing", false);
721  }
722  //Main entry point for graph coloring
723  void color( const RCP<ColoringSolution<Adapter> > &solution){
724  //convert from global graph to local graph
725  ArrayView<const gno_t> vtxIDs;
726  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
727 
728  modelFlag_t flags;
729  flags.set(REMOVE_SELF_EDGES);
730  const auto model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
731  this->comm, flags));
732  size_t nVtx = model->getVertexList(vtxIDs, vwgts);
733  // the weights are not used at this point.
734 
735  //get the adjacencies in a view that holds GIDs
736  ArrayView<const gno_t> adjs;
737  //get the CSR offsets
738  ArrayView<const offset_t> offsets;
739  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
740  model->getEdgeList(adjs, offsets, ewgts);
741 
742  //This map is used to map GID to LID initially
743  std::unordered_map<gno_t,lno_t> globalToLocal;
744  //this vector holds GIDs for owned and ghost vertices
745  //The final order of the gids is:
746  // -----------------------------------
747  // | | first | second |
748  // | owned | layer | layer |
749  // | | ghosts | ghosts |
750  // -----------------------------------
751  //
752  // constructSecondGhostLayer reorders the first layer ghost
753  // GIDs in the particular order that the communication requires.
754 
755  std::vector<gno_t> ownedPlusGhosts;
756 
757  //this vector holds the owning process for
758  //owned vertices and the first ghost layer.
759  //owners[i] gives the processor owning ownedPlusGhosts[i],
760  //for owned vertices and first layer ghosts.
761  //
762  //This should only be used BEFORE the call to constructSecondGhostLayer
763  std::vector<int> owners;
764 
765  //fill these structures using owned vertices
766  for(int i = 0; i < vtxIDs.size(); i++){
767  globalToLocal[vtxIDs[i]] = i;
768  ownedPlusGhosts.push_back(vtxIDs[i]);
769  owners.push_back(comm->getRank());
770  }
771 
772  //count the initial number of ghosts,
773  //map them to a local ID.
774  //The globalToLocal map has first layer ghosts
775  //from here on.
776  int nGhosts = 0;
777  std::vector<lno_t> local_adjs;
778  for(int i = 0; i < adjs.size(); i++){
779  if(globalToLocal.count(adjs[i])==0){
780  ownedPlusGhosts.push_back(adjs[i]);
781  globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
782  nGhosts++;
783  }
784  local_adjs.push_back(globalToLocal[adjs[i]]);
785  }
786 
787  //construct a Tpetra map for the FEMultiVector
788  Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
789  <Tpetra::global_size_t>::invalid();
790  RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
791 
792  //GID and owner lookup for ghost vertices
793  std::vector<gno_t> ghosts;
794  std::vector<int> ghostowners;
795  for(size_t i = nVtx; i < nVtx+nGhosts; i++){
796  ghosts.push_back(ownedPlusGhosts[i]);
797  ghostowners.push_back(-1);
798  }
799 
800  //get the owners of the ghost vertices
801  ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
802  ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
803  mapOwned->getRemoteIndexList(gids, owningProcs);
804 
805  //add ghost owners to the total owner vector
806  for(size_t i = 0; i < ghostowners.size(); i++){
807  owners.push_back(ghostowners[i]);
808  }
809 
810  //construct the second ghost layer.
811  //NOTE: this may reorder the LIDs of the ghosts.
812  //
813  //these vectors hold the CSR representation of the
814  // first ghost layer. This CSR contains:
815  // - edges from first layer ghosts to:
816  // - owned vertices
817  // - first layer ghosts
818  // - second layer ghosts
819  //
820  // - first_layer_ghost_adjs uses GIDs that need
821  // translated to LIDs.
822  //
823  // - first_layer_ghost_offsets are indexed by LID,
824  // first_layer_ghost_offsets[i] corresponds to
825  // the vertex with GID = ownedPlusGhosts[i+nVtx].
826  // first_layer_ghost_offsets.size() = #first layer ghosts + 1
827  //
828  std::vector< gno_t> first_layer_ghost_adjs;
829  std::vector< offset_t> first_layer_ghost_offsets;
830  constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
831  first_layer_ghost_adjs, first_layer_ghost_offsets);
832 
833  //we potentially reordered the local IDs of the ghost vertices, so we need
834  //to re-insert the GIDs into the global to local ID mapping.
835  globalToLocal.clear();
836  for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
837  globalToLocal[ownedPlusGhosts[i]] = i;
838  }
839 
840  //use updated globalToLocal map to translate
841  //adjacency GIDs to LIDs
842  for(int i = 0 ; i < adjs.size(); i++){
843  local_adjs[i] = globalToLocal[adjs[i]];
844  }
845 
846  //at this point, we have ownedPlusGhosts with 1layer ghosts' GIDs.
847  //Need to map the second ghost layer GIDs to new local IDs,
848  //and add them to the map, as well as translating 2layer ghost
849  //adjacencies to use those new LIDs.
850  size_t n2Ghosts = 0;
851 
852  //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
853  std::vector<lno_t> local_ghost_adjs;
854  for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
855  if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
856  ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
857  globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
858  n2Ghosts++;
859  }
860  local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
861  }
862 
863  //construct data structures necessary for FEMultiVector
864  if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
865  dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
866  RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
867  Teuchos::arrayViewFromVector(ownedPlusGhosts),
868  0, comm));
869  if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";
870 
871  using import_t = Tpetra::Import<lno_t, gno_t>;
872  Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
873  mapWithCopies));
874  if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
875  Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
876  importer, 1, true));
877  if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";
878 
879  //Create random numbers seeded on global IDs to resolve conflicts
880  //consistently between processes.
881  std::vector<int> rand(ownedPlusGhosts.size());
882  for(size_t i = 0; i < rand.size(); i++){
883  std::srand(ownedPlusGhosts[i]);
884  rand[i] = std::rand();
885  }
886 
887  // get up-to-date owners for all ghosts.
888  std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
889  std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
890  for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
891  Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
892  Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
893  mapOwned->getRemoteIndexList(ghostGIDs,owners2);
894  if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";
895 
896  //calculations for how many 2GL verts are in the boundary of another process, only
897  //do this if it's going to be displayed.
898  if(verbose) {
899  std::cout<<comm->getRank()<<": calculating 2GL stats...\n";
900 
901  std::vector<int> sendcounts(comm->getSize(),0);
902  std::vector<gno_t> sdispls(comm->getSize()+1,0);
903  //loop through owners, count how many vertices we'll send to each processor
904  for(int i = nGhosts; i < ghostGIDs.size(); i++){
905  if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
906  }
907  //construct sdispls (for building sendbuf), and sum the total sendcount
908  gno_t sendcount = 0;
909  for(int i = 1; i < comm->getSize()+1; i++){
910  sdispls[i] = sdispls[i-1] + sendcounts[i-1];
911  sendcount += sendcounts[i-1];
912  }
913  std::vector<gno_t> idx(comm->getSize(),0);
914  for(int i = 0; i < comm->getSize(); i++){
915  idx[i]=sdispls[i];
916  }
917  //construct sendbuf to send GIDs to owning processes
918  std::vector<gno_t> sendbuf(sendcount,0);
919  for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
920  if(owners2[i] != comm->getRank() && owners2[i] != -1){
921  sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
922  }
923  }
924  //do the communication
925  Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
926  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
927  Teuchos::ArrayRCP<gno_t> recvbuf;
928  std::vector<int> recvcounts(comm->getSize(),0);
929  Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
930  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
931  std::vector<int> is_bndry_send(recvbuf.size(),0);
932 
933  //send back how many received vertices are in the boundary
934  for(int i = 0; i < recvbuf.size(); i++){
935  size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
936  is_bndry_send[i] = 0;
937  if(lid < nVtx){
938  for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
939  if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
940  }
941  } else{
942  for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
943  if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
944  }
945  }
946  }
947 
948  //send back the boundary flags
949  Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
950  Teuchos::ArrayRCP<int> is_bndry_recv;
951  std::vector<int> bndry_recvcounts(comm->getSize(),0);
952  Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
953  Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);
954 
955  //add together the flags to compute how many boundary vertices are in the second ghost layer
956  int boundaryverts = 0;
957  for(int i = 0; i < is_bndry_recv.size(); i++){
958  boundaryverts += is_bndry_recv[i];
959  }
960  //this cout is guarded by a verbose check.
961  std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
962  }
963 
964 
965  //local CSR representation for the owned vertices:
966  Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
967  //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
968  //
969  //first ghost layer CSR representation:
970  Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
971  Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
972  Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
973  Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
974  //these ArrayViews contain LIDs that are ghosted remotely,
975  //and the Process IDs that have those ghost copies.
976  //An LID may appear in the exportLIDs more than once.
977  Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
978  Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
979 
980  //construct a quick lookup datastructure to map from LID to a
981  //list of PIDs we need to send data to.
982  std::unordered_map<lno_t, std::vector<int>> procs_to_send;
983  for(int i = 0; i < exportLIDs.size(); i++){
984  procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
985  }
986 
987  //call the coloring algorithm
988  twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
989  femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);
990 
991  //copy colors to the output array
992  ArrayRCP<int> colors = solution->getColorsRCP();
993  auto femvdata = femv->getData(0);
994  for(size_t i=0; i<nVtx; i++){
995  colors[i] = femvdata[i];
996  }
997 
998  }
999 // private:
1000 
1001  //This is the coloring logic for all 2GL-based methods.
1002  //
1003  //INPUT ARGS:
1004  //
1005  // n_local: the number of local owned vertices
1006  //
1007  // n_total: the number of local owned vertices plus first layer ghosts
1008  //
1009  // adjs: the CSR adjacencies for the graph, only including owned vertices
1010  // and first layer ghosts
1011  //
1012  // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1013  //
1014  // ghost_adjs: the adjacency information for first-layer ghost vertices.
1015  // Includes all neighbors (owned, first-layer ghost,
1016  // second-layer ghost) for each first-layer ghost.
1017  //
1018  //
1019  // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1020  // minus n_local are used to index this
1021  // datastructure. I.E. ghost_offsets(0)
1022  // corresponds to the ghost with LID n_local
1023  //
1024  // gids: a vector that holds GIDs for all vertices on this process
1025  // (includes owned, and all ghosts, indexable by LID)
1026  // The GIDs are ordered like this:
1027  // ----------------------------------
1028  // | | first | second |
1029  // | owned | layer | layer |
1030  // | | ghosts | ghosts |
1031  // ----------------------------------
1032  // ^ ^
1033  // n_local n_total
1034  //
1035  // rand: a vector that holds random numbers generated on GID for tie
1036  // breaking. Indexed by LID.
1037  //
1038  // ghost_owners: contains the process ID for the owner of each remote vertex.
1039  // Indexable by LID. owners[i] = the owning process for vertex
1040  // with GID = gids[i+n_local]
1041  //
1042  // mapOwnedPlusGhosts: map that can translate between GID and LID
1043  //
1044  // procs_to_send: for each vertex that is copied on a remote process,
1045  // this map contains the list of processes that have
1046  // a copy of a given vertex. Input LID, get the list
1047  // of PIDs that have a ghost copy of that LID.
1048  //
1049  //OUTPUT ARGS:
1050  //
1051  // femv: an FEMultiVector that holds a dual view of the colors.
1052  // After this call, femv contains updated colors.
1053  //
1054  void twoGhostLayer(const size_t n_local, const size_t n_total,
1055  const Teuchos::ArrayView<const lno_t>& adjs,
1056  const Teuchos::ArrayView<const offset_t>& offsets,
1057  const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1058  const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1059  const Teuchos::RCP<femv_t>& femv,
1060  const Teuchos::ArrayView<const gno_t>& gids,
1061  const Teuchos::ArrayView<const int>& rand,
1062  const Teuchos::ArrayView<const int>& ghost_owners,
1063  RCP<const map_t> mapOwnedPlusGhosts,
1064  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1065  //initialize timing variables
1066  double total_time = 0.0;
1067  double interior_time = 0.0;
1068  double comm_time = 0.0;
1069  double comp_time = 0.0;
1070  double recoloring_time=0.0;
1071  double conflict_detection = 0.0;
1072 
1073  //Number of rounds we are saving statistics for
1074  //100 is a decent default. Reporting requires --verbose argument.
1075  const int numStatisticRecordingRounds = 100;
1076 
1077  //includes all ghosts, including the second layer.
1078  const size_t n_ghosts = rand.size() - n_local;
1079 
1080 
1081  //Get the degrees of all ghost vertices
1082  //This is necessary for recoloring based on degrees,
1083  //we need ghost vertex degrees for consistency.
1084  std::vector<int> deg_send_cnts(comm->getSize(),0);
1085  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1086  for(int i = 0; i < ghost_owners.size(); i++){
1087  deg_send_cnts[ghost_owners[i]]++;
1088  }
1089  deg_sdispls[0] = 0;
1090  gno_t deg_sendsize = 0;
1091  std::vector<int> deg_sentcount(comm->getSize(),0);
1092  for(int i = 1; i < comm->getSize()+1; i++){
1093  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1094  deg_sendsize += deg_send_cnts[i-1];
1095  }
1096  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1097  for(int i = 0; i < ghost_owners.size(); i++){
1098  size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1099  deg_sentcount[ghost_owners[i]]++;
1100  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1101  }
1102  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1103  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1104  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1105  std::vector<int> deg_recvcnts(comm->getSize(),0);
1106  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1107  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1108 
1109  //replace GID with the degree the owning process has for this vertex.
1110  //(this will include all edges present for this vertex globally)
1111  //This is safe to do, since we sent ghosts to their owners.
1112  for(int i = 0; i < deg_recvbuf.size(); i++){
1113  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1114  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1115  }
1116  //send modified buffer back
1117  ArrayRCP<gno_t> ghost_degrees;
1118  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1119 
1120  //create the ghost degree views, for use on host and device.
1121  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1122  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1123  for(int i = 0; i < ghost_degrees.size(); i++){
1124  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1125  ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1126  }
1127  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1128 
1129  //track the size of sends and receives through coloring rounds.
1130  gno_t recvPerRound[numStatisticRecordingRounds];
1131  gno_t sentPerRound[numStatisticRecordingRounds];
1132 
1133  //find global max degree to determine which
1134  //coloring algorithm will be the most efficient
1135  //(For D1-2GL this is important, D2 and PD2 should always use NBBIT
1136  offset_t local_max_degree = 0;
1137  offset_t global_max_degree = 0;
1138  for(size_t i = 0; i < n_local; i++){
1139  offset_t curr_degree = offsets[i+1] - offsets[i];
1140  if(curr_degree > local_max_degree){
1141  local_max_degree = curr_degree;
1142  }
1143  }
1144  Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1145  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1146 
1147  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1148 
1149  //the initial coloring is able to use a standard CSR representation, so we construct that here.
1150  //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1151  Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1152  typename Kokkos::View<offset_t*, device_type>::HostMirror offsets_host = Kokkos::create_mirror(offsets_dev);
1153  Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1154  typename Kokkos::View<lno_t*, device_type>::HostMirror adjs_host = Kokkos::create_mirror(adjs_dev);
1155  for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1156  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1157  Kokkos::deep_copy(offsets_dev,offsets_host);
1158  Kokkos::deep_copy(adjs_dev, adjs_host);
1159 
1160 
1161  //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1162  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1163 
1164  //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1165  //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1166  Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1167  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1168 
1169  //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1170  //to be used to construct the CSR representation of the local graph.
1171  //owned
1172  for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1173  //first layer ghosts
1174  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1175  //second layer ghosts
1176  for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1177  //second layer ghosts have LID >= n_total
1178  if((size_t)ghost_adjs[i] >= n_total ){
1179  dist_degrees_host(ghost_adjs[i])++;
1180  }
1181  }
1182 
1183  //The structure of the final CSR constructed here looks like:
1184  //
1185  // Index by LID--v
1186  // --------------------------------------------
1187  // | | first |second |
1188  // dist_offsets_dev/host |owned verts | layer |layer |
1189  // | | ghosts |ghosts |
1190  // --------------------------------------------
1191  // |indexes
1192  // v
1193  // --------------------------------------------
1194  // | |first |second |
1195  // dist_adjs_dev/host |owned adjs |layer |layer |
1196  // | |ghost adjs |ghost adjs |
1197  // --------------------------------------------
1198  //
1199  // We end up with a CSR that has adjacency information for all the vertices on
1200  // this process. We include only edges on the process, so ghosts have only partial
1201  // adjacency information.
1202  //
1203  // We symmetrize the local graph representation as well, for
1204  // KokkosKernels to behave as we need it to. This means that edges to
1205  // second layer ghosts must be symmetrized, so we end up with offsets
1206  // that correspond to second layer ghosts.
1207  Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1208  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1209 
1210  //we can construct the entire offsets using the degrees we computed above
1211  dist_offsets_host(0) = 0;
1212  offset_t total_adjs = 0;
1213  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1214  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1215  total_adjs += dist_degrees_host(i-1);
1216  }
1217  Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1218  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1219 
1220  //zero out the degrees to use it as a counter.
1221  //The offsets now allow us to compute degrees,
1222  //so we aren't losing anything.
1223  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1224  dist_degrees_host(i) = 0;
1225  }
1226  //We can just copy the adjacencies for the owned verts and first layer ghosts
1227  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1228  for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1229 
1230  //We have to symmetrize edges that involve a second layer ghost.
1231  //Add edges from the second layer ghosts to their neighbors.
1232  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1233  //loop through each first layer ghost
1234  for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1235  //if an adjacency is a second layer ghost
1236  if((size_t)ghost_adjs[j] >= n_total){
1237  //compute the offset to place the symmetric edge, and place it.
1238  dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1239  //increment the counter to keep track of how many adjacencies
1240  //have been placed.
1241  dist_degrees_host(ghost_adjs[j])++;
1242  }
1243  }
1244  }
1245  //copy the host views back to the device views
1246  Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1247  Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1248  Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1249 
1250  //this view represents how many conflicts were found
1251  Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1252  typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1253  recoloringSize_host(0) = 0;
1254  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1255 
1256  //create a view for the tie-breaking random numbers.
1257  if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1258  Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1259  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
1260  for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1261  Kokkos::deep_copy(rand_dev,rand_host);
1262 
1263  //create a view for the global IDs of each vertex.
1264  Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1265  typename Kokkos::View<gno_t*, device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
1266  for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1267  Kokkos::deep_copy(gid_dev,gid_host);
1268 
1269  //These variables will be initialized by constructBoundary()
1270  //
1271  //boundary_verts_dev holds all owned vertices that could possibly conflict
1272  //with a remote vertex. Stores LIDs.
1273  Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1274  //this is the number of vertices in the boundary.
1275  if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1276 
1277  //We keep track of the vertices that need to get recolored
1278  //this list can contain ghost vertices, so it needs to be initialized
1279  //to the number of all vertices on the local process.
1280  //rand has an entry for each vertex, so its size corresponds to what is needed.
1281  Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1282  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1283 
1284  //This view keeps track of the size of the list of vertices to recolor.
1285  Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1286  Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1287  typename Kokkos::View<int*, device_type>::HostMirror verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1288 
1289  //initialize the host view
1290  verts_to_recolor_size_host(0) = 0;
1291  //copy to device
1292  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1293 
1294  //verts to send can only include locally owned vertices,
1295  //so we can safely allocate the view to size n_local.
1296  //
1297  //This view is a list of local vertices that are going to be
1298  //recolored, and need to be sent to their remote copies.
1299  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1300  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
1301 
1302  //this view keeps track of the size of verts_to_send.
1303  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1304  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1305  typename Kokkos::View<size_t*, device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
1306 
1307  verts_to_send_size_host(0) = 0;
1308  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1309 
1310  if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1311 
1312  //this function allocates and initializes the boundary_verts_dev view,
1313  //sets the boundary_size accordingly, and adds vertices to the
1314  //verts_to_send_atomic view, updating the size view as well.
1315  constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1316  verts_to_send_view, verts_to_send_size_atomic);
1317 
1318  //this boolean chooses which KokkosKernels algorithm to use.
1319  //It was experimentally chosen for distance-1 coloring.
1320  //Should be disregarded for distance-2 colorings.
1321  bool use_vbbit = (global_max_degree < 6000);
1322  //Done initializing, start coloring!
1323 
1324  //use a barrier if we are reporting timing info
1325  if(timing) comm->barrier();
1326  interior_time = timer();
1327  total_time = timer();
1328  //give the entire local graph to KokkosKernels to color
1329  this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1330  interior_time = timer() - interior_time;
1331  comp_time = interior_time;
1332 
1333  //ghost_colors holds the colors of only ghost vertices.
1334  //ghost_colors(0) holds the color of a vertex with LID n_local.
1335  //To index this with LIDs, subtract n_local. If an LID is < n_local,
1336  //it will not have a color stored in this view.
1337  Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
1338 
1339  //communicate the initial coloring.
1340  if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1341 
1342  //update the host views for the verts to send,
1343  //doOwnedToGhosts needs host memory views, but doesn't
1344  //change the values in them, so no need to copy afterwards
1345  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1346  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1347  gno_t recv,sent;
1348  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1349  sentPerRound[0] = sent;
1350  recvPerRound[0] = recv;
1351 
1352  //store ghost colors so we can restore them after recoloring.
1353  //the local process can't color ghosts correctly, so we
1354  //reset the colors to avoid consistency issues.
1355  //get the color view from the FEMultiVector
1356  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1357  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1358  Kokkos::parallel_for("get femv colors",
1359  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1360  KOKKOS_LAMBDA(const int& i){
1361  ghost_colors(i) = femv_colors(i+n_local);
1362  });
1363  Kokkos::fence();
1364 
1365  double temp = timer();
1366  //detect conflicts only for ghost vertices
1367  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1368  if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1369 
1370  //these sizes will be updated by detectConflicts, zero them out beforehand
1371  verts_to_send_size_host(0) = 0;
1372  verts_to_recolor_size_host(0) = 0;
1373  recoloringSize_host(0) = 0;
1374  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1375  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1376  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1377 
1378  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1379  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1380  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1381 
1382  //these sizes were updated by detectConflicts,
1383  //copy the device views back into host memory
1384  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1385  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1386  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1387  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1388 
1389  if(comm->getSize() > 1){
1390  conflict_detection = timer() - temp;
1391  comp_time += conflict_detection;
1392  }
1393  //all conflicts detected!
1394  if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1395  //variables for statistics
1396  double totalPerRound[numStatisticRecordingRounds];
1397  double commPerRound[numStatisticRecordingRounds];
1398  double compPerRound[numStatisticRecordingRounds];
1399  double recoloringPerRound[numStatisticRecordingRounds];
1400  double conflictDetectionPerRound[numStatisticRecordingRounds];
1401  uint64_t vertsPerRound[numStatisticRecordingRounds];
1402  uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1403  int distributedRounds = 1;
1404  totalPerRound[0] = interior_time + comm_time + conflict_detection;
1405  recoloringPerRound[0] = 0;
1406  commPerRound[0] = comm_time;
1407  compPerRound[0] = interior_time + conflict_detection;
1408  conflictDetectionPerRound[0] = conflict_detection;
1409  recoloringPerRound[0] = 0;
1410  vertsPerRound[0] = 0;
1411  incorrectGhostsPerRound[0]=0;
1412  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
1413  typename Kokkos::View<lno_t*, device_type>::HostMirror boundary_verts_host;
1414  size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1415  //see if recoloring is necessary.
1416  size_t totalConflicts = 0;
1417  size_t localConflicts = recoloringSize_host(0);
1418  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1419  bool done = !totalConflicts;
1420  if(comm->getSize()==1) done = true;
1421 
1422  //recolor until no conflicts are left
1423  while(!done){
1424  //if the number of vertices left to recolor is less than
1425  //the serial threshold set by the user, finish the coloring
1426  //only using host views in a serial execution space.
1427  if(recoloringSize_host(0) < serial_threshold) break;
1428  if(distributedRounds < numStatisticRecordingRounds) {
1429  vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1430  }
1431 
1432  if(timing) comm->barrier();
1433  double recolor_temp = timer();
1434  //recolor using KokkosKernels' coloring function
1435  if(verts_to_recolor_size_host(0) > 0){
1436  this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1437  }
1438 
1439  if(distributedRounds < numStatisticRecordingRounds){
1440  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1441  recoloring_time += recoloringPerRound[distributedRounds];
1442  comp_time += recoloringPerRound[distributedRounds];
1443  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1444  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1445  } else if(timing){
1446  double recoloring_round_time = timer() - recolor_temp;
1447  recoloring_time += recoloring_round_time;
1448  comp_time += recoloring_round_time;
1449  }
1450 
1451  //reset the ghost colors to what they were before recoloring
1452  //the recoloring does not have enough information to color
1453  //ghosts correctly, so we set the colors to what they were before
1454  //to avoid consistency issues.
1455  Kokkos::parallel_for("set femv colors",
1456  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1457  KOKKOS_LAMBDA(const int& i){
1458  femv_colors(i+n_local) = ghost_colors(i);
1459  });
1460  Kokkos::fence();
1461 
1462  //send views are up-to-date, they were copied after conflict detection.
1463  //communicate the new colors
1464 
1465  // Reset device views
1466  femvColors = decltype(femvColors)();
1467  femv_colors = decltype(femv_colors)();
1468  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1469  comm_time += curr_comm_time;
1470 
1471  if(distributedRounds < numStatisticRecordingRounds){
1472  commPerRound[distributedRounds] = curr_comm_time;
1473  recvPerRound[distributedRounds] = recv;
1474  sentPerRound[distributedRounds] = sent;
1475  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1476  }
1477 
1478  //store ghost colors before we uncolor and recolor them.
1479  //this process doesn't have enough info to correctly color
1480  //ghosts, so we set them back to what they were before to
1481  //remove consistency issues.
1482  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1483  femv_colors = subview(femvColors, Kokkos::ALL, 0);
1484  Kokkos::parallel_for("get femv colors 2",
1485  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1486  KOKKOS_LAMBDA(const int& i){
1487  ghost_colors(i) = femv_colors(i+n_local);
1488  });
1489  Kokkos::fence();
1490 
1491 
1492  //these views will change in detectConflicts, they need
1493  //to be zeroed out beforehand
1494  verts_to_send_size_host(0) = 0;
1495  verts_to_recolor_size_host(0) = 0;
1496  recoloringSize_host(0) = 0;
1497  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1498  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1499  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1500 
1501  //check for further conflicts
1502  double detection_temp = timer();
1503 
1504  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1505  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1506  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1507 
1508  //copy the updated device views back into host memory where necessary
1509  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1510  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1511  //we only use the list of verts to recolor on device, no need to copy to host.
1512  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1513  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1514 
1515  if(distributedRounds < numStatisticRecordingRounds){
1516  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1517  conflict_detection += conflictDetectionPerRound[distributedRounds];
1518  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1519  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1520  comp_time += conflictDetectionPerRound[distributedRounds];
1521  } else if(timing){
1522  double conflict_detection_round_time = timer() - detection_temp;
1523  conflict_detection += conflict_detection_round_time;
1524  comp_time += conflict_detection_round_time;
1525  }
1526 
1527  distributedRounds++;
1528  size_t localDone = recoloringSize_host(0);
1529  size_t globalDone = 0;
1530  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1531  done = !globalDone;
1532 
1533  }//end device coloring
1534 
1535 
1536  //If we reach this point and still have vertices to color,
1537  //the rest of the coloring is happening in serial on the host.
1538  //
1539  //First, we need to copy device-only views to host mirrors
1540  if(recoloringSize_host(0) > 0 || !done){
1541  ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1542  boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1543  }
1544 
1545  //Now we do a similar coloring loop to before,
1546  //but using only host views in a serial execution space.
1547  // Reset device views
1548  femvColors = decltype(femvColors)();
1549  femv_colors = decltype(femv_colors)();
1550  while(recoloringSize_host(0) > 0 || !done){
1551  auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1552  auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1553  if(distributedRounds < numStatisticRecordingRounds){
1554  vertsPerRound[distributedRounds] = recoloringSize_host(0);
1555  }
1556  if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1557  if(timing) comm->barrier();
1558 
1559  double recolor_temp = timer();
1560  if(verts_to_recolor_size_host(0) > 0){
1561  this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1562  verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1563  }
1564  if(distributedRounds < numStatisticRecordingRounds){
1565  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1566  recoloring_time += recoloringPerRound[distributedRounds];
1567  comp_time += recoloringPerRound[distributedRounds];
1568  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1569  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1570  } else if(timing){
1571  double recoloring_serial_round_time = timer() - recolor_temp;
1572  recoloring_time += recoloring_serial_round_time;
1573  comp_time += recoloring_serial_round_time;
1574  }
1575 
1576  //reset the ghost colors to their previous values to avoid
1577  //consistency issues.
1578  for(size_t i = 0; i < n_ghosts; i++){
1579  colors_host(i+n_local) = ghost_colors_host(i);
1580  }
1581 
1582  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1583  comm_time += curr_comm_time;
1584 
1585  if(distributedRounds < numStatisticRecordingRounds){
1586  commPerRound[distributedRounds] = curr_comm_time;
1587  recvPerRound[distributedRounds] = recv;
1588  sentPerRound[distributedRounds] = sent;
1589  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1590  }
1591 
1592  //store updated ghost colors we received from their owners
1593  //before conflict detection and recoloring changes them locally.
1594  for(size_t i = 0; i < n_ghosts; i++){
1595  ghost_colors_host(i) = colors_host(i+n_local);
1596  }
1597 
1598  if(timing) comm->barrier();
1599  double detection_temp = timer();
1600 
1601  //zero these out, they'll be updated by detectConflicts_serial
1602  verts_to_recolor_size_host(0) = 0;
1603  verts_to_send_size_host(0) = 0;
1604  recoloringSize_host(0) = 0;
1605 
1606  detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1607  verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1608  recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1609 
1610  //no need to update the host views, we're only using host views now.
1611 
1612  if(distributedRounds < numStatisticRecordingRounds){
1613  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1614  conflict_detection += conflictDetectionPerRound[distributedRounds];
1615  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1616  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1617  comp_time += conflictDetectionPerRound[distributedRounds];
1618  } else if(timing){
1619  double conflict_detection_serial_round_time = timer() - detection_temp;
1620  conflict_detection += conflict_detection_serial_round_time;
1621  comp_time += conflict_detection_serial_round_time;
1622  }
1623 
1624  size_t globalDone = 0;
1625  size_t localDone = recoloringSize_host(0);
1626  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1627  distributedRounds++;
1628  done = !globalDone;
1629  }
1630 
1631  total_time = timer() - total_time;
1632  //only compute stats if they will be displayed
1633  if(verbose){
1634  uint64_t localBoundaryVertices = 0;
1635  for(size_t i = 0; i < n_local; i++){
1636  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1637  if((size_t)adjs[j] >= n_local){
1638  localBoundaryVertices++;
1639  break;
1640  }
1641  }
1642  }
1643 
1644  //if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1645  uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1646  uint64_t totalBoundarySize = 0;
1647  uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1648  double finalTotalPerRound[numStatisticRecordingRounds];
1649  double maxRecoloringPerRound[numStatisticRecordingRounds];
1650  double minRecoloringPerRound[numStatisticRecordingRounds];
1651  double finalCommPerRound[numStatisticRecordingRounds];
1652  double finalCompPerRound[numStatisticRecordingRounds];
1653  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1654  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1655  gno_t finalSentPerRound[numStatisticRecordingRounds];
1656  for(int i = 0; i < numStatisticRecordingRounds; i++){
1657  totalVertsPerRound[i] = 0;
1658  finalTotalPerRound[i] = 0.0;
1659  maxRecoloringPerRound[i] = 0.0;
1660  minRecoloringPerRound[i] = 0.0;
1661  finalCommPerRound[i] = 0.0;
1662  finalCompPerRound[i] = 0.0;
1663  finalConflictDetectionPerRound[i] = 0.0;
1664  finalRecvPerRound[i] = 0;
1665  finalSentPerRound[i] = 0;
1666  }
1667  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1668  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1669  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1670  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1671  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1672  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1673  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1674  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1675  Teuchos::reduceAll<int,double>(*comm,
1676  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1677  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1678  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1679  std::cout << "Rank " << comm->getRank()
1680  << ": boundary size: " << localBoundaryVertices << std::endl;
1681  if(comm->getRank() == 0)
1682  std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1683  for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1684  std::cout << "Rank " << comm->getRank()
1685  << ": recolor " << vertsPerRound[i]
1686  << " vertices in round " << i << std::endl;
1687  std::cout << "Rank " << comm->getRank()
1688  << " sentbuf had " << sentPerRound[i]
1689  << " entries in round " << i << std::endl;
1690  if(comm->getRank()==0){
1691  std::cout << "recolored " << totalVertsPerRound[i]
1692  << " vertices in round " << i << std::endl;
1693  std::cout << totalIncorrectGhostsPerRound[i]
1694  << " inconsistent ghosts in round " << i << std::endl;
1695  std::cout << "total time in round " << i
1696  << ": " << finalTotalPerRound[i] << std::endl;
1697  std::cout << "recoloring time in round " << i
1698  << ": " << maxRecoloringPerRound[i] << std::endl;
1699  std::cout << "min recoloring time in round " << i
1700  << ": " << minRecoloringPerRound[i] << std::endl;
1701  std::cout << "conflict detection time in round " << i
1702  << ": " << finalConflictDetectionPerRound[i] << std::endl;
1703  std::cout << "comm time in round " << i
1704  << ": " << finalCommPerRound[i] << std::endl;
1705  std::cout << "recvbuf size in round " << i
1706  << ": " << finalRecvPerRound[i] << std::endl;
1707  std::cout << "sendbuf size in round " << i
1708  << ": " << finalSentPerRound[i] << std::endl;
1709  std::cout << "comp time in round " << i
1710  << ": " << finalCompPerRound[i] << std::endl;
1711  }
1712  }
1713  } else if (timing){
1714  double global_total_time = 0.0;
1715  double global_recoloring_time = 0.0;
1716  double global_min_recoloring_time = 0.0;
1717  double global_conflict_detection=0.0;
1718  double global_comm_time=0.0;
1719  double global_comp_time=0.0;
1720  double global_interior_time=0.0;
1721  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1722  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1723  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1724  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1725  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1726  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1727  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1728  comm->barrier();
1729  fflush(stdout);
1730  if(comm->getRank()==0){
1731  std::cout << "Total Time: " << global_total_time << std::endl;
1732  std::cout << "Interior Time: " << global_interior_time << std::endl;
1733  std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1734  std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1735  std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1736  std::cout << "Comm Time: " << global_comm_time << std::endl;
1737  std::cout << "Comp Time: " << global_comp_time << std::endl;
1738  }
1739  }
1740  }
1741 }; //end class
1742 
1743 }//end namespace Zoltan2
1744 
1745 #endif
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
typename Adapter::gno_t gno_t
virtual void detectConflicts(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, Kokkos::View< int *, device_type > femv_colors, Kokkos::View< lno_t *, device_type > boundary_verts_view, Kokkos::View< lno_t *, device_type > verts_to_recolor_view, Kokkos::View< int *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_recolor_size_atomic, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< size_t *, device_type > recoloringSize, Kokkos::View< int *, device_type > rand, Kokkos::View< gno_t *, device_type > gid, Kokkos::View< gno_t *, device_type > ghost_degrees, bool recolor_degrees)=0
typename Adapter::lno_t lno_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
typename Adapter::offset_t offset_t
AlgTwoGhostLayer(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
virtual void constructBoundary(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, Kokkos::View< lno_t *, device_type > &boundary_verts, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic)=0
algorithm requires no self edges
typename femv_t::host_view_type::device_type::memory_space host_mem
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
RCP< Teuchos::ParameterList > pl
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
typename femv_t::device_type device_type
void twoGhostLayer(const size_t n_local, const size_t n_total, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::ArrayView< const lno_t > &ghost_adjs, const Teuchos::ArrayView< const offset_t > &ghost_offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &ghost_owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS&#39;s idx_t...
typename femv_t::host_view_type::device_type::execution_space host_exec
RCP< const base_adapter_t > adapter
GraphModel defines the interface required for graph models.
RCP< const Teuchos::Comm< int > > comm
Defines the ColoringSolution class.
Tpetra::global_size_t global_size_t
virtual void detectConflicts_serial(const size_t n_local, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, typename Kokkos::View< int *, device_type >::HostMirror femv_colors, typename Kokkos::View< lno_t *, device_type >::HostMirror boundary_verts_view, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_recolor_view, typename Kokkos::View< int *, device_type >::HostMirror verts_to_recolor_size_atomic, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_send_view, typename Kokkos::View< size_t *, device_type >::HostMirror verts_to_send_size_atomic, typename Kokkos::View< size_t *, device_type >::HostMirror recoloringSize, typename Kokkos::View< int *, device_type >::HostMirror rand, typename Kokkos::View< gno_t *, device_type >::HostMirror gid, typename Kokkos::View< gno_t *, device_type >::HostMirror ghost_degrees, bool recolor_degrees)=0
Defines the GraphModel interface.
A gathering of useful namespace methods.
typename Adapter::base_adapter_t base_adapter_t
typename device_type::execution_space execution_space
Tpetra::Map< lno_t, gno_t > map_t
typename Adapter::scalar_t scalar_t
typename device_type::memory_space memory_space
AlltoAll communication methods.
The class containing coloring solution.