Zoltan2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Zoltan2_AlgHybrid2GL.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Zoltan2: A package of combinatorial algorithms for scientific computing
4 //
5 // Copyright 2012 NTESS and the Zoltan2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
11 #define _ZOLTAN2_2GHOSTLAYER_HPP_
12 
13 #include <vector>
14 #include <unordered_map>
15 #include <iostream>
16 #include <queue>
17 #ifdef _WIN32
18 #include <time.h>
19 #else
20 #include <sys/time.h>
21 #endif
22 
23 #include "Zoltan2_Algorithm.hpp"
24 #include "Zoltan2_GraphModel.hpp"
26 #include "Zoltan2_Util.hpp"
27 #include "Zoltan2_TPLTraits.hpp"
28 #include "Zoltan2_AlltoAll.hpp"
29 
30 
31 #include "Tpetra_Core.hpp"
32 #include "Teuchos_RCP.hpp"
33 #include "Tpetra_Import.hpp"
34 #include "Tpetra_FEMultiVector.hpp"
35 
36 #include "KokkosKernels_Handle.hpp"
37 #include "KokkosKernels_IOUtils.hpp"
38 #include "KokkosGraph_Distance1Color.hpp"
39 #include "KokkosGraph_Distance1ColorHandle.hpp"
40 
44 // for methods that use two layers of ghosts.
45 
46 
47 namespace Zoltan2{
48 
49 template <typename Adapter>
50 class AlgTwoGhostLayer : public Algorithm<Adapter> {
51 
52  public:
53 
54  using lno_t = typename Adapter::lno_t;
55  using gno_t = typename Adapter::gno_t;
56  using offset_t = typename Adapter::offset_t;
57  using scalar_t = typename Adapter::scalar_t;
59  using map_t = Tpetra::Map<lno_t,gno_t>;
60  using femv_scalar_t = int;
61  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
62  using device_type = typename femv_t::device_type;
63  using execution_space = typename device_type::execution_space;
64  using memory_space = typename device_type::memory_space;
65  using host_exec = typename femv_t::host_view_type::device_type::execution_space;
66  using host_mem = typename femv_t::host_view_type::device_type::memory_space;
67 
68  double timer(){
69  struct timeval tp;
70  gettimeofday(&tp, NULL);
71  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
72  }
73  private:
74 
75  //Entry point for parallel local coloring
76  // nVtx is the number of vertices owned by the current process
77  //
78  // adjs_view is the adjacencies, indexed by offset_view
79  //
80  // offset_view is the CSR row map, used to index the adjs_view
81  //
82  // femv is the FEMultiVector that holds the colors for the vertices
83  // the colors will change in this function.
84  //
85  // vertex_list is a list of vertices to recolor
86  //
87  // vertex_list_size is the size of the list of vertices to recolor
88  // vertex_list_size = 0 means recolor all uncolored vertices
89  //
90  // recolor decides which KokkosKernels algorithm to use.
91  //
92  virtual void colorInterior(const size_t nVtx,
93  Kokkos::View<lno_t*, device_type > adjs_view,
94  Kokkos::View<offset_t*,device_type > offset_view,
95  Teuchos::RCP<femv_t> femv,
96  Kokkos::View<lno_t*, device_type> vertex_list,
97  size_t vertex_list_size = 0,
98  bool recolor=false) = 0;
99 
100  //Entry point for serial local coloring
101  virtual void colorInterior_serial(const size_t nVtx,
102  typename Kokkos::View<lno_t*, device_type >::HostMirror adjs_view,
103  typename Kokkos::View<offset_t*,device_type >::HostMirror offset_view,
104  Teuchos::RCP<femv_t> femv,
105  typename Kokkos::View<lno_t*, device_type>::HostMirror vertex_list,
106  size_t vertex_list_size = 0,
107  bool recolor=false) = 0;
108  public:
109  //Entry point for parallel conflict detection
110  //INPUT ARGS
111  // n_local is the number of vertices owned by the current process
112  //
113  // dist_offsets_dev is the device view that holds the CSR offsets
114  // It holds offsets for all vertices, owned and ghosts.
115  // It is organized with owned first, and then ghost layers:
116  // -----------------------------------------------
117  // | owned | first | second |
118  // | vtx | ghost | ghost |
119  // | offsets | layer | layer |
120  // -----------------------------------------------
121  // The allocated length is equal to the sum of owned
122  // and ghost vertices + 1. Any vertex with LID >= n_local
123  // is a ghost.
124  //
125  // dist_adjs_dev is the device view that holds CSR adjacencies
126  //
127  // boundary_verts_view holds the local IDs of vertices in the boundary.
128  // By definition, boundary vertices are owned vertices
129  // that have a ghost in their neighborhood.
130  // (distance-1 coloring = 1-hop neighborhood)
131  // (distance-2 coloring = 2-hop neighborhood)
132  // Note: for D1-2GL we do not use this argument.
133  // It is possible to detect all D1 conflicts by only
134  // checking the ghost vertices, without constructing
135  // the boundary.
136  //
137  // rand is a device view that holds random numbers generated from GIDs,
138  // they are consistently generated across processes
139  //
140  // gid is a device view that holds the GIDs of each vertex on this process.
141  // It is indexable by local ID. It stores both owned and ghost
142  // vertices, and LIDs are ordered so that the structure looks like:
143  // -----------------------------------------
144  // | | first | second | owned LIDs are smallest
145  // | owned | ghost | ghost | ghost LIDs increase based
146  // | vtx | layer | layer | on layer (1 < 2)
147  // -----------------------------------------
148  // The allocated size is dist_offsets_dev.extent(0)-1.
149  //
150  //
151  // ghost_degrees is a device view that holds the degrees of ghost vertices only.
152  // A ghost with local ID n_local will be the first entry in this view.
153  // So, ghost_degrees[i] is the degree of the vtx with
154  // GID = gid[n_local+i].
155  //
156  // recolor_degrees is a boolean that determines whether or not we factor in vertex
157  // degrees on recoloring
158  //
159  //OUTPUT ARGS
160  // femv_colors is the device color view.
161  // After this function call, conflicts between vertices will
162  // be resolved via setting a vertex's color to 0. The vertex
163  // to be uncolored is determined by rules that are consistent
164  // across multiple processes.
165  //
166  // verts_to_recolor_view is a device view that holds the list
167  // of vertices to recolor. Any vertex
168  // uncolored by this function will appear in this
169  // view after the function returns.
170  //
171  // verts_to_recolor_size_atomic is an atomic device view that holds the
172  // size of verts_to_recolor_view.
173  // Effectively it represents how many
174  // vertices need to be recolored after
175  // conflict detection.
176  // It differs in size from the allocated size of
177  // verts_to_recolor_view.
178  // Atomicity is required for building
179  // verts_to_recolor_view in parallel, which
180  // is the reason this is a view instead of
181  // just an integer type.
182  //
183  // verts_to_send_view is a device view that holds the list of vertices
184  // that will need to be sent to remotes after recoloring
185  // Note: Only locally owned vertices should ever be in
186  // this view. A process cannot color ghosts correctly.
187  //
188  // verts_to_send_size_atomic is an atomic device view that holds the size of
189  // verts_to_send_view. It differs in size from the
190  // allocated size of verts_to_send_view.
191  // Atomicity is required for building
192  // verts_to_send_view in parallel,
193  // which is the reason this is a view instead of
194  // just an integer type.
195  //
196  // recoloringSize is a device view that stores the total number of
197  // vertices that were uncolored by the conflict detection.
198  //
199  virtual void detectConflicts(const size_t n_local,
200  Kokkos::View<offset_t*, device_type > dist_offsets_dev,
201  Kokkos::View<lno_t*, device_type > dist_adjs_dev,
202  Kokkos::View<int*,device_type > femv_colors,
203  Kokkos::View<lno_t*, device_type > boundary_verts_view,
204  Kokkos::View<lno_t*,
205  device_type > verts_to_recolor_view,
206  Kokkos::View<int*,
207  device_type,
208  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
209  Kokkos::View<lno_t*,
210  device_type > verts_to_send_view,
211  Kokkos::View<size_t*,
212  device_type,
213  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
214  Kokkos::View<size_t*, device_type> recoloringSize,
215  Kokkos::View<int*,
216  device_type> rand,
217  Kokkos::View<gno_t*,
218  device_type> gid,
219  Kokkos::View<gno_t*,
220  device_type> ghost_degrees,
221  bool recolor_degrees) = 0;
222 
223  //Entry point for serial conflict detection
224  virtual void detectConflicts_serial(const size_t n_local,
225  typename Kokkos::View<offset_t*, device_type >::HostMirror dist_offsets_host,
226  typename Kokkos::View<lno_t*, device_type >::HostMirror dist_adjs_host,
227  typename Kokkos::View<int*,device_type >::HostMirror femv_colors,
228  typename Kokkos::View<lno_t*, device_type >::HostMirror boundary_verts_view,
229  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_recolor_view,
230  typename Kokkos::View<int*,device_type>::HostMirror verts_to_recolor_size_atomic,
231  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send_view,
232  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_atomic,
233  typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize,
234  typename Kokkos::View<int*, device_type>::HostMirror rand,
235  typename Kokkos::View<gno_t*,device_type>::HostMirror gid,
236  typename Kokkos::View<gno_t*,device_type>::HostMirror ghost_degrees,
237  bool recolor_degrees) = 0;
238  //Entry point for the construction of the boundary
239  //INPUT ARGS
240  // n_local is the number of vertices owned by the current process
241  //
242  // dist_offsets_dev is the device view that holds the CSR offsets
243  //
244  // dist_adjs_dev is the device view that holds CSR adjacencies
245  //
246  // dist_offsets_host is the hostmirror that holds the CSR offsets
247  //
248  // dist_adjs_host is the hostmirror that holds the CSR adjacencies
249  //
250  //OUTPUT ARGS
251  // boundary_verts is an unallocated device view that will hold the
252  // list of boundary vertices.
253  //
254  // verts_to_send_view will hold the list of vertices to send
255  //
256  // verts_to_send_size_atomic will hold the number of vertices to
257  // send currently held in verts_to_send_view.
258  // verts_to_send_size_atomic differs
259  // from the allocated size of verts_to_send_view
260  // Atomicity is required for building
261  // verts_to_send_view in parallel, which is
262  // the reason this is a view instead of an
263  // integer type.
264  //
265  virtual void constructBoundary(const size_t n_local,
266  Kokkos::View<offset_t*, device_type> dist_offsets_dev,
267  Kokkos::View<lno_t*, device_type> dist_adjs_dev,
268  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host,
269  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host,
270  Kokkos::View<lno_t*, device_type>& boundary_verts,
271  Kokkos::View<lno_t*,
272  device_type > verts_to_send_view,
273  Kokkos::View<size_t*,
274  device_type,
275  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
276 
277  protected:
278  RCP<const base_adapter_t> adapter;
279  RCP<Teuchos::ParameterList> pl;
280  RCP<Environment> env;
281  RCP<const Teuchos::Comm<int> > comm;
282  bool verbose;
283  bool timing;
284 
285  private:
286  //This function constructs a CSR with complete adjacency information for
287  //first-layer ghost vertices. This CSR can contain edges from:
288  // first-layer ghosts to owned;
289  // first-layer ghosts to first-layer ghosts;
290  // first-layer ghosts to second-layer ghosts;
291  //1) Each process sends a list of ghost GIDs back to their owning process
292  // to request full adjacency information for that vertex.
293  //
294  //2) Initially each process sends back degree information to the requesting
295  // process.
296  //
297  //3) Then, each process can construct send buffers and receive schedules for
298  // the adjacency information for each requested GID it received,
299  // in addition to building the 2GL offsets and relabeling ghost LIDs
300  // to make the final CSR construction easier.
301  //
302  // 3a) We can construct the 2GL offsets here because each process has
303  // already received the degree information for each first-layer
304  // ghost vertex.
305  //
306  // 3b) The original ghost LIDs may not correspond to the order in which
307  // we will receive the adjacency information. Instead of reordering
308  // the received adjacencies, we relabel the GIDs of first-layer
309  // ghosts with new LIDs that correspond to the order in which we
310  // receive the adjacency information. Only first-layer ghost LIDs
311  // are changed.
312  //
313  //4) Due to limitations on the size of an MPI message, we split sending and
314  // receiving into rounds to accommodate arbitrarily large graphs.
315  //
316  //5) As send rounds progress, we construct the 2GL adjacency structure.
317  // Because we relabel ghost LIDs, we do not need to reorder the
318  // data after we receive it.
319  //
320  //
321  //
322  //OUTPUT ARGS
323  // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
324  // vertices. The only hard constraint on the ordering is
325  // that owned vertices come before ghost vertices.
326  // This function can re-assign the LIDs of ghost vertices
327  // in order to make the final construction of the 2GL
328  // CSR easier. After this function call, some ghost
329  // LIDs may be changed, but they will still be greater
330  // than owned LIDs. No owned LIDs will be changed.
331  //
332  // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
333  // contains complete adjacency information for the first-layer
334  // ghosts. These adjacencies can hold both owned vertices and
335  // second-layer ghosts, as well as first-layer ghosts.
336  //
337  // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
338  // That is, a vertex with GID = gid[n_local] will be the first
339  // vertex with information in this offset structure.
340  //
341  //
342  //INPUT ARGS
343  // owners holds the owning proc for a given vertex, indexed by local ID.
344  //
345  // adjs is the CSR adjacencies of the local graph with a single ghost layer.
346  //
347  // offsets is the CSR offsets of the local graph with a single ghost layer.
348  //
349  // mapOwned translates from Owned GID to LID. We only need this translation
350  // for owned vertices.
351  //
352  //TODO: This function uses many vectors of size comm->getSize();
353  // consider changes to increase memory scalability.
354  void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
355  const std::vector<int>& owners,
356  ArrayView<const gno_t> adjs,
357  ArrayView<const offset_t> offsets,
358  RCP<const map_t> mapOwned,
359  std::vector< gno_t>& adjs_2GL,
360  std::vector< offset_t>& offsets_2GL) {
361 
362  //first, we send ghost GIDs back to owners to receive the
363  //degrees of each first-layer ghost.
364  std::vector<int> sendcounts(comm->getSize(),0);
365  std::vector<size_t> sdispls(comm->getSize()+1,0);
366  //loop through owners, count how many vertices we'll send to each processor
367  //We send each ghost GID back to its owning process.
368  if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
369  for(size_t i = 0; i < owners.size(); i++){
370  if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
371  }
372  //construct sdispls (for building sendbuf), and sum the total sendcount
373  if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
374  size_t sendcount = 0;
375  for(int i = 1; i < comm->getSize()+1; i++){
376  sdispls[i] = sdispls[i-1] + sendcounts[i-1];
377  sendcount += sendcounts[i-1];
378  }
379 
380  if(verbose) std::cout<<comm->getRank()<<": building idx\n";
381  std::vector<gno_t> idx(comm->getSize(),0);
382  for(int i = 0; i < comm->getSize(); i++){
383  idx[i]=sdispls[i];
384  }
385  //construct sendbuf to send GIDs to owning processes
386  if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";
387 
388  std::vector<gno_t> sendbuf(sendcount,0);
389  for(size_t i = offsets.size()-1; i < owners.size(); i++){
390  if(owners[i] != comm->getRank() && owners[i] != -1){
391  sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
392  }
393  }
394 
395  //communicate GIDs to owners
396  if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
397  Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
398  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
399  Teuchos::ArrayRCP<gno_t> recvbuf;
400  std::vector<int> recvcounts(comm->getSize(),0);
401  Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
402  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
403 
404  if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
405  //replace entries in recvGIDs with their degrees
406 
407  if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
408  gno_t recvcounttotal = 0;
409  std::vector<int> rdispls(comm->getSize()+1,0);
410  for(size_t i = 1; i<recvcounts.size()+1; i++){
411  rdispls[i] = rdispls[i-1] + recvcounts[i-1];
412  recvcounttotal += recvcounts[i-1];
413  }
414  //send back the degrees to the requesting processes,
415  //build the adjacency counts
416  std::vector<offset_t> sendDegrees(recvcounttotal,0);
417  gno_t adj_len = 0;
418  std::vector<int> adjsendcounts(comm->getSize(),0);
419  if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
420  for(int i = 0; i < comm->getSize(); i++){
421  adjsendcounts[i] = 0;
422  for(int j = rdispls[i]; j < rdispls[i+1]; j++){
423  lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
424  offset_t degree = offsets[lid+1] - offsets[lid];
425  sendDegrees[j] = degree;
426  adj_len +=degree;
427  adjsendcounts[i] += degree;
428  }
429  }
430  //communicate the degrees back to the requesting processes
431  if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
432  Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
433  Teuchos::ArrayRCP<offset_t> recvDegrees;
434  std::vector<int> recvDegreesCount(comm->getSize(),0);
435  Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
436  Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);
437 
438  //calculate number of rounds of AlltoAllv's that are necessary on this process
439 
440  if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
441  int rounds = 1;
442  for(int i = 0; i < comm->getSize(); i++){
443  if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
444  rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
445  }
446  }
447 
448  //see what the max number of rounds will be globally
449  int max_rounds = 0;
450  Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);
451 
452  if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
453  //compute send and receive schedules to and from each process
454  std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
455  std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
456 
457  if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
458  //fill out the send schedules
459  for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
460  int curr_round = 0;
461  for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
462  if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
463  curr_round++;
464  }
465  per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
466  per_proc_round_vtx_sums[curr_round][proc_to_send]++;
467  }
468  }
469 
470  if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";
471 
472  //A 3D vector to hold the GIDs so we can see how this process will receive
473  //the vertices in each round, from each process. This way we can reorder things
474  //locally so that the data we receive will automatically construct the CSR correctly
475  //without any other computation. We do the reordering before we start receiving.
476  std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
477  max_rounds+1,std::vector<std::vector<gno_t>>(
478  comm->getSize(),std::vector<gno_t>(0)));
479  for(int i = 0; i < max_rounds; i++){
480  for(int j = 0; j < comm->getSize(); j++){
481  recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
482  }
483  }
484 
485  if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
486  for(int i = 0; i < comm->getSize(); i++){
487  int curr_round = 0;
488  size_t curr_idx = 0;
489  for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
490  if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
491  curr_round++;
492  curr_idx = 0;
493  }
494  recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
495  }
496  }
497 
498  if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
499  //reorder the GIDs and degrees locally:
500  // - The way we previously stored GIDs and degrees had no relation
501  // to the order that we'll be receiving the adjacency data.
502  // For the CSR to be complete (and usable), we reorder the LIDs of
503  // ghosts so that they are in the order we will receive the adjacency
504  // data.
505  //
506  // - For graphs that are not large enough to trigger sending in multiple
507  // rounds of communication, the LIDs of the ghosts will be ordered
508  // by owning process. If multiple rounds are used, the LIDs of
509  // ghosts will be ordered by rounds in addition to owning process.
510  //
511  // - final_gid_vec and final_degree_vec hold the reorganized gids and
512  // degrees, the final re-mapping happens further down
513  std::vector<gno_t> final_gid_vec(sendcount, 0);
514  std::vector<offset_t> final_degree_vec(sendcount,0);
515  gno_t reorder_idx = 0;
516  for(int i = 0; i < max_rounds; i++){
517  for(int j = 0; j < comm->getSize(); j++){
518  for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
519  final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
520  final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
521  }
522  }
523  }
524 
525  //check to see if the reorganization has happened correctly
526  if(verbose){
527  //do a quick check to see if we ended up reorganizing anything
528  bool reorganized = false;
529  for(size_t i = 0; i < sendcount; i++){
530  if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
531  }
532 
533  //if we have more than a single round of communication, we need to reorganize.
534  //this alerts of unnecessary reorganization, and a failure to perform necessary
535  //reorganization.
536  if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorgainze GIDs, but probably should have\n";
537  if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
538  }
539  //remap the GIDs so we receive the adjacencies in the same order as the current processes LIDs
540  // originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
541  // remote processes. Here we make it so that the GID->LID mapping does correspond to the
542  // order we have received degree info and will receive adjacencies. ( LID n_local
543  // corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
544  // second, etc.)
545  // That way, we don't need to reorder the adjacencies after we receive them.
546  //
547  // This next loop is the final mapping step.
548 
549  for (size_t i = 0; i < sendcount; i++){
550  ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
551  }
552 
553  //status report
554  if(verbose) {
555  std::cout<<comm->getRank()<<": done remapping\n";
556  std::cout<<comm->getRank()<<": building ghost offsets\n";
557  }
558  //start building the second ghost layer's offsets
559  std::vector<offset_t> ghost_offsets(sendcount+1,0);
560  for(size_t i = 1; i < sendcount+1; i++){
561  ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
562  }
563 
564 
565  if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
566  //set up counters to keep track of where we are in the sending order
567  std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
568  for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
569  for(int round = 0; round < max_rounds; round++){
570  //send buffers
571  std::vector<gno_t> send_adj;
572  std::vector<int> send_adj_counts(comm->getSize(),0);
573  if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
574  //construct the adjacencies to send for this round
575  for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
576  uint64_t curr_adj_sum = 0;
577  //keep going through adjacencies to send to this process until we're done
578  while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
579  lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);
580 
581  //if the next adjacency would push us over the MPI message size max,
582  //stop for this round
583  if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
584  break;
585  }
586 
587  //add the adjacencies to the send buffer
588  curr_adj_sum += (offsets[lid+1] - offsets[lid]);
589  for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
590  send_adj.push_back(adjs[j]);
591  }
592  }
593  //update the send counts for this round
594  send_adj_counts[curr_proc] = curr_adj_sum;
595  }
596  if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
597  //do the sending...
598  Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
599  Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
600  Teuchos::ArrayRCP<gno_t> ghost_adjs;
601  std::vector<int> adjrecvcounts(comm->getSize(),0);
602  Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
603  Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
604  //Because of the previous reordering, these adjacencies are
605  //in the correct order as they arrive on the process.
606  for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
607  adjs_2GL.push_back(ghost_adjs[i]);
608  }
609  }
610  if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
611  //put the offsets we computed into the output argument.
612  for(size_t i = 0; i < sendcount+1; i++){
613  offsets_2GL.push_back(ghost_offsets[i]);
614  }
615  if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
616  }
617 
618  //Communicates owned vertex colors to remote ghost copies.
619  //
620  //returns: the amount of time the communication call took.
621  //
622  //OUTPUT ARGS:
623  // colors: the owned vertices' colors are not changed,
624  // ghost vertex colors are updated from their owners.
625  //
626  // total_sent: reports the total size of the send buffer
627  //
628  // total_recv: reports the total size of the recv buffer
629  //
630  //INPUT ARGS:
631  //
632  // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
633  //
634  // nVtx: the number of owned vertices on this process
635  //
636  // verts_to_send: hostmirror of the list of vertices to send.
637  // This list will only contain local vertices
638  // that are ghosted on a remote process.
639  //
640  // verts_to_send_size: hostmirror of the size of verts_to_send
641  //
642  // procs_to_send: map that takes a local ID and gives a vector of
643  // process IDs which have a ghost copy of that vertex.
644  //
645  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
646  size_t nVtx,
647  typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send,
648  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size,
649  Teuchos::RCP<femv_t> femv,
650  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
651  gno_t& total_sent, gno_t& total_recvd){
652 
653  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
654  auto colors = subview(femvColors, Kokkos::ALL, 0);
655  //create vectors to hold send information
656  int nprocs = comm->getSize();
657  std::vector<int> sendcnts(comm->getSize(), 0);
658  std::vector<gno_t> sdispls(comm->getSize()+1, 0);
659 
660  //calculate how much data we're sending to each process
661  for(size_t i = 0; i < verts_to_send_size(0); i++){
662  for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
663  sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
664  }
665  }
666  //calculate sendsize and sdispls
667  sdispls[0] = 0;
668  gno_t sendsize = 0;
669  std::vector<int> sentcount(nprocs, 0);
670 
671  for(int i = 1; i < comm->getSize()+1; i++){
672  sdispls[i] = sdispls[i-1] + sendcnts[i-1];
673  sendsize += sendcnts[i-1];
674  }
675  total_sent = sendsize;
676  std::vector<gno_t> sendbuf(sendsize,0);
677  //construct sendbuf, send each owned vertex's GID
678  //and its color to the processes that have a
679  //ghost copy of that vertex. If a vertex is not ghosted,
680  //it does not get sent anywhere.
681  for(size_t i = 0; i < verts_to_send_size(0); i++){
682  std::vector<int> procs = procs_to_send.at(verts_to_send(i));
683  for(size_t j = 0; j < procs.size(); j++){
684  size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
685  sentcount[procs[j]] += 2;
686  sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
687  sendbuf[idx] = colors(verts_to_send(i));
688  }
689  }
690 
691  Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
692  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
693  Teuchos::ArrayRCP<gno_t> recvbuf;
694  std::vector<int> recvcnts(comm->getSize(), 0);
695  Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
696 
697  //if we're reporting times, remove the computation imbalance from the comm timer
698  if(timing) comm->barrier();
699  double comm_total = 0.0;
700  double comm_temp = timer();
701 
702  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
703  comm_total += timer() - comm_temp;
704 
705  //compute total recvsize for updating local ghost colors
706  gno_t recvsize = 0;
707  for(int i = 0; i < recvcnts_view.size(); i++){
708  recvsize += recvcnts_view[i];
709  }
710  total_recvd = recvsize;
711  //update the local ghost copies with the color we just received.
712  for(int i = 0; i < recvsize; i+=2){
713  size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
714  colors(lid) = recvbuf[i+1];
715  }
716 
717  return comm_total;
718  }
719 
720  public:
721  //constructor
723  const RCP<const base_adapter_t> &adapter_,
724  const RCP<Teuchos::ParameterList> &pl_,
725  const RCP<Environment> &env_,
726  const RCP<const Teuchos::Comm<int> > &comm_)
727  : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
728  verbose = pl->get<bool>("verbose",false);
729  timing = pl->get<bool>("timing", false);
730  }
731  //Main entry point for graph coloring
732  void color( const RCP<ColoringSolution<Adapter> > &solution){
733  //convert from global graph to local graph
734  ArrayView<const gno_t> vtxIDs;
735  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
736 
737  modelFlag_t flags;
738  flags.set(REMOVE_SELF_EDGES);
739  const auto model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
740  this->comm, flags));
741  size_t nVtx = model->getVertexList(vtxIDs, vwgts);
742  // the weights are not used at this point.
743 
744  //get the adjacencies in a view that holds GIDs
745  ArrayView<const gno_t> adjs;
746  //get the CSR offsets
747  ArrayView<const offset_t> offsets;
748  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
749  model->getEdgeList(adjs, offsets, ewgts);
750 
751  //This map is used to map GID to LID initially
752  std::unordered_map<gno_t,lno_t> globalToLocal;
753  //this vector holds GIDs for owned and ghost vertices
754  //The final order of the gids is:
755  // -----------------------------------
756  // | | first | second |
757  // | owned | layer | layer |
758  // | | ghosts | ghosts |
759  // -----------------------------------
760  //
761  // constructSecondGhostLayer reorders the first layer ghost
762  // GIDs in the particular order that the communication requires.
763 
764  std::vector<gno_t> ownedPlusGhosts;
765 
766  //this vector holds the owning process for
767  //owned vertices and the first ghost layer.
768  //owners[i] gives the processor owning ownedPlusGhosts[i],
769  //for owned vertices and first layer ghosts.
770  //
771  //This should only be used BEFORE the call to constructSecondGhostLayer
772  std::vector<int> owners;
773 
774  //fill these structures using owned vertices
775  for(int i = 0; i < vtxIDs.size(); i++){
776  globalToLocal[vtxIDs[i]] = i;
777  ownedPlusGhosts.push_back(vtxIDs[i]);
778  owners.push_back(comm->getRank());
779  }
780 
781  //count the initial number of ghosts,
782  //map them to a local ID.
783  //The globalToLocal map has first layer ghosts
784  //from here on.
785  int nGhosts = 0;
786  std::vector<lno_t> local_adjs;
787  for(int i = 0; i < adjs.size(); i++){
788  if(globalToLocal.count(adjs[i])==0){
789  ownedPlusGhosts.push_back(adjs[i]);
790  globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
791  nGhosts++;
792  }
793  local_adjs.push_back(globalToLocal[adjs[i]]);
794  }
795 
796  //construct a Tpetra map for the FEMultiVector
797  Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
798  <Tpetra::global_size_t>::invalid();
799  RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
800 
801  //GID and owner lookup for ghost vertices
802  std::vector<gno_t> ghosts;
803  std::vector<int> ghostowners;
804  for(size_t i = nVtx; i < nVtx+nGhosts; i++){
805  ghosts.push_back(ownedPlusGhosts[i]);
806  ghostowners.push_back(-1);
807  }
808 
809  //get the owners of the ghost vertices
810  ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
811  ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
812  mapOwned->getRemoteIndexList(gids, owningProcs);
813 
814  //add ghost owners to the total owner vector
815  for(size_t i = 0; i < ghostowners.size(); i++){
816  owners.push_back(ghostowners[i]);
817  }
818 
819  //construct the second ghost layer.
820  //NOTE: this may reorder the LIDs of the ghosts.
821  //
822  //these vectors hold the CSR representation of the
823  // first ghost layer. This CSR contains:
824  // - edges from first layer ghosts to:
825  // - owned vertices
826  // - first layer ghosts
827  // - second layer ghosts
828  //
829  // - first_layer_ghost_adjs uses GIDs that need
830  // translated to LIDs.
831  //
832  // - first_layer_ghost_offsets are indexed by LID,
833  // first_layer_ghost_offsets[i] corresponds to
834  // the vertex with GID = ownedPlusGhosts[i+nVtx].
835  // first_layer_ghost_offsets.size() = #first layer ghosts + 1
836  //
837  std::vector< gno_t> first_layer_ghost_adjs;
838  std::vector< offset_t> first_layer_ghost_offsets;
839  constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
840  first_layer_ghost_adjs, first_layer_ghost_offsets);
841 
842  //we potentially reordered the local IDs of the ghost vertices, so we need
843  //to re-insert the GIDs into the global to local ID mapping.
844  globalToLocal.clear();
845  for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
846  globalToLocal[ownedPlusGhosts[i]] = i;
847  }
848 
849  //use updated globalToLocal map to translate
850  //adjacency GIDs to LIDs
851  for(int i = 0 ; i < adjs.size(); i++){
852  local_adjs[i] = globalToLocal[adjs[i]];
853  }
854 
855  //at this point, we have ownedPlusGhosts with 1layer ghosts' GIDs.
856  //Need to map the second ghost layer GIDs to new local IDs,
857  //and add them to the map, as well as translating 2layer ghost
858  //adjacencies to use those new LIDs.
859  size_t n2Ghosts = 0;
860 
861  //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
862  std::vector<lno_t> local_ghost_adjs;
863  for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
864  if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
865  ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
866  globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
867  n2Ghosts++;
868  }
869  local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
870  }
871 
872  //construct data structures necessary for FEMultiVector
873  if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
874  dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
875  RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
876  Teuchos::arrayViewFromVector(ownedPlusGhosts),
877  0, comm));
878  if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";
879 
880  using import_t = Tpetra::Import<lno_t, gno_t>;
881  Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
882  mapWithCopies));
883  if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
884  Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
885  importer, 1, true));
886  if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";
887 
888  //Create random numbers seeded on global IDs to resolve conflicts
889  //consistently between processes.
890  std::vector<int> rand(ownedPlusGhosts.size());
891  for(size_t i = 0; i < rand.size(); i++){
892  std::srand(ownedPlusGhosts[i]);
893  rand[i] = std::rand();
894  }
895 
896  // get up-to-date owners for all ghosts.
897  std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
898  std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
899  for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
900  Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
901  Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
902  mapOwned->getRemoteIndexList(ghostGIDs,owners2);
903  if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";
904 
905  //calculations for how many 2GL verts are in the boundary of another process, only
906  //do this if it's going to be displayed.
907  if(verbose) {
908  std::cout<<comm->getRank()<<": calculating 2GL stats...\n";
909 
910  std::vector<int> sendcounts(comm->getSize(),0);
911  std::vector<gno_t> sdispls(comm->getSize()+1,0);
912  //loop through owners, count how many vertices we'll send to each processor
913  for(int i = nGhosts; i < ghostGIDs.size(); i++){
914  if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
915  }
916  //construct sdispls (for building sendbuf), and sum the total sendcount
917  gno_t sendcount = 0;
918  for(int i = 1; i < comm->getSize()+1; i++){
919  sdispls[i] = sdispls[i-1] + sendcounts[i-1];
920  sendcount += sendcounts[i-1];
921  }
922  std::vector<gno_t> idx(comm->getSize(),0);
923  for(int i = 0; i < comm->getSize(); i++){
924  idx[i]=sdispls[i];
925  }
926  //construct sendbuf to send GIDs to owning processes
927  std::vector<gno_t> sendbuf(sendcount,0);
928  for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
929  if(owners2[i] != comm->getRank() && owners2[i] != -1){
930  sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
931  }
932  }
933  //do the communication
934  Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
935  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
936  Teuchos::ArrayRCP<gno_t> recvbuf;
937  std::vector<int> recvcounts(comm->getSize(),0);
938  Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
939  Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
940  std::vector<int> is_bndry_send(recvbuf.size(),0);
941 
942  //send back how many received vertices are in the boundary
943  for(int i = 0; i < recvbuf.size(); i++){
944  size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
945  is_bndry_send[i] = 0;
946  if(lid < nVtx){
947  for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
948  if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
949  }
950  } else{
951  for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
952  if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
953  }
954  }
955  }
956 
957  //send back the boundary flags
958  Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
959  Teuchos::ArrayRCP<int> is_bndry_recv;
960  std::vector<int> bndry_recvcounts(comm->getSize(),0);
961  Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
962  Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);
963 
964  //add together the flags to compute how many boundary vertices are in the second ghost layer
965  int boundaryverts = 0;
966  for(int i = 0; i < is_bndry_recv.size(); i++){
967  boundaryverts += is_bndry_recv[i];
968  }
969  //this cout is guarded by a verbose check.
970  std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
971  }
972 
973 
974  //local CSR representation for the owned vertices:
975  Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
976  //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
977  //
978  //first ghost layer CSR representation:
979  Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
980  Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
981  Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
982  Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
983  //these ArrayViews contain LIDs that are ghosted remotely,
984  //and the Process IDs that have those ghost copies.
985  //An LID may appear in the exportLIDs more than once.
986  Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
987  Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
988 
989  //construct a quick lookup datastructure to map from LID to a
990  //list of PIDs we need to send data to.
991  std::unordered_map<lno_t, std::vector<int>> procs_to_send;
992  for(int i = 0; i < exportLIDs.size(); i++){
993  procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
994  }
995 
996  //call the coloring algorithm
997  twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
998  femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);
999 
1000  //copy colors to the output array
1001  ArrayRCP<int> colors = solution->getColorsRCP();
1002  auto femvdata = femv->getData(0);
1003  for(size_t i=0; i<nVtx; i++){
1004  colors[i] = femvdata[i];
1005  }
1006 
1007  }
1008 // private:
1009 
1010  //This is the coloring logic for all 2GL-based methods.
1011  //
1012  //INPUT ARGS:
1013  //
1014  // n_local: the number of local owned vertices
1015  //
1016  // n_total: the number of local owned vertices plus first layer ghosts
1017  //
1018  // adjs: the CSR adjacencies for the graph, only including owned vertices
1019  // and first layer ghosts
1020  //
1021  // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1022  //
1023  // ghost_adjs: the adjacency information for first-layer ghost vertices.
1024  // Includes all neighbors (owned, first-layer ghost,
1025  // second-layer ghost) for each first-layer ghost.
1026  //
1027  //
1028  // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1029  // minus n_local are used to index this
1030  // datastructure. I.E. ghost_offsets(0)
1031  // corresponds to the ghost with LID n_local
1032  //
1033  // gids: a vector that holds GIDs for all vertices on this process
1034  // (includes owned, and all ghosts, indexable by LID)
1035  // The GIDs are ordered like this:
1036  // ----------------------------------
1037  // | | first | second |
1038  // | owned | layer | layer |
1039  // | | ghosts | ghosts |
1040  // ----------------------------------
1041  // ^ ^
1042  // n_local n_total
1043  //
1044  // rand: a vector that holds random numbers generated on GID for tie
1045  // breaking. Indexed by LID.
1046  //
1047  // ghost_owners: contains the process ID for the owner of each remote vertex.
1048  // Indexable by LID. owners[i] = the owning process for vertex
1049  // with GID = gids[i+n_local]
1050  //
1051  // mapOwnedPlusGhosts: map that can translate between GID and LID
1052  //
1053  // procs_to_send: for each vertex that is copied on a remote process,
1054  // this map contains the list of processes that have
1055  // a copy of a given vertex. Input LID, get the list
1056  // of PIDs that have a ghost copy of that LID.
1057  //
1058  //OUTPUT ARGS:
1059  //
1060  // femv: an FEMultiVector that holds a dual view of the colors.
1061  // After this call, femv contains updated colors.
1062  //
1063  void twoGhostLayer(const size_t n_local, const size_t n_total,
1064  const Teuchos::ArrayView<const lno_t>& adjs,
1065  const Teuchos::ArrayView<const offset_t>& offsets,
1066  const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1067  const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1068  const Teuchos::RCP<femv_t>& femv,
1069  const Teuchos::ArrayView<const gno_t>& gids,
1070  const Teuchos::ArrayView<const int>& rand,
1071  const Teuchos::ArrayView<const int>& ghost_owners,
1072  RCP<const map_t> mapOwnedPlusGhosts,
1073  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1074  //initialize timing variables
1075  double total_time = 0.0;
1076  double interior_time = 0.0;
1077  double comm_time = 0.0;
1078  double comp_time = 0.0;
1079  double recoloring_time=0.0;
1080  double conflict_detection = 0.0;
1081 
1082  //Number of rounds we are saving statistics for
1083  //100 is a decent default. Reporting requires --verbose argument.
1084  const int numStatisticRecordingRounds = 100;
1085 
1086  //includes all ghosts, including the second layer.
1087  const size_t n_ghosts = rand.size() - n_local;
1088 
1089 
1090  //Get the degrees of all ghost vertices
1091  //This is necessary for recoloring based on degrees,
1092  //we need ghost vertex degrees for consistency.
1093  std::vector<int> deg_send_cnts(comm->getSize(),0);
1094  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1095  for(int i = 0; i < ghost_owners.size(); i++){
1096  deg_send_cnts[ghost_owners[i]]++;
1097  }
1098  deg_sdispls[0] = 0;
1099  gno_t deg_sendsize = 0;
1100  std::vector<int> deg_sentcount(comm->getSize(),0);
1101  for(int i = 1; i < comm->getSize()+1; i++){
1102  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1103  deg_sendsize += deg_send_cnts[i-1];
1104  }
1105  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1106  for(int i = 0; i < ghost_owners.size(); i++){
1107  size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1108  deg_sentcount[ghost_owners[i]]++;
1109  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1110  }
1111  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1112  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1113  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1114  std::vector<int> deg_recvcnts(comm->getSize(),0);
1115  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1116  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1117 
1118  //replace GID with the degree the owning process has for this vertex.
1119  //(this will include all edges present for this vertex globally)
1120  //This is safe to do, since we sent ghosts to their owners.
1121  for(int i = 0; i < deg_recvbuf.size(); i++){
1122  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1123  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1124  }
1125  //send modified buffer back
1126  ArrayRCP<gno_t> ghost_degrees;
1127  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1128 
1129  //create the ghost degree views, for use on host and device.
1130  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1131  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1132  for(int i = 0; i < ghost_degrees.size(); i++){
1133  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1134  ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1135  }
1136  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1137 
1138  //track the size of sends and receives through coloring rounds.
1139  gno_t recvPerRound[numStatisticRecordingRounds];
1140  gno_t sentPerRound[numStatisticRecordingRounds];
1141 
1142  //find global max degree to determine which
1143  //coloring algorithm will be the most efficient
1144  //(For D1-2GL this is important, D2 and PD2 should always use NBBIT
1145  offset_t local_max_degree = 0;
1146  offset_t global_max_degree = 0;
1147  for(size_t i = 0; i < n_local; i++){
1148  offset_t curr_degree = offsets[i+1] - offsets[i];
1149  if(curr_degree > local_max_degree){
1150  local_max_degree = curr_degree;
1151  }
1152  }
1153  Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1154  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1155 
1156  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1157 
1158  //the initial coloring is able to use a standard CSR representation, so we construct that here.
1159  //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1160  Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1161  typename Kokkos::View<offset_t*, device_type>::HostMirror offsets_host = Kokkos::create_mirror(offsets_dev);
1162  Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1163  typename Kokkos::View<lno_t*, device_type>::HostMirror adjs_host = Kokkos::create_mirror(adjs_dev);
1164  for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1165  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1166  Kokkos::deep_copy(offsets_dev,offsets_host);
1167  Kokkos::deep_copy(adjs_dev, adjs_host);
1168 
1169 
1170  //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1171  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1172 
1173  //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1174  //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1175  Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1176  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1177 
1178  //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1179  //to be used to construct the CSR representation of the local graph.
1180  //owned
1181  for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1182  //first layer ghosts
1183  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1184  //second layer ghosts
1185  for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1186  //second layer ghosts have LID >= n_total
1187  if((size_t)ghost_adjs[i] >= n_total ){
1188  dist_degrees_host(ghost_adjs[i])++;
1189  }
1190  }
1191 
1192  //The structure of the final CSR constructed here looks like:
1193  //
1194  // Index by LID--v
1195  // --------------------------------------------
1196  // | | first |second |
1197  // dist_offsets_dev/host |owned verts | layer |layer |
1198  // | | ghosts |ghosts |
1199  // --------------------------------------------
1200  // |indexes
1201  // v
1202  // --------------------------------------------
1203  // | |first |second |
1204  // dist_adjs_dev/host |owned adjs |layer |layer |
1205  // | |ghost adjs |ghost adjs |
1206  // --------------------------------------------
1207  //
1208  // We end up with a CSR that has adjacency information for all the vertices on
1209  // this process. We include only edges on the process, so ghosts have only partial
1210  // adjacency information.
1211  //
1212  // We symmetrize the local graph representation as well, for
1213  // KokkosKernels to behave as we need it to. This means that edges to
1214  // second layer ghosts must be symmetrized, so we end up with offsets
1215  // that correspond to second layer ghosts.
1216  Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1217  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1218 
1219  //we can construct the entire offsets using the degrees we computed above
1220  dist_offsets_host(0) = 0;
1221  offset_t total_adjs = 0;
1222  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1223  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1224  total_adjs += dist_degrees_host(i-1);
1225  }
1226  Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1227  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1228 
1229  //zero out the degrees to use it as a counter.
1230  //The offsets now allow us to compute degrees,
1231  //so we aren't losing anything.
1232  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1233  dist_degrees_host(i) = 0;
1234  }
1235  //We can just copy the adjacencies for the owned verts and first layer ghosts
1236  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1237  for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1238 
1239  //We have to symmetrize edges that involve a second layer ghost.
1240  //Add edges from the second layer ghosts to their neighbors.
1241  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1242  //loop through each first layer ghost
1243  for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1244  //if an adjacency is a second layer ghost
1245  if((size_t)ghost_adjs[j] >= n_total){
1246  //compute the offset to place the symmetric edge, and place it.
1247  dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1248  //increment the counter to keep track of how many adjacencies
1249  //have been placed.
1250  dist_degrees_host(ghost_adjs[j])++;
1251  }
1252  }
1253  }
1254  //copy the host views back to the device views
1255  Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1256  Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1257  Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1258 
1259  //this view represents how many conflicts were found
1260  Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1261  typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1262  recoloringSize_host(0) = 0;
1263  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1264 
1265  //create a view for the tie-breaking random numbers.
1266  if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1267  Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1268  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
1269  for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1270  Kokkos::deep_copy(rand_dev,rand_host);
1271 
1272  //create a view for the global IDs of each vertex.
1273  Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1274  typename Kokkos::View<gno_t*, device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
1275  for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1276  Kokkos::deep_copy(gid_dev,gid_host);
1277 
1278  //These variables will be initialized by constructBoundary()
1279  //
1280  //boundary_verts_dev holds all owned vertices that could possibly conflict
1281  //with a remote vertex. Stores LIDs.
1282  Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1283  //this is the number of vertices in the boundary.
1284  if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1285 
1286  //We keep track of the vertices that need to get recolored
1287  //this list can contain ghost vertices, so it needs to be initialized
1288  //to the number of all vertices on the local process.
1289  //rand has an entry for each vertex, so its size corresponds to what is needed.
1290  Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1291  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1292 
1293  //This view keeps track of the size of the list of vertices to recolor.
1294  Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1295  Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1296  typename Kokkos::View<int*, device_type>::HostMirror verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1297 
1298  //initialize the host view
1299  verts_to_recolor_size_host(0) = 0;
1300  //copy to device
1301  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1302 
1303  //verts to send can only include locally owned vertices,
1304  //so we can safely allocate the view to size n_local.
1305  //
1306  //This view is a list of local vertices that are going to be
1307  //recolored, and need to be sent to their remote copies.
1308  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1309  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
1310 
1311  //this view keeps track of the size of verts_to_send.
1312  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1313  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1314  typename Kokkos::View<size_t*, device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
1315 
1316  verts_to_send_size_host(0) = 0;
1317  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1318 
1319  if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1320 
1321  //this function allocates and initializes the boundary_verts_dev view,
1322  //sets the boundary_size accordingly, and adds vertices to the
1323  //verts_to_send_atomic view, updating the size view as well.
1324  constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1325  verts_to_send_view, verts_to_send_size_atomic);
1326 
1327  //this boolean chooses which KokkosKernels algorithm to use.
1328  //It was experimentally chosen for distance-1 coloring.
1329  //Should be disregarded for distance-2 colorings.
1330  bool use_vbbit = (global_max_degree < 6000);
1331  //Done initializing, start coloring!
1332 
1333  //use a barrier if we are reporting timing info
1334  if(timing) comm->barrier();
1335  interior_time = timer();
1336  total_time = timer();
1337  //give the entire local graph to KokkosKernels to color
1338  this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1339  interior_time = timer() - interior_time;
1340  comp_time = interior_time;
1341 
1342  //ghost_colors holds the colors of only ghost vertices.
1343  //ghost_colors(0) holds the color of a vertex with LID n_local.
1344  //To index this with LIDs, subtract n_local. If an LID is < n_local,
1345  //it will not have a color stored in this view.
1346  Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
1347 
1348  //communicate the initial coloring.
1349  if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1350 
1351  //update the host views for the verts to send,
1352  //doOwnedToGhosts needs host memory views, but doesn't
1353  //change the values in them, so no need to copy afterwards
1354  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1355  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1356  gno_t recv,sent;
1357  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1358  sentPerRound[0] = sent;
1359  recvPerRound[0] = recv;
1360 
1361  //store ghost colors so we can restore them after recoloring.
1362  //the local process can't color ghosts correctly, so we
1363  //reset the colors to avoid consistency issues.
1364  //get the color view from the FEMultiVector
1365  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1366  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1367  Kokkos::parallel_for("get femv colors",
1368  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1369  KOKKOS_LAMBDA(const int& i){
1370  ghost_colors(i) = femv_colors(i+n_local);
1371  });
1372  Kokkos::fence();
1373 
1374  double temp = timer();
1375  //detect conflicts only for ghost vertices
1376  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1377  if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1378 
1379  //these sizes will be updated by detectConflicts, zero them out beforehand
1380  verts_to_send_size_host(0) = 0;
1381  verts_to_recolor_size_host(0) = 0;
1382  recoloringSize_host(0) = 0;
1383  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1384  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1385  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1386 
1387  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1388  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1389  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1390 
1391  //these sizes were updated by detectConflicts,
1392  //copy the device views back into host memory
1393  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1394  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1395  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1396  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1397 
1398  if(comm->getSize() > 1){
1399  conflict_detection = timer() - temp;
1400  comp_time += conflict_detection;
1401  }
1402  //all conflicts detected!
1403  if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1404  //variables for statistics
1405  double totalPerRound[numStatisticRecordingRounds];
1406  double commPerRound[numStatisticRecordingRounds];
1407  double compPerRound[numStatisticRecordingRounds];
1408  double recoloringPerRound[numStatisticRecordingRounds];
1409  double conflictDetectionPerRound[numStatisticRecordingRounds];
1410  uint64_t vertsPerRound[numStatisticRecordingRounds];
1411  uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1412  int distributedRounds = 1;
1413  totalPerRound[0] = interior_time + comm_time + conflict_detection;
1414  recoloringPerRound[0] = 0;
1415  commPerRound[0] = comm_time;
1416  compPerRound[0] = interior_time + conflict_detection;
1417  conflictDetectionPerRound[0] = conflict_detection;
1418  recoloringPerRound[0] = 0;
1419  vertsPerRound[0] = 0;
1420  incorrectGhostsPerRound[0]=0;
1421  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
1422  typename Kokkos::View<lno_t*, device_type>::HostMirror boundary_verts_host;
1423  size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1424  //see if recoloring is necessary.
1425  size_t totalConflicts = 0;
1426  size_t localConflicts = recoloringSize_host(0);
1427  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1428  bool done = !totalConflicts;
1429  if(comm->getSize()==1) done = true;
1430 
1431  //recolor until no conflicts are left
1432  while(!done){
1433  //if the number of vertices left to recolor is less than
1434  //the serial threshold set by the user, finish the coloring
1435  //only using host views in a serial execution space.
1436  if(recoloringSize_host(0) < serial_threshold) break;
1437  if(distributedRounds < numStatisticRecordingRounds) {
1438  vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1439  }
1440 
1441  if(timing) comm->barrier();
1442  double recolor_temp = timer();
1443  //recolor using KokkosKernels' coloring function
1444  if(verts_to_recolor_size_host(0) > 0){
1445  this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1446  }
1447 
1448  if(distributedRounds < numStatisticRecordingRounds){
1449  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1450  recoloring_time += recoloringPerRound[distributedRounds];
1451  comp_time += recoloringPerRound[distributedRounds];
1452  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1453  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1454  } else if(timing){
1455  double recoloring_round_time = timer() - recolor_temp;
1456  recoloring_time += recoloring_round_time;
1457  comp_time += recoloring_round_time;
1458  }
1459 
1460  //reset the ghost colors to what they were before recoloring
1461  //the recoloring does not have enough information to color
1462  //ghosts correctly, so we set the colors to what they were before
1463  //to avoid consistency issues.
1464  Kokkos::parallel_for("set femv colors",
1465  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1466  KOKKOS_LAMBDA(const int& i){
1467  femv_colors(i+n_local) = ghost_colors(i);
1468  });
1469  Kokkos::fence();
1470 
1471  //send views are up-to-date, they were copied after conflict detection.
1472  //communicate the new colors
1473 
1474  // Reset device views
1475  femvColors = decltype(femvColors)();
1476  femv_colors = decltype(femv_colors)();
1477  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1478  comm_time += curr_comm_time;
1479 
1480  if(distributedRounds < numStatisticRecordingRounds){
1481  commPerRound[distributedRounds] = curr_comm_time;
1482  recvPerRound[distributedRounds] = recv;
1483  sentPerRound[distributedRounds] = sent;
1484  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1485  }
1486 
1487  //store ghost colors before we uncolor and recolor them.
1488  //this process doesn't have enough info to correctly color
1489  //ghosts, so we set them back to what they were before to
1490  //remove consistency issues.
1491  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1492  femv_colors = subview(femvColors, Kokkos::ALL, 0);
1493  Kokkos::parallel_for("get femv colors 2",
1494  Kokkos::RangePolicy<execution_space, int>(0,n_ghosts),
1495  KOKKOS_LAMBDA(const int& i){
1496  ghost_colors(i) = femv_colors(i+n_local);
1497  });
1498  Kokkos::fence();
1499 
1500 
1501  //these views will change in detectConflicts, they need
1502  //to be zeroed out beforehand
1503  verts_to_send_size_host(0) = 0;
1504  verts_to_recolor_size_host(0) = 0;
1505  recoloringSize_host(0) = 0;
1506  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1507  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1508  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1509 
1510  //check for further conflicts
1511  double detection_temp = timer();
1512 
1513  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1514  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1515  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1516 
1517  //copy the updated device views back into host memory where necessary
1518  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1519  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1520  //we only use the list of verts to recolor on device, no need to copy to host.
1521  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1522  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1523 
1524  if(distributedRounds < numStatisticRecordingRounds){
1525  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1526  conflict_detection += conflictDetectionPerRound[distributedRounds];
1527  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1528  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1529  comp_time += conflictDetectionPerRound[distributedRounds];
1530  } else if(timing){
1531  double conflict_detection_round_time = timer() - detection_temp;
1532  conflict_detection += conflict_detection_round_time;
1533  comp_time += conflict_detection_round_time;
1534  }
1535 
1536  distributedRounds++;
1537  size_t localDone = recoloringSize_host(0);
1538  size_t globalDone = 0;
1539  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1540  done = !globalDone;
1541 
1542  }//end device coloring
1543 
1544 
1545  //If we reach this point and still have vertices to color,
1546  //the rest of the coloring is happening in serial on the host.
1547  //
1548  //First, we need to copy device-only views to host mirrors
1549  if(recoloringSize_host(0) > 0 || !done){
1550  ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1551  boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1552  }
1553 
1554  //Now we do a similar coloring loop to before,
1555  //but using only host views in a serial execution space.
1556  // Reset device views
1557  femvColors = decltype(femvColors)();
1558  femv_colors = decltype(femv_colors)();
1559  while(recoloringSize_host(0) > 0 || !done){
1560  auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1561  auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1562  if(distributedRounds < numStatisticRecordingRounds){
1563  vertsPerRound[distributedRounds] = recoloringSize_host(0);
1564  }
1565  if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1566  if(timing) comm->barrier();
1567 
1568  double recolor_temp = timer();
1569  if(verts_to_recolor_size_host(0) > 0){
1570  this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1571  verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1572  }
1573  if(distributedRounds < numStatisticRecordingRounds){
1574  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1575  recoloring_time += recoloringPerRound[distributedRounds];
1576  comp_time += recoloringPerRound[distributedRounds];
1577  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1578  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1579  } else if(timing){
1580  double recoloring_serial_round_time = timer() - recolor_temp;
1581  recoloring_time += recoloring_serial_round_time;
1582  comp_time += recoloring_serial_round_time;
1583  }
1584 
1585  //reset the ghost colors to their previous values to avoid
1586  //consistency issues.
1587  for(size_t i = 0; i < n_ghosts; i++){
1588  colors_host(i+n_local) = ghost_colors_host(i);
1589  }
1590 
1591  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1592  comm_time += curr_comm_time;
1593 
1594  if(distributedRounds < numStatisticRecordingRounds){
1595  commPerRound[distributedRounds] = curr_comm_time;
1596  recvPerRound[distributedRounds] = recv;
1597  sentPerRound[distributedRounds] = sent;
1598  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1599  }
1600 
1601  //store updated ghost colors we received from their owners
1602  //before conflict detection and recoloring changes them locally.
1603  for(size_t i = 0; i < n_ghosts; i++){
1604  ghost_colors_host(i) = colors_host(i+n_local);
1605  }
1606 
1607  if(timing) comm->barrier();
1608  double detection_temp = timer();
1609 
1610  //zero these out, they'll be updated by detectConflicts_serial
1611  verts_to_recolor_size_host(0) = 0;
1612  verts_to_send_size_host(0) = 0;
1613  recoloringSize_host(0) = 0;
1614 
1615  detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1616  verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1617  recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1618 
1619  //no need to update the host views, we're only using host views now.
1620 
1621  if(distributedRounds < numStatisticRecordingRounds){
1622  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1623  conflict_detection += conflictDetectionPerRound[distributedRounds];
1624  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1625  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1626  comp_time += conflictDetectionPerRound[distributedRounds];
1627  } else if(timing){
1628  double conflict_detection_serial_round_time = timer() - detection_temp;
1629  conflict_detection += conflict_detection_serial_round_time;
1630  comp_time += conflict_detection_serial_round_time;
1631  }
1632 
1633  size_t globalDone = 0;
1634  size_t localDone = recoloringSize_host(0);
1635  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1636  distributedRounds++;
1637  done = !globalDone;
1638  }
1639 
1640  total_time = timer() - total_time;
1641  //only compute stats if they will be displayed
1642  if(verbose){
1643  uint64_t localBoundaryVertices = 0;
1644  for(size_t i = 0; i < n_local; i++){
1645  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1646  if((size_t)adjs[j] >= n_local){
1647  localBoundaryVertices++;
1648  break;
1649  }
1650  }
1651  }
1652 
1653  //if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1654  uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1655  uint64_t totalBoundarySize = 0;
1656  uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1657  double finalTotalPerRound[numStatisticRecordingRounds];
1658  double maxRecoloringPerRound[numStatisticRecordingRounds];
1659  double minRecoloringPerRound[numStatisticRecordingRounds];
1660  double finalCommPerRound[numStatisticRecordingRounds];
1661  double finalCompPerRound[numStatisticRecordingRounds];
1662  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1663  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1664  gno_t finalSentPerRound[numStatisticRecordingRounds];
1665  for(int i = 0; i < numStatisticRecordingRounds; i++){
1666  totalVertsPerRound[i] = 0;
1667  finalTotalPerRound[i] = 0.0;
1668  maxRecoloringPerRound[i] = 0.0;
1669  minRecoloringPerRound[i] = 0.0;
1670  finalCommPerRound[i] = 0.0;
1671  finalCompPerRound[i] = 0.0;
1672  finalConflictDetectionPerRound[i] = 0.0;
1673  finalRecvPerRound[i] = 0;
1674  finalSentPerRound[i] = 0;
1675  }
1676  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1677  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1678  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1679  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1680  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1681  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1682  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1683  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1684  Teuchos::reduceAll<int,double>(*comm,
1685  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1686  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1687  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1688  std::cout << "Rank " << comm->getRank()
1689  << ": boundary size: " << localBoundaryVertices << std::endl;
1690  if(comm->getRank() == 0)
1691  std::cout << "Total boundary size: " << totalBoundarySize << std::endl;
1692  for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1693  std::cout << "Rank " << comm->getRank()
1694  << ": recolor " << vertsPerRound[i]
1695  << " vertices in round " << i << std::endl;
1696  std::cout << "Rank " << comm->getRank()
1697  << " sentbuf had " << sentPerRound[i]
1698  << " entries in round " << i << std::endl;
1699  if(comm->getRank()==0){
1700  std::cout << "recolored " << totalVertsPerRound[i]
1701  << " vertices in round " << i << std::endl;
1702  std::cout << totalIncorrectGhostsPerRound[i]
1703  << " inconsistent ghosts in round " << i << std::endl;
1704  std::cout << "total time in round " << i
1705  << ": " << finalTotalPerRound[i] << std::endl;
1706  std::cout << "recoloring time in round " << i
1707  << ": " << maxRecoloringPerRound[i] << std::endl;
1708  std::cout << "min recoloring time in round " << i
1709  << ": " << minRecoloringPerRound[i] << std::endl;
1710  std::cout << "conflict detection time in round " << i
1711  << ": " << finalConflictDetectionPerRound[i] << std::endl;
1712  std::cout << "comm time in round " << i
1713  << ": " << finalCommPerRound[i] << std::endl;
1714  std::cout << "recvbuf size in round " << i
1715  << ": " << finalRecvPerRound[i] << std::endl;
1716  std::cout << "sendbuf size in round " << i
1717  << ": " << finalSentPerRound[i] << std::endl;
1718  std::cout << "comp time in round " << i
1719  << ": " << finalCompPerRound[i] << std::endl;
1720  }
1721  }
1722  } else if (timing){
1723  double global_total_time = 0.0;
1724  double global_recoloring_time = 0.0;
1725  double global_min_recoloring_time = 0.0;
1726  double global_conflict_detection=0.0;
1727  double global_comm_time=0.0;
1728  double global_comp_time=0.0;
1729  double global_interior_time=0.0;
1730  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1731  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1732  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1733  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1734  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1735  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1736  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1737  comm->barrier();
1738  fflush(stdout);
1739  if(comm->getRank()==0){
1740  std::cout << "Total Time: " << global_total_time << std::endl;
1741  std::cout << "Interior Time: " << global_interior_time << std::endl;
1742  std::cout << "Recoloring Time: " << global_recoloring_time << std::endl;
1743  std::cout << "Min Recoloring Time: " << global_min_recoloring_time << std::endl;
1744  std::cout << "Conflict Detection Time: " << global_conflict_detection << std::endl;
1745  std::cout << "Comm Time: " << global_comm_time << std::endl;
1746  std::cout << "Comp Time: " << global_comp_time << std::endl;
1747  }
1748  }
1749  }
1750 }; //end class
1751 
1752 }//end namespace Zoltan2
1753 
1754 #endif
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
typename Adapter::gno_t gno_t
virtual void detectConflicts(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, Kokkos::View< int *, device_type > femv_colors, Kokkos::View< lno_t *, device_type > boundary_verts_view, Kokkos::View< lno_t *, device_type > verts_to_recolor_view, Kokkos::View< int *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_recolor_size_atomic, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< size_t *, device_type > recoloringSize, Kokkos::View< int *, device_type > rand, Kokkos::View< gno_t *, device_type > gid, Kokkos::View< gno_t *, device_type > ghost_degrees, bool recolor_degrees)=0
typename Adapter::lno_t lno_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:27
typename Adapter::offset_t offset_t
AlgTwoGhostLayer(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
virtual void constructBoundary(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, Kokkos::View< lno_t *, device_type > &boundary_verts, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic)=0
algorithm requires no self edges
typename femv_t::host_view_type::device_type::memory_space host_mem
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
RCP< Teuchos::ParameterList > pl
Algorithm defines the base class for all algorithms.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:26
typename femv_t::device_type device_type
void twoGhostLayer(const size_t n_local, const size_t n_total, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::ArrayView< const lno_t > &ghost_adjs, const Teuchos::ArrayView< const offset_t > &ghost_offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &ghost_owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS&#39;s idx_t...
typename femv_t::host_view_type::device_type::execution_space host_exec
RCP< const base_adapter_t > adapter
GraphModel defines the interface required for graph models.
RCP< const Teuchos::Comm< int > > comm
Defines the ColoringSolution class.
Tpetra::global_size_t global_size_t
virtual void detectConflicts_serial(const size_t n_local, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, typename Kokkos::View< int *, device_type >::HostMirror femv_colors, typename Kokkos::View< lno_t *, device_type >::HostMirror boundary_verts_view, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_recolor_view, typename Kokkos::View< int *, device_type >::HostMirror verts_to_recolor_size_atomic, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_send_view, typename Kokkos::View< size_t *, device_type >::HostMirror verts_to_send_size_atomic, typename Kokkos::View< size_t *, device_type >::HostMirror recoloringSize, typename Kokkos::View< int *, device_type >::HostMirror rand, typename Kokkos::View< gno_t *, device_type >::HostMirror gid, typename Kokkos::View< gno_t *, device_type >::HostMirror ghost_degrees, bool recolor_degrees)=0
Defines the GraphModel interface.
A gathering of useful namespace methods.
typename Adapter::base_adapter_t base_adapter_t
typename device_type::execution_space execution_space
Tpetra::Map< lno_t, gno_t > map_t
typename Adapter::scalar_t scalar_t
typename device_type::memory_space memory_space
AlltoAll communication methods.
The class containing coloring solution.