MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_AggregationPhase1Algorithm_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
47 #define MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include <queue>
52 #include <vector>
53 
54 #include <Teuchos_Comm.hpp>
55 #include <Teuchos_CommHelpers.hpp>
56 
57 #include <Xpetra_Vector.hpp>
58 
60 
61 #include "MueLu_Aggregates_kokkos.hpp"
62 #include "MueLu_Exceptions.hpp"
63 #include "MueLu_LWGraph_kokkos.hpp"
64 #include "MueLu_Monitor.hpp"
65 
66 #include "KokkosGraph_Distance2ColorHandle.hpp"
67 #include "KokkosGraph_Distance2Color.hpp"
68 
69 namespace MueLu {
70 
71  template <class LocalOrdinal, class GlobalOrdinal, class Node>
72  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
73  BuildAggregates(const ParameterList& params, const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates, std::vector<unsigned>& aggStat,
74  LO& numNonAggregatedNodes) const {
75  Monitor m(*this, "BuildAggregates");
76 
77  std::string orderingStr = params.get<std::string>("aggregation: ordering");
78  int maxNeighAlreadySelected = params.get<int> ("aggregation: max selected neighbors");
79  int minNodesPerAggregate = params.get<int> ("aggregation: min agg size");
80  int maxNodesPerAggregate = params.get<int> ("aggregation: max agg size");
81 
82  Algorithm algorithm = Algorithm::Serial;
83  std::string algoParamName = "aggregation: phase 1 algorithm";
84  if(params.isParameter(algoParamName))
85  {
86  algorithm = algorithmFromName(params.get<std::string>("aggregation: phase 1 algorithm"));
87  }
88 
89  TEUCHOS_TEST_FOR_EXCEPTION(maxNodesPerAggregate < minNodesPerAggregate, Exceptions::RuntimeError,
90  "MueLu::UncoupledAggregationAlgorithm::BuildAggregates: minNodesPerAggregate must be smaller or equal to MaxNodePerAggregate!");
91 
92  //Distance-2 gives less control than serial uncoupled phase 1
93  //no custom row reordering because would require making deep copy of local matrix entries and permuting it
94  //can only enforce max aggregate size
95  if(algorithm == Algorithm::Distance2)
96  {
97  BuildAggregatesDistance2(graph, aggregates, aggStat, numNonAggregatedNodes, maxNodesPerAggregate);
98  }
99  else
100  {
101  BuildAggregatesSerial(graph, aggregates, aggStat, numNonAggregatedNodes,
102  minNodesPerAggregate, maxNodesPerAggregate, maxNeighAlreadySelected, orderingStr);
103  }
104  }
105 
106 
107  template <class LocalOrdinal, class GlobalOrdinal, class Node>
108  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
109  BuildAggregatesSerial(const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates,
110  std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes,
111  LO minNodesPerAggregate, LO maxNodesPerAggregate,
112  LO maxNeighAlreadySelected, std::string& orderingStr) const
113  {
114  enum {
115  O_NATURAL,
116  O_RANDOM,
117  O_GRAPH
118  } ordering;
119 
120  ordering = O_NATURAL; // initialize variable (fix CID 143665)
121  if (orderingStr == "natural") ordering = O_NATURAL;
122  if (orderingStr == "random" ) ordering = O_RANDOM;
123  if (orderingStr == "graph" ) ordering = O_GRAPH;
124 
125  const LO numRows = graph.GetNodeNumVertices();
126  const int myRank = graph.GetComm()->getRank();
127 
128  ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
129  ArrayRCP<LO> procWinner = aggregates.GetProcWinner() ->getDataNonConst(0);
130 
131  LO numLocalAggregates = aggregates.GetNumAggregates();
132 
133  ArrayRCP<LO> randomVector;
134  if (ordering == O_RANDOM) {
135  randomVector = arcp<LO>(numRows);
136  for (LO i = 0; i < numRows; i++)
137  randomVector[i] = i;
138  RandomReorder(randomVector);
139  }
140 
141  int aggIndex = -1;
142  size_t aggSize = 0;
143  std::vector<int> aggList(graph.getNodeMaxNumRowEntries());
144 
145  std::queue<LO> graphOrderQueue;
146 
147  // Main loop over all local rows of graph(A)
148  for (LO i = 0; i < numRows; i++) {
149  // Step 1: pick the next node to aggregate
150  LO rootCandidate = 0;
151  if (ordering == O_NATURAL) rootCandidate = i;
152  else if (ordering == O_RANDOM) rootCandidate = randomVector[i];
153  else if (ordering == O_GRAPH) {
154 
155  if (graphOrderQueue.size() == 0) {
156  // Current queue is empty for "graph" ordering, populate with one READY node
157  for (LO jnode = 0; jnode < numRows; jnode++)
158  if (aggStat[jnode] == READY) {
159  graphOrderQueue.push(jnode);
160  break;
161  }
162  }
163  if (graphOrderQueue.size() == 0) {
164  // There are no more ready nodes, end the phase
165  break;
166  }
167  rootCandidate = graphOrderQueue.front(); // take next node from graph ordering queue
168  graphOrderQueue.pop(); // delete this node in list
169  }
170 
171  if (aggStat[rootCandidate] != READY)
172  continue;
173 
174  // Step 2: build tentative aggregate
175  aggSize = 0;
176  aggList[aggSize++] = rootCandidate;
177 
178  auto neighOfINode = graph.getNeighborVertices(rootCandidate);
179 
180  // If the number of neighbors is less than the minimum number of nodes
181  // per aggregate, we know this is not going to be a valid root, and we
182  // may skip it, but only for "natural" and "random" (for "graph" we still
183  // need to fetch the list of local neighbors to continue)
184  if ((ordering == O_NATURAL || ordering == O_RANDOM) &&
185  as<int>(neighOfINode.length) < minNodesPerAggregate) {
186  continue;
187  }
188 
189  LO numAggregatedNeighbours = 0;
190 
191  for (int j = 0; j < as<int>(neighOfINode.length); j++) {
192  LO neigh = neighOfINode(j);
193 
194  if (neigh != rootCandidate && graph.isLocalNeighborVertex(neigh)) {
195 
196  if (aggStat[neigh] == READY || aggStat[neigh] == NOTSEL) {
197  // If aggregate size does not exceed max size, add node to the
198  // tentative aggregate
199  // NOTE: We do not exit the loop over all neighbours since we have
200  // still to count all aggregated neighbour nodes for the
201  // aggregation criteria
202  // NOTE: We check here for the maximum aggregation size. If we
203  // would do it below with all the other check too big aggregates
204  // would not be accepted at all.
205  if (aggSize < as<size_t>(maxNodesPerAggregate))
206  aggList[aggSize++] = neigh;
207 
208  } else {
209  numAggregatedNeighbours++;
210  }
211  }
212  }
213 
214  // Step 3: check if tentative aggregate is acceptable
215  if ((numAggregatedNeighbours <= maxNeighAlreadySelected) && // too many connections to other aggregates
216  (aggSize >= as<size_t>(minNodesPerAggregate))) { // too few nodes in the tentative aggregate
217  // Accept new aggregate
218  // rootCandidate becomes the root of the newly formed aggregate
219  aggregates.SetIsRoot(rootCandidate);
220  aggIndex = numLocalAggregates++;
221 
222  for (size_t k = 0; k < aggSize; k++) {
223  aggStat [aggList[k]] = AGGREGATED;
224  vertex2AggId[aggList[k]] = aggIndex;
225  procWinner [aggList[k]] = myRank;
226  }
227 
228  numNonAggregatedNodes -= aggSize;
229 
230  } else {
231  // Aggregate is not accepted
232  aggStat[rootCandidate] = NOTSEL;
233 
234  // Need this for the "graph" ordering below
235  // The original candidate is always aggList[0]
236  aggSize = 1;
237  }
238 
239  if (ordering == O_GRAPH) {
240  // Add candidates to the list of nodes
241  // NOTE: the code have slightly different meanings depending on context:
242  // - if aggregate was accepted, we add neighbors of neighbors of the original candidate
243  // - if aggregate was not accepted, we add neighbors of the original candidate
244  for (size_t k = 0; k < aggSize; k++) {
245  auto neighOfJNode = graph.getNeighborVertices(aggList[k]);
246 
247  for (int j = 0; j < as<int>(neighOfJNode.length); j++) {
248  LO neigh = neighOfJNode(j);
249 
250  if (graph.isLocalNeighborVertex(neigh) && aggStat[neigh] == READY)
251  graphOrderQueue.push(neigh);
252  }
253  }
254  }
255  }
256 
257  // Reset all NOTSEL vertices to READY
258  // This simplifies other algorithms
259  for (LO i = 0; i < numRows; i++)
260  if (aggStat[i] == NOTSEL)
261  aggStat[i] = READY;
262 
263  // update aggregate object
264  aggregates.SetNumAggregates(numLocalAggregates);
265  }
266 
267  template <class LocalOrdinal, class GlobalOrdinal, class Node>
268  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::
269  BuildAggregatesDistance2(const LWGraph_kokkos& graph, Aggregates_kokkos& aggregates,
270  std::vector<unsigned>& aggStat, LO& numNonAggregatedNodes, LO maxAggSize) const
271  {
272  const LO numRows = graph.GetNodeNumVertices();
273  const int myRank = graph.GetComm()->getRank();
274 
275  ArrayRCP<LO> vertex2AggId = aggregates.GetVertex2AggId()->getDataNonConst(0);
276  ArrayRCP<LO> procWinner = aggregates.GetProcWinner() ->getDataNonConst(0);
277 
278  LO numLocalAggregates = aggregates.GetNumAggregates();
279 
280  //get the sparse local graph in CRS
281  std::vector<LocalOrdinal> rowptrs;
282  rowptrs.reserve(numRows + 1);
283  std::vector<LocalOrdinal> colinds;
284  colinds.reserve(graph.GetNodeNumEdges());
285 
286  rowptrs.push_back(0);
287  for(LocalOrdinal row = 0; row < numRows; row++)
288  {
289  auto entries = graph.getNeighborVertices(row);
290  for(LocalOrdinal i = 0; i < entries.length; i++)
291  {
292  colinds.push_back(entries.colidx(i));
293  }
294  rowptrs.push_back(colinds.size());
295  }
296 
297  //the local CRS graph to Kokkos device views, then compute graph squared
298  typedef typename Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_type graph_t;
299  typedef typename graph_t::device_type device_t;
300  typedef typename device_t::memory_space memory_space;
301  typedef typename device_t::execution_space execution_space;
302  typedef typename graph_t::row_map_type::non_const_type rowptrs_view;
303  typedef typename rowptrs_view::HostMirror host_rowptrs_view;
304  typedef typename graph_t::entries_type::non_const_type colinds_view;
305  typedef typename colinds_view::HostMirror host_colinds_view;
306  //note: just using colinds_view in place of scalar_view_t type (it won't be used at all by symbolic SPGEMM)
307  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
308  typename rowptrs_view::const_value_type, typename colinds_view::const_value_type, typename colinds_view::const_value_type,
309  execution_space, memory_space, memory_space> KernelHandle;
310 
311  KernelHandle kh;
312  //leave gc algorithm choice as the default
313  kh.create_distance2_graph_coloring_handle();
314 
315  // get the distance-2 graph coloring handle
316  auto coloringHandle = kh.get_distance2_graph_coloring_handle();
317 
318  // Set the distance-2 graph coloring algorithm to use.
319  // Options:
320  // COLORING_D2_DEFAULT - Let the kernel handle pick the variation
321  // COLORING_D2_SERIAL - Use the legacy serial-only implementation
322  // COLORING_D2_MATRIX_SQUARED - Use the SPGEMM + D1GC method
323  // COLORING_D2_SPGEMM - Same as MATRIX_SQUARED
324  // COLORING_D2_VB - Use the parallel vertex based direct method
325  // COLORING_D2_VB_BIT - Same as VB but using the bitvector forbidden array
326  // COLORING_D2_VB_BIT_EF - Add experimental edge-filtering to VB_BIT
327  coloringHandle->set_algorithm( KokkosGraph::COLORING_D2_SERIAL );
328 
329  //Create device views for graph rowptrs/colinds
330  rowptrs_view aRowptrs("A device rowptrs", rowptrs.size());
331  colinds_view aColinds("A device colinds", colinds.size());
332  // Populate A in temporary host views, then copy to device
333  {
334  host_rowptrs_view aHostRowptrs = Kokkos::create_mirror_view(aRowptrs);
335  for(size_t i = 0; i < rowptrs.size(); i++)
336  {
337  aHostRowptrs(i) = rowptrs[i];
338  }
339  Kokkos::deep_copy(aRowptrs, aHostRowptrs);
340  host_colinds_view aHostColinds = Kokkos::create_mirror_view(aColinds);
341  for(size_t i = 0; i < colinds.size(); i++)
342  {
343  aHostColinds(i) = colinds[i];
344  }
345  Kokkos::deep_copy(aColinds, aHostColinds);
346  }
347  //run d2 graph coloring
348  //graph is symmetric so row map/entries and col map/entries are the same
349  KokkosGraph::Experimental::graph_compute_distance2_color(&kh, numRows, numRows, aRowptrs, aColinds, aRowptrs, aColinds);
350 
351  // extract the colors
352  auto colorsDevice = coloringHandle->get_vertex_colors();
353 
354  auto colors = Kokkos::create_mirror_view(colorsDevice);
355  Kokkos::deep_copy(colors, colorsDevice);
356 
357  //clean up coloring handle
358  kh.destroy_distance2_graph_coloring_handle();
359 
360  //have color 1 (first color) be the aggregate roots (add those to mapping first)
361  LocalOrdinal aggCount = 0;
362  for(LocalOrdinal i = 0; i < numRows; i++)
363  {
364  if(colors(i) == 1 && aggStat[i] == READY)
365  {
366  vertex2AggId[i] = aggCount++;
367  aggStat[i] = AGGREGATED;
368  numLocalAggregates++;
369  procWinner[i] = myRank;
370  }
371  }
372  numNonAggregatedNodes = 0;
373  std::vector<LocalOrdinal> aggSizes(numLocalAggregates, 0);
374  for(int i = 0; i < numRows; i++)
375  {
376  if(vertex2AggId[i] >= 0)
377  aggSizes[vertex2AggId[i]]++;
378  }
379  //now assign every READY vertex to a directly connected root
380  for(LocalOrdinal i = 0; i < numRows; i++)
381  {
382  if(colors(i) != 1 && (aggStat[i] == READY || aggStat[i] == NOTSEL))
383  {
384  //get neighbors of vertex i and
385  //look for local, aggregated, color 1 neighbor (valid root)
386  auto neighbors = graph.getNeighborVertices(i);
387  for(LocalOrdinal j = 0; j < neighbors.length; j++)
388  {
389  auto nei = neighbors.colidx(j);
390  LocalOrdinal agg = vertex2AggId[nei];
391  if(graph.isLocalNeighborVertex(nei) && colors(nei) == 1 && aggStat[nei] == AGGREGATED && aggSizes[agg] < maxAggSize)
392  {
393  //assign vertex i to aggregate with root j
394  vertex2AggId[i] = agg;
395  aggSizes[agg]++;
396  aggStat[i] = AGGREGATED;
397  procWinner[i] = myRank;
398  break;
399  }
400  }
401  }
402  if(aggStat[i] != AGGREGATED)
403  {
404  numNonAggregatedNodes++;
405  if(aggStat[i] == NOTSEL)
406  aggStat[i] = READY;
407  }
408  }
409  // update aggregate object
410  aggregates.SetNumAggregates(numLocalAggregates);
411  }
412 
413  template <class LocalOrdinal, class GlobalOrdinal, class Node>
414  void AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomReorder(ArrayRCP<LO> list) const {
415  //TODO: replace int
416  int n = list.size();
417  for(int i = 0; i < n-1; i++)
418  std::swap(list[i], list[RandomOrdinal(i,n-1)]);
419  }
420 
421  template <class LocalOrdinal, class GlobalOrdinal, class Node>
422  int AggregationPhase1Algorithm_kokkos<LocalOrdinal, GlobalOrdinal, Node>::RandomOrdinal(int min, int max) const {
423  return min + as<int>((max-min+1) * (static_cast<double>(std::rand()) / (RAND_MAX + 1.0)));
424  }
425 
426 } // end namespace
427 
428 #endif // HAVE_MUELU_KOKKOS_REFACTOR
429 #endif // MUELU_AGGREGATIONPHASE1ALGORITHM_KOKKOS_DEF_HPP
430 
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
LocalOrdinal LO
void deep_copy(const View< DT, DP...> &dst, typename ViewTraits< DT, DP...>::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP...>::specialize, void >::value >::type *=0)