Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_crsUtils.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_CRSUTILS_HPP
11 #define TPETRA_DETAILS_CRSUTILS_HPP
12 #include <numeric>
13 #include <type_traits>
14 
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
18 #include "Tpetra_Details_CrsPadding.hpp"
19 #include "Tpetra_Details_WrappedDualView.hpp"
20 #include <iostream>
21 #include <memory>
22 #include <unordered_map>
23 
29 
30 namespace Tpetra {
31 namespace Details {
32 
33 namespace impl {
34 
35 template <class ViewType>
36 ViewType
37 make_uninitialized_view(
38  const std::string& name,
39  const size_t size,
40  const bool verbose,
41  const std::string* const prefix) {
42  if (verbose) {
43  std::ostringstream os;
44  os << *prefix << "Allocate Kokkos::View " << name
45  << ": " << size << std::endl;
46  std::cerr << os.str();
47  }
48  using Kokkos::view_alloc;
49  using Kokkos::WithoutInitializing;
50  return ViewType(view_alloc(name, WithoutInitializing), size);
51 }
52 
53 template <class ViewType>
54 ViewType
55 make_initialized_view(
56  const std::string& name,
57  const size_t size,
58  const bool verbose,
59  const std::string* const prefix) {
60  if (verbose) {
61  std::ostringstream os;
62  os << *prefix << "Allocate & initialize Kokkos::View "
63  << name << ": " << size << std::endl;
64  std::cerr << os.str();
65  }
66  return ViewType(name, size);
67 }
68 
69 template <class OutViewType, class InViewType>
70 void assign_to_view(OutViewType& out,
71  const InViewType& in,
72  const char viewName[],
73  const bool verbose,
74  const std::string* const prefix) {
75  if (verbose) {
76  std::ostringstream os;
77  os << *prefix << "Assign to Kokkos::View " << viewName
78  << ": Old size: " << out.extent(0)
79  << ", New size: " << in.extent(0) << std::endl;
80  std::cerr << os.str();
81  }
82  out = in;
83 }
84 
85 template <class MemorySpace, class ViewType>
86 auto create_mirror_view(
87  const MemorySpace& memSpace,
88  const ViewType& view,
89  const bool verbose,
90  const std::string* const prefix) -> decltype(Kokkos::create_mirror_view(memSpace, view)) {
91  if (verbose) {
92  std::ostringstream os;
93  os << *prefix << "Create mirror view: "
94  << "view.extent(0): " << view.extent(0) << std::endl;
95  std::cerr << os.str();
96  }
97  return Kokkos::create_mirror_view(memSpace, view);
98 }
99 
100 enum class PadCrsAction {
101  INDICES_ONLY,
102  INDICES_AND_VALUES
103 };
104 
113 template <class RowPtr, class Indices, class Values, class Padding>
114 void pad_crs_arrays(
115  const PadCrsAction action,
116  const RowPtr& row_ptr_beg,
117  const RowPtr& row_ptr_end,
118  Indices& indices_wdv,
119  Values& values_wdv,
120  const Padding& padding,
121  const int my_rank,
122  const bool verbose) {
123  using execution_space = typename Indices::t_dev::execution_space;
124  using Kokkos::view_alloc;
125  using Kokkos::WithoutInitializing;
126  using std::endl;
127  std::unique_ptr<std::string> prefix;
128 
129  const size_t maxNumToPrint = verbose ? Behavior::verbosePrintCountThreshold() : size_t(0);
130  if (verbose) {
131  std::ostringstream os;
132  os << "Proc " << my_rank << ": Tpetra::...::pad_crs_arrays: ";
133  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
134  os << "Start" << endl;
135  std::cerr << os.str();
136  }
137  Kokkos::HostSpace hostSpace;
138 
139  if (verbose) {
140  std::ostringstream os;
141  os << *prefix << "On input: ";
142  auto row_ptr_beg_h =
143  Kokkos::create_mirror_view(hostSpace, row_ptr_beg);
144  // DEEP_COPY REVIEW - NOT TESTED
145  Kokkos::deep_copy(row_ptr_beg_h, row_ptr_beg);
146  verbosePrintArray(os, row_ptr_beg_h, "row_ptr_beg before scan",
147  maxNumToPrint);
148  os << ", ";
149  auto row_ptr_end_h =
150  Kokkos::create_mirror_view(hostSpace, row_ptr_end);
151  // DEEP_COPY REVIEW - NOT TESTED
152  Kokkos::deep_copy(row_ptr_end_h, row_ptr_end);
153  verbosePrintArray(os, row_ptr_end_h, "row_ptr_end before scan",
154  maxNumToPrint);
155  os << ", indices.extent(0): " << indices_wdv.extent(0)
156  << ", values.extent(0): " << values_wdv.extent(0)
157  << ", padding: ";
158  padding.print(os);
159  os << endl;
160  std::cerr << os.str();
161  }
162 
163  if (row_ptr_beg.size() == 0) {
164  if (verbose) {
165  std::ostringstream os;
166  os << *prefix << "Done; local matrix has no rows" << endl;
167  std::cerr << os.str();
168  }
169  return; // nothing to do
170  }
171 
172  const size_t lclNumRows(row_ptr_beg.size() - 1);
173  RowPtr newAllocPerRow =
174  make_uninitialized_view<RowPtr>("newAllocPerRow", lclNumRows,
175  verbose, prefix.get());
176  if (verbose) {
177  std::ostringstream os;
178  os << *prefix << "Fill newAllocPerRow & compute increase" << endl;
179  std::cerr << os.str();
180  }
181  size_t increase = 0;
182  {
183  // Must do on host because padding uses std::map
184  execution_space exec_space_instance = execution_space();
185  auto row_ptr_end_h = create_mirror_view(
186  hostSpace, row_ptr_end, verbose, prefix.get());
187  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
188  Kokkos::deep_copy(exec_space_instance, row_ptr_end_h, row_ptr_end);
189  auto row_ptr_beg_h = create_mirror_view(
190  hostSpace, row_ptr_beg, verbose, prefix.get());
191  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
192  Kokkos::deep_copy(exec_space_instance, row_ptr_beg_h, row_ptr_beg);
193 
194  // lbv 03/15/23: The execution space deep_copy does an asynchronous
195  // copy so we really want to fence that space before touching the
196  // host data as it is not guarenteed to have arrived by the time we
197  // start the parallel_reduce below which might use a different
198  // execution space, see:
199  // https://kokkos.github.io/kokkos-core-wiki/API/core/view/deep_copy.html#semantics
200  exec_space_instance.fence();
201 
202  auto newAllocPerRow_h = create_mirror_view(
203  hostSpace, newAllocPerRow, verbose, prefix.get());
204  using host_range_type = Kokkos::RangePolicy<
205  Kokkos::DefaultHostExecutionSpace, size_t>;
206  Kokkos::parallel_reduce(
207  "Tpetra::CrsGraph: Compute new allocation size per row",
208  host_range_type(0, lclNumRows),
209  [&](const size_t lclRowInd, size_t& lclIncrease) {
210  const size_t start = row_ptr_beg_h[lclRowInd];
211  const size_t end = row_ptr_beg_h[lclRowInd + 1];
212  TEUCHOS_ASSERT(end >= start);
213  const size_t oldAllocSize = end - start;
214  const size_t oldNumEnt = row_ptr_end_h[lclRowInd] - start;
215  TEUCHOS_ASSERT(oldNumEnt <= oldAllocSize);
216 
217  // This is not a pack routine. Do not shrink! Shrinking now
218  // to fit the number of entries would ignore users' hint for
219  // the max number of entries in each row. Also, CrsPadding
220  // only counts entries and ignores any available free space.
221 
222  auto result = padding.get_result(lclRowInd);
223  const size_t newNumEnt = oldNumEnt + result.numInSrcNotInTgt;
224  if (newNumEnt > oldAllocSize) {
225  lclIncrease += (newNumEnt - oldAllocSize);
226  newAllocPerRow_h[lclRowInd] = newNumEnt;
227  } else {
228  newAllocPerRow_h[lclRowInd] = oldAllocSize;
229  }
230  },
231  increase);
232 
233  if (verbose) {
234  std::ostringstream os;
235  os << *prefix << "increase: " << increase << ", ";
236  verbosePrintArray(os, newAllocPerRow_h, "newAllocPerRow",
237  maxNumToPrint);
238  os << endl;
239  std::cerr << os.str();
240  }
241 
242  if (increase == 0) {
243  return;
244  }
245  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
246  Kokkos::deep_copy(execution_space(), newAllocPerRow, newAllocPerRow_h);
247  }
248 
249  using inds_value_type =
250  typename Indices::t_dev::non_const_value_type;
251  using vals_value_type = typename Values::t_dev::non_const_value_type;
252 
253  {
254  auto indices_old = indices_wdv.getDeviceView(Access::ReadOnly);
255  const size_t newIndsSize = size_t(indices_old.size()) + increase;
256  auto indices_new = make_uninitialized_view<typename Indices::t_dev>(
257  "Tpetra::CrsGraph column indices", newIndsSize, verbose,
258  prefix.get());
259 
260  typename Values::t_dev values_new;
261  auto values_old = values_wdv.getDeviceView(Access::ReadOnly);
262  if (action == PadCrsAction::INDICES_AND_VALUES) {
263  const size_t newValsSize = newIndsSize;
264  // NOTE (mfh 10 Feb 2020) If we don't initialize values_new here,
265  // then the CrsMatrix tests fail.
266  values_new = make_initialized_view<typename Values::t_dev>(
267  "Tpetra::CrsMatrix values", newValsSize, verbose, prefix.get());
268  }
269 
270  if (verbose) {
271  std::ostringstream os;
272  os << *prefix << "Repack" << endl;
273  std::cerr << os.str();
274  }
275 
276  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
277  Kokkos::parallel_scan(
278  "Tpetra::CrsGraph or CrsMatrix repack",
279  range_type(size_t(0), size_t(lclNumRows + 1)),
280  KOKKOS_LAMBDA(const size_t lclRow, size_t& newRowBeg,
281  const bool finalPass) {
282  // row_ptr_beg has lclNumRows + 1 entries.
283  // row_ptr_end has lclNumRows entries.
284  // newAllocPerRow has lclNumRows entries.
285  const size_t row_beg = row_ptr_beg[lclRow];
286  const size_t row_end =
287  lclRow < lclNumRows ? row_ptr_end[lclRow] : row_beg;
288  const size_t numEnt = row_end - row_beg;
289  const size_t newRowAllocSize =
290  lclRow < lclNumRows ? newAllocPerRow[lclRow] : size_t(0);
291  if (finalPass) {
292  if (lclRow < lclNumRows) {
293  const Kokkos::pair<size_t, size_t> oldRange(
294  row_beg, row_beg + numEnt);
295  const Kokkos::pair<size_t, size_t> newRange(
296  newRowBeg, newRowBeg + numEnt);
297  auto oldColInds = Kokkos::subview(indices_old, oldRange);
298  auto newColInds = Kokkos::subview(indices_new, newRange);
299  // memcpy works fine on device; the next step is to
300  // introduce two-level parallelism and use team copy.
301  memcpy(newColInds.data(), oldColInds.data(),
302  numEnt * sizeof(inds_value_type));
303  if (action == PadCrsAction::INDICES_AND_VALUES) {
304  auto oldVals =
305  Kokkos::subview(values_old, oldRange);
306  auto newVals = Kokkos::subview(values_new, newRange);
307  memcpy((void*)newVals.data(), oldVals.data(),
308  numEnt * sizeof(vals_value_type));
309  }
310  }
311  // It's the final pass, so we can modify these arrays.
312  row_ptr_beg[lclRow] = newRowBeg;
313  if (lclRow < lclNumRows) {
314  row_ptr_end[lclRow] = newRowBeg + numEnt;
315  }
316  }
317  newRowBeg += newRowAllocSize;
318  });
319 
320  if (verbose) {
321  std::ostringstream os;
322 
323  os << *prefix;
324  auto row_ptr_beg_h =
325  Kokkos::create_mirror_view(hostSpace, row_ptr_beg);
326  // DEEP_COPY REVIEW - NOT TESTED
327  Kokkos::deep_copy(row_ptr_beg_h, row_ptr_beg);
328  verbosePrintArray(os, row_ptr_beg_h, "row_ptr_beg after scan",
329  maxNumToPrint);
330  os << endl;
331 
332  os << *prefix;
333  auto row_ptr_end_h =
334  Kokkos::create_mirror_view(hostSpace, row_ptr_end);
335  // DEEP_COPY REVIEW - NOT TESTED
336  Kokkos::deep_copy(row_ptr_end_h, row_ptr_end);
337  verbosePrintArray(os, row_ptr_end_h, "row_ptr_end after scan",
338  maxNumToPrint);
339  os << endl;
340 
341  std::cout << os.str();
342  }
343 
344  indices_wdv = Indices(indices_new);
345  values_wdv = Values(values_new);
346  }
347 
348  if (verbose) {
349  auto indices_h = indices_wdv.getHostView(Access::ReadOnly);
350  auto values_h = values_wdv.getHostView(Access::ReadOnly);
351  std::ostringstream os;
352  os << "On output: ";
353  verbosePrintArray(os, indices_h, "indices", maxNumToPrint);
354  os << ", ";
355  verbosePrintArray(os, values_h, "values", maxNumToPrint);
356  os << ", padding: ";
357  padding.print(os);
358  os << endl;
359  }
360 
361  if (verbose) {
362  std::ostringstream os;
363  os << *prefix << "Done" << endl;
364  std::cerr << os.str();
365  }
366 }
367 
369 template <class Pointers, class InOutIndices, class InIndices, class IndexMap>
370 size_t
371 insert_crs_indices(
372  typename Pointers::value_type const row,
373  Pointers const& row_ptrs,
374  InOutIndices& cur_indices,
375  size_t& num_assigned,
376  InIndices const& new_indices,
377  IndexMap&& map,
378  std::function<void(size_t const, size_t const, size_t const)> cb) {
379  if (new_indices.size() == 0) {
380  return 0;
381  }
382 
383  if (cur_indices.size() == 0) {
384  // No room to insert new indices
385  return Teuchos::OrdinalTraits<size_t>::invalid();
386  }
387 
388  using offset_type = typename std::decay<decltype(row_ptrs[0])>::type;
389  using ordinal_type = typename std::decay<decltype(cur_indices[0])>::type;
390 
391  const offset_type start = row_ptrs[row];
392  offset_type end = start + static_cast<offset_type>(num_assigned);
393  const size_t num_avail = (row_ptrs[row + 1] < end) ? size_t(0) : row_ptrs[row + 1] - end;
394  const size_t num_new_indices = static_cast<size_t>(new_indices.size());
395  size_t num_inserted = 0;
396 
397  size_t numIndicesLookup = num_assigned + num_new_indices;
398 
399  // Threshold determined from test/Utils/insertCrsIndicesThreshold.cpp
400  const size_t useLookUpTableThreshold = 400;
401 
402  if (numIndicesLookup <= useLookUpTableThreshold || num_new_indices == 1) {
403  // For rows with few nonzeros, can use a serial search to find duplicates
404  // Or if inserting only one index, serial search is as fast as anything else
405  for (size_t k = 0; k < num_new_indices; ++k) {
406  const ordinal_type idx = std::forward<IndexMap>(map)(new_indices[k]);
407  offset_type row_offset = start;
408  for (; row_offset < end; ++row_offset) {
409  if (idx == cur_indices[row_offset]) {
410  break;
411  }
412  }
413 
414  if (row_offset == end) {
415  if (num_inserted >= num_avail) { // not enough room
416  return Teuchos::OrdinalTraits<size_t>::invalid();
417  }
418  // This index is not yet in indices
419  cur_indices[end++] = idx;
420  num_inserted++;
421  }
422  if (cb) {
423  cb(k, start, row_offset - start);
424  }
425  }
426  } else {
427  // For rows with many nonzeros, use a lookup table to find duplicates
428  std::unordered_map<ordinal_type, offset_type> idxLookup(numIndicesLookup);
429 
430  // Put existing indices into the lookup table
431  for (size_t k = 0; k < num_assigned; k++) {
432  idxLookup[cur_indices[start + k]] = start + k;
433  }
434 
435  // Check for new indices in table; insert if not there yet
436  for (size_t k = 0; k < num_new_indices; k++) {
437  const ordinal_type idx = std::forward<IndexMap>(map)(new_indices[k]);
438  offset_type row_offset;
439 
440  auto it = idxLookup.find(idx);
441  if (it == idxLookup.end()) {
442  if (num_inserted >= num_avail) { // not enough room
443  return Teuchos::OrdinalTraits<size_t>::invalid();
444  }
445  // index not found; insert it
446  row_offset = end;
447  cur_indices[end++] = idx;
448  idxLookup[idx] = row_offset;
449  num_inserted++;
450  } else {
451  // index found; note its position
452  row_offset = it->second;
453  }
454  if (cb) {
455  cb(k, start, row_offset - start);
456  }
457  }
458  }
459  num_assigned += num_inserted;
460  return num_inserted;
461 }
462 
464 template <class Pointers, class Indices1, class Indices2, class IndexMap, class Callback>
465 size_t
466 find_crs_indices(
467  typename Pointers::value_type const row,
468  Pointers const& row_ptrs,
469  const size_t curNumEntries,
470  Indices1 const& cur_indices,
471  Indices2 const& new_indices,
472  IndexMap&& map,
473  Callback&& cb) {
474  if (new_indices.size() == 0)
475  return 0;
476 
477  using ordinal =
478  typename std::remove_const<typename Indices1::value_type>::type;
479  auto invalid_ordinal = Teuchos::OrdinalTraits<ordinal>::invalid();
480 
481  const size_t start = static_cast<size_t>(row_ptrs[row]);
482  const size_t end = start + curNumEntries;
483  size_t num_found = 0;
484  for (size_t k = 0; k < new_indices.size(); k++) {
485  auto row_offset = start;
486  auto idx = std::forward<IndexMap>(map)(new_indices[k]);
487  if (idx == invalid_ordinal)
488  continue;
489  for (; row_offset < end; row_offset++) {
490  if (idx == cur_indices[row_offset]) {
491  std::forward<Callback>(cb)(k, start, row_offset - start);
492  num_found++;
493  }
494  }
495  }
496  return num_found;
497 }
498 
499 } // namespace impl
500 
518 template <class RowPtr, class Indices, class Padding>
520  const RowPtr& rowPtrBeg,
521  const RowPtr& rowPtrEnd,
522  Indices& indices_wdv,
523  const Padding& padding,
524  const int my_rank,
525  const bool verbose) {
526  using impl::pad_crs_arrays;
527  // send empty values array
528  Indices values_null;
529  pad_crs_arrays<RowPtr, Indices, Indices, Padding>(
530  impl::PadCrsAction::INDICES_ONLY, rowPtrBeg, rowPtrEnd,
531  indices_wdv, values_null, padding, my_rank, verbose);
532 }
533 
534 template <class RowPtr, class Indices, class Values, class Padding>
535 void padCrsArrays(
536  const RowPtr& rowPtrBeg,
537  const RowPtr& rowPtrEnd,
538  Indices& indices_wdv,
539  Values& values_wdv,
540  const Padding& padding,
541  const int my_rank,
542  const bool verbose) {
543  using impl::pad_crs_arrays;
544  pad_crs_arrays<RowPtr, Indices, Values, Padding>(
545  impl::PadCrsAction::INDICES_AND_VALUES, rowPtrBeg, rowPtrEnd,
546  indices_wdv, values_wdv, padding, my_rank, verbose);
547 }
548 
594 template <class Pointers, class InOutIndices, class InIndices>
595 size_t
597  typename Pointers::value_type const row,
598  Pointers const& rowPtrs,
599  InOutIndices& curIndices,
600  size_t& numAssigned,
601  InIndices const& newIndices,
602  std::function<void(const size_t, const size_t, const size_t)> cb =
603  std::function<void(const size_t, const size_t, const size_t)>()) {
604  static_assert(std::is_same<typename std::remove_const<typename InOutIndices::value_type>::type,
605  typename std::remove_const<typename InIndices::value_type>::type>::value,
606  "Expected views to have same value type");
607 
608  // Provide a unit map for the more general insert_indices
609  using ordinal = typename InOutIndices::value_type;
610  auto numInserted = impl::insert_crs_indices(
611  row, rowPtrs, curIndices,
612  numAssigned, newIndices, [](ordinal const idx) { return idx; }, cb);
613  return numInserted;
614 }
615 
616 template <class Pointers, class InOutIndices, class InIndices>
617 size_t
619  typename Pointers::value_type const row,
620  Pointers const& rowPtrs,
621  InOutIndices& curIndices,
622  size_t& numAssigned,
623  InIndices const& newIndices,
624  std::function<typename InOutIndices::value_type(const typename InIndices::value_type)> map,
625  std::function<void(const size_t, const size_t, const size_t)> cb =
626  std::function<void(const size_t, const size_t, const size_t)>()) {
627  auto numInserted = impl::insert_crs_indices(row, rowPtrs, curIndices,
628  numAssigned, newIndices, map, cb);
629  return numInserted;
630 }
631 
661 template <class Pointers, class Indices1, class Indices2, class Callback>
662 size_t
664  typename Pointers::value_type const row,
665  Pointers const& rowPtrs,
666  const size_t curNumEntries,
667  Indices1 const& curIndices,
668  Indices2 const& newIndices,
669  Callback&& cb) {
670  static_assert(std::is_same<typename std::remove_const<typename Indices1::value_type>::type,
671  typename std::remove_const<typename Indices2::value_type>::type>::value,
672  "Expected views to have same value type");
673  // Provide a unit map for the more general find_crs_indices
674  using ordinal = typename Indices2::value_type;
675  auto numFound = impl::find_crs_indices(
676  row, rowPtrs, curNumEntries, curIndices, newIndices,
677  [=](ordinal ind) { return ind; }, cb);
678  return numFound;
679 }
680 
681 template <class Pointers, class Indices1, class Indices2, class IndexMap, class Callback>
682 size_t
684  typename Pointers::value_type const row,
685  Pointers const& rowPtrs,
686  const size_t curNumEntries,
687  Indices1 const& curIndices,
688  Indices2 const& newIndices,
689  IndexMap&& map,
690  Callback&& cb) {
691  return impl::find_crs_indices(row, rowPtrs, curNumEntries, curIndices, newIndices, map, cb);
692 }
693 
694 } // namespace Details
695 } // namespace Tpetra
696 
697 #endif // TPETRA_DETAILS_CRSUTILS_HPP
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
size_t insertCrsIndices(typename Pointers::value_type const row, Pointers const &rowPtrs, InOutIndices &curIndices, size_t &numAssigned, InIndices const &newIndices, std::function< void(const size_t, const size_t, const size_t)> cb=std::function< void(const size_t, const size_t, const size_t)>())
Insert new indices in to current list of indices.
size_t findCrsIndices(typename Pointers::value_type const row, Pointers const &rowPtrs, const size_t curNumEntries, Indices1 const &curIndices, Indices2 const &newIndices, Callback &&cb)
Finds offsets in to current list of indices.
void start()
Start the deep_copy counter.
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.