Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_Helpers_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
11 #define TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
12 
14 
15 #include "Tpetra_BlockCrsMatrix.hpp"
16 #include "Tpetra_CrsMatrix.hpp"
17 #include "Tpetra_HashTable.hpp"
18 #include "Tpetra_Import.hpp"
19 #include "Tpetra_Map.hpp"
20 #include "Tpetra_MultiVector.hpp"
21 #include "Teuchos_ParameterList.hpp"
22 #include "Teuchos_ScalarTraits.hpp"
23 #include <ctime>
24 #include <fstream>
25 
26 namespace Tpetra {
27 
28 template <class Scalar, class LO, class GO, class Node>
29 void blockCrsMatrixWriter(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::string const &fileName) {
30  Teuchos::ParameterList pl;
31  std::ofstream out;
32  out.open(fileName.c_str());
33  blockCrsMatrixWriter(A, out, pl);
34 }
35 
36 template <class Scalar, class LO, class GO, class Node>
37 void blockCrsMatrixWriter(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::string const &fileName, Teuchos::ParameterList const &params) {
38  std::ofstream out;
39  out.open(fileName.c_str());
40  blockCrsMatrixWriter(A, out, params);
41 }
42 
43 template <class Scalar, class LO, class GO, class Node>
45  Teuchos::ParameterList pl;
46  blockCrsMatrixWriter(A, os, pl);
47 }
48 
49 template <class Scalar, class LO, class GO, class Node>
50 void blockCrsMatrixWriter(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
51  using Teuchos::RCP;
52  using Teuchos::rcp;
53 
54  typedef Teuchos::OrdinalTraits<GO> TOT;
55  typedef BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
56  typedef Tpetra::Import<LO, GO, Node> import_type;
57  typedef Tpetra::Map<LO, GO, Node> map_type;
59  typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
60 
61  RCP<const map_type> const rowMap = A.getRowMap(); //"mesh" map
62  RCP<const Teuchos::Comm<int>> comm = rowMap->getComm();
63  const int myRank = comm->getRank();
64  const size_t numProcs = comm->getSize();
65 
66  // If true, force use of the import strip-mining infrastructure. This is useful for debugging on one process.
67  bool alwaysUseParallelAlgorithm = false;
68  if (params.isParameter("always use parallel algorithm"))
69  alwaysUseParallelAlgorithm = params.get<bool>("always use parallel algorithm");
70  bool printMatrixMarketHeader = true;
71  if (params.isParameter("print MatrixMarket header"))
72  printMatrixMarketHeader = params.get<bool>("print MatrixMarket header");
73 
74  if (printMatrixMarketHeader && myRank == 0) {
75  std::time_t now = std::time(NULL);
76 
77  const std::string dataTypeStr =
78  Teuchos::ScalarTraits<Scalar>::isComplex ? "complex" : "real";
79 
80  // Explanation of first line of file:
81  // - "%%MatrixMarket" is the tag for Matrix Market format.
82  // - "matrix" is what we're printing.
83  // - "coordinate" means sparse (triplet format), rather than dense.
84  // - "real" / "complex" is the type (in an output format sense,
85  // not in a C++ sense) of each value of the matrix. (The
86  // other two possibilities are "integer" (not really necessary
87  // here) and "pattern" (no values, just graph).)
88  os << "%%MatrixMarket matrix coordinate " << dataTypeStr << " general" << std::endl;
89  os << "% time stamp: " << ctime(&now);
90  os << "% written from " << numProcs << " processes" << std::endl;
91  os << "% point representation of Tpetra::BlockCrsMatrix" << std::endl;
92  size_t numRows = A.getGlobalNumRows();
93  size_t numCols = A.getGlobalNumCols();
94  os << "% " << numRows << " block rows, " << numCols << " block columns" << std::endl;
95  const LO blockSize = A.getBlockSize();
96  os << "% block size " << blockSize << std::endl;
97  os << numRows * blockSize << " " << numCols * blockSize << " " << A.getGlobalNumEntries() * blockSize * blockSize << std::endl;
98  }
99 
100  if (numProcs == 1 && !alwaysUseParallelAlgorithm) {
101  writeMatrixStrip(A, os, params);
102  } else {
103  size_t numRows = rowMap->getLocalNumElements();
104 
105  // Create source map
106  RCP<const map_type> allMeshGidsMap = rcp(new map_type(TOT::invalid(), numRows, A.getIndexBase(), comm));
107  // Create and populate vector of mesh GIDs corresponding to this pid's rows.
108  // This vector will be imported one pid's worth of information at a time to pid 0.
109  mv_type allMeshGids(allMeshGidsMap, 1);
110  Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
111 
112  for (size_t i = 0; i < numRows; i++)
113  allMeshGidsData[i] = rowMap->getGlobalElement(i);
114  allMeshGidsData = Teuchos::null;
115 
116  // Now construct a RowMatrix on PE 0 by strip-mining the rows of the input matrix A.
117  size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
118  size_t remainder = allMeshGids.getGlobalLength() % numProcs;
119  size_t curStart = 0;
120  size_t curStripSize = 0;
121  Teuchos::Array<GO> importMeshGidList;
122  for (size_t i = 0; i < numProcs; i++) {
123  if (myRank == 0) { // Only PE 0 does this part
124  curStripSize = stripSize;
125  if (i < remainder) curStripSize++; // handle leftovers
126  importMeshGidList.resize(curStripSize); // Set size of vector to max needed
127  for (size_t j = 0; j < curStripSize; j++) importMeshGidList[j] = j + curStart + A.getIndexBase();
128  curStart += curStripSize;
129  }
130  // The following import map should be non-trivial only on PE 0.
131  TEUCHOS_TEST_FOR_EXCEPTION(myRank > 0 && curStripSize != 0,
132  std::runtime_error, "Tpetra::blockCrsMatrixWriter: (pid " << myRank << ") map size should be zero, but is " << curStripSize);
133  RCP<map_type> importMeshGidMap = rcp(new map_type(TOT::invalid(), importMeshGidList(), A.getIndexBase(), comm));
134  import_type gidImporter(allMeshGidsMap, importMeshGidMap);
135  mv_type importMeshGids(importMeshGidMap, 1);
136  importMeshGids.doImport(allMeshGids, gidImporter, INSERT);
137 
138  // importMeshGids now has a list of GIDs for the current strip of matrix rows.
139  // Use these values to build another importer that will get rows of the matrix.
140 
141  // The following import map will be non-trivial only on PE 0.
142  Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
143  Teuchos::Array<GO> importMeshGidsGO;
144  importMeshGidsGO.reserve(importMeshGidsData.size());
145  for (typename Teuchos::ArrayRCP<const GO>::size_type j = 0; j < importMeshGidsData.size(); ++j)
146  importMeshGidsGO.push_back(importMeshGidsData[j]);
147  RCP<const map_type> importMap = rcp(new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm));
148 
149  import_type importer(rowMap, importMap);
150  size_t numEntriesPerRow = A.getCrsGraph().getGlobalMaxNumRowEntries();
151  RCP<crs_graph_type> graph = createCrsGraph(importMap, numEntriesPerRow);
152  RCP<const map_type> domainMap = A.getCrsGraph().getDomainMap();
153  graph->doImport(A.getCrsGraph(), importer, INSERT);
154  graph->fillComplete(domainMap, importMap);
155 
156  block_crs_matrix_type importA(*graph, A.getBlockSize());
157  importA.doImport(A, importer, INSERT);
158 
159  // Finally we are ready to write this strip of the matrix
160  writeMatrixStrip(importA, os, params);
161  }
162  }
163 }
164 
165 template <class Scalar, class LO, class GO, class Node>
166 void writeMatrixStrip(BlockCrsMatrix<Scalar, LO, GO, Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
167  using Teuchos::RCP;
168  using map_type = Tpetra::Map<LO, GO, Node>;
169  using bcrs_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
170  using bcrs_local_inds_host_view_type = typename bcrs_type::local_inds_host_view_type;
171  using bcrs_values_host_view_type = typename bcrs_type::values_host_view_type;
172  using impl_scalar_type = typename bcrs_type::impl_scalar_type;
173 
174  size_t numRows = A.getGlobalNumRows();
175  RCP<const map_type> rowMap = A.getRowMap();
176  RCP<const map_type> colMap = A.getColMap();
177  RCP<const Teuchos::Comm<int>> comm = rowMap->getComm();
178  const int myRank = comm->getRank();
179 
180  const size_t meshRowOffset = rowMap->getIndexBase();
181  const size_t meshColOffset = colMap->getIndexBase();
182  TEUCHOS_TEST_FOR_EXCEPTION(meshRowOffset != meshColOffset,
183  std::runtime_error,
184  "Tpetra::writeMatrixStrip: "
185  "mesh row index base != mesh column index base");
186 
187  if (myRank != 0) {
188  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumRows() != 0,
189  std::runtime_error, "Tpetra::writeMatrixStrip: pid " << myRank << " should have 0 rows but has " << A.getLocalNumRows());
190  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumCols() != 0,
191  std::runtime_error, "Tpetra::writeMatrixStrip: pid " << myRank << " should have 0 columns but has " << A.getLocalNumCols());
192 
193  } else {
194  TEUCHOS_TEST_FOR_EXCEPTION(numRows != A.getLocalNumRows(),
195  std::runtime_error,
196  "Tpetra::writeMatrixStrip: "
197  "number of rows on pid 0 does not match global number of rows");
198 
199  int err = 0;
200  const LO blockSize = A.getBlockSize();
201  const size_t numLocalRows = A.getLocalNumRows();
202  bool precisionChanged = false;
203  int oldPrecision = 0; // avoid "unused variable" warning
204  if (params.isParameter("precision")) {
205  oldPrecision = os.precision(params.get<int>("precision"));
206  precisionChanged = true;
207  }
208  int pointOffset = 1;
209  if (params.isParameter("zero-based indexing")) {
210  if (params.get<bool>("zero-based indexing") == true)
211  pointOffset = 0;
212  }
213 
214  size_t localRowInd;
215  for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
216  // Get a view of the current row.
217  bcrs_local_inds_host_view_type localColInds;
218  bcrs_values_host_view_type vals;
219  LO numEntries;
220  A.getLocalRowView(localRowInd, localColInds, vals);
221  numEntries = localColInds.extent(0);
222  GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
223 
224  for (LO k = 0; k < numEntries; ++k) {
225  GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
226  const impl_scalar_type *curBlock = vals.data() + blockSize * blockSize * k;
227  // Blocks are stored in row-major format.
228  for (LO j = 0; j < blockSize; ++j) {
229  GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
230  for (LO i = 0; i < blockSize; ++i) {
231  GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
232  const impl_scalar_type curVal = curBlock[i + j * blockSize];
233 
234  os << globalPointRowID << " " << globalPointColID << " ";
235  if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
236  // Matrix Market format wants complex values to be
237  // written as space-delimited pairs. See Bug 6469.
238  os << Teuchos::ScalarTraits<impl_scalar_type>::real(curVal) << " "
239  << Teuchos::ScalarTraits<impl_scalar_type>::imag(curVal);
240  } else {
241  os << curVal;
242  }
243  os << std::endl;
244  }
245  }
246  }
247  }
248  if (precisionChanged)
249  os.precision(oldPrecision);
250  TEUCHOS_TEST_FOR_EXCEPTION(err != 0,
251  std::runtime_error,
252  "Tpetra::writeMatrixStrip: "
253  "error getting view of local row "
254  << localRowInd);
255  }
256 }
257 
258 template <class Scalar, class LO, class GO, class Node>
259 Teuchos::RCP<Tpetra::CrsGraph<LO, GO, Node>>
260 getBlockCrsGraph(const Tpetra::CrsMatrix<Scalar, LO, GO, Node> &pointMatrix, const LO &blockSize, bool use_LID) {
261  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph", getBlockCrsGraph0);
262  /*
263  ASSUMPTIONS:
264 
265  1) In point matrix, all entries associated with a little block are present (even if they are zero).
266  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
267  3) Point column map and block column map are ordered consistently.
268  */
269 
270  using Teuchos::Array;
271  using Teuchos::ArrayView;
272  using Teuchos::RCP;
273 
274  typedef Tpetra::Map<LO, GO, Node> map_type;
275  typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
276  typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
277 
278  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
279  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
280  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
281 
282  using offset_type = typename row_map_type::non_const_value_type;
283 
284  using execution_space = typename Node::execution_space;
285  using range_type = Kokkos::RangePolicy<execution_space, LO>;
286 
287  const map_type &pointRowMap = *(pointMatrix.getRowMap());
288  RCP<const map_type> meshRowMap, meshColMap, meshDomainMap, meshRangeMap;
289 
290  const map_type &pointColMap = *(pointMatrix.getColMap());
291  const map_type &pointDomainMap = *(pointMatrix.getDomainMap());
292  const map_type &pointRangeMap = *(pointMatrix.getRangeMap());
293 
294  {
295  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::createMeshMaps", getBlockCrsGraph1);
296  meshRowMap = createMeshMap<LO, GO, Node>(blockSize, pointRowMap, use_LID);
297  meshColMap = createMeshMap<LO, GO, Node>(blockSize, pointColMap, use_LID);
298  meshDomainMap = createMeshMap<LO, GO, Node>(blockSize, pointDomainMap, use_LID);
299  meshRangeMap = createMeshMap<LO, GO, Node>(blockSize, pointRangeMap, use_LID);
300  Kokkos::DefaultExecutionSpace().fence();
301  }
302 
303  if (meshColMap.is_null()) throw std::runtime_error("ERROR: Cannot create mesh colmap");
304 
305  auto localMeshColMap = meshColMap->getLocalMap();
306  auto localPointColMap = pointColMap.getLocalMap();
307  auto localPointRowMap = pointRowMap.getLocalMap();
308 
309  RCP<crs_graph_type> meshCrsGraph;
310 
311  const offset_type bs2 = blockSize * blockSize;
312 
313  if (use_LID) {
314  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::LID", getBlockCrsGraph2);
315  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
316  auto pointRowptr = pointLocalGraph.row_map;
317  auto pointColind = pointLocalGraph.entries;
318 
319  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
320  std::runtime_error,
321  "Tpetra::getBlockCrsGraph: "
322  "local number of non zero entries is not a multiple of blockSize^2 ");
323 
324  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
325  row_map_type blockRowptr("blockRowptr", block_rows + 1);
326  entries_type blockColind("blockColind", pointColind.extent(0) / (bs2));
327 
328  Kokkos::parallel_for(
329  "fillMesh", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
330  const LO offset_b = pointRowptr(i * blockSize) / bs2;
331  const LO offset_b_max = pointRowptr((i + 1) * blockSize) / bs2;
332 
333  if (i == block_rows - 1)
334  blockRowptr(i + 1) = offset_b_max;
335  blockRowptr(i) = offset_b;
336 
337  const LO offset_p = pointRowptr(i * blockSize);
338 
339  for (LO k = 0; k < offset_b_max - offset_b; ++k) {
340  blockColind(offset_b + k) = pointColind(offset_p + k * blockSize) / blockSize;
341  }
342  });
343 
344  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
345  meshCrsGraph->fillComplete(meshDomainMap, meshRangeMap);
346  Kokkos::DefaultExecutionSpace().fence();
347  } else {
348  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::GID", getBlockCrsGraph3);
349  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
350  auto pointRowptr = pointLocalGraph.row_map;
351  auto pointColind = pointLocalGraph.entries;
352 
353  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
354  std::runtime_error,
355  "Tpetra::getBlockCrsGraph: "
356  "local number of non zero entries is not a multiple of blockSize^2 ");
357 
358  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
359  row_map_type blockRowptr("blockRowptr", block_rows + 1);
360  entries_type blockColind("blockColind", pointColind.extent(0) / (bs2));
361 
362  Kokkos::parallel_for(
363  "fillMesh", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
364  const LO offset_b = pointRowptr(i * blockSize) / bs2;
365  const LO offset_b_max = pointRowptr((i + 1) * blockSize) / bs2;
366 
367  if (i == block_rows - 1)
368  blockRowptr(i + 1) = offset_b_max;
369  blockRowptr(i) = offset_b;
370 
371  const LO offset_p = pointRowptr(i * blockSize);
372  const LO offset_p_max = pointRowptr((i + 1) * blockSize);
373 
374  LO filled_block = 0;
375  for (LO p_i = 0; p_i < offset_p_max - offset_p; ++p_i) {
376  auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i)) / blockSize;
377  auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
378 
379  bool visited = false;
380  for (LO k = 0; k < filled_block; ++k) {
381  if (blockColind(offset_b + k) == bcol_LID)
382  visited = true;
383  }
384  if (!visited) {
385  blockColind(offset_b + filled_block) = bcol_LID;
386  ++filled_block;
387  }
388  }
389  });
390 
391  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
392  meshCrsGraph->fillComplete(meshDomainMap, meshRangeMap);
393  Kokkos::DefaultExecutionSpace().fence();
394  }
395 
396  return meshCrsGraph;
397 }
398 
399 template <class Scalar, class LO, class GO, class Node>
400 Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node>>
401 convertToBlockCrsMatrix(const Tpetra::CrsMatrix<Scalar, LO, GO, Node> &pointMatrix, const LO &blockSize, bool use_LID) {
402  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix", convertToBlockCrsMatrix0);
403  /*
404  ASSUMPTIONS:
405 
406  1) In point matrix, all entries associated with a little block are present (even if they are zero).
407  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
408  3) Point column map and block column map are ordered consistently.
409  */
410 
411  using Teuchos::Array;
412  using Teuchos::ArrayView;
413  using Teuchos::RCP;
414 
415  typedef Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
416  typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
417 
418  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
419  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
420  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
421  using values_type = typename local_matrix_device_type::values_type::non_const_type;
422 
423  using offset_type = typename row_map_type::non_const_value_type;
424 
425  using execution_space = typename Node::execution_space;
426  using range_type = Kokkos::RangePolicy<execution_space, LO>;
427 
428  RCP<block_crs_matrix_type> blockMatrix;
429 
430  const offset_type bs2 = blockSize * blockSize;
431 
432  auto meshCrsGraph = getBlockCrsGraph(pointMatrix, blockSize, use_LID);
433 
434  if (use_LID) {
435  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::LID", convertToBlockCrsMatrix1);
436  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
437  auto pointRowptr = pointLocalGraph.row_map;
438  auto pointColind = pointLocalGraph.entries;
439 
440  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
441  values_type blockValues("values", meshCrsGraph->getLocalNumEntries() * bs2);
442  auto pointValues = pointMatrix.getLocalValuesDevice(Access::ReadOnly);
443  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
444 
445  Kokkos::parallel_for(
446  "copyblockValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
447  const offset_type blkBeg = blockRowptr[i];
448  const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
449 
450  // For each block in the row...
451  for (offset_type block = 0; block < numBlocks; block++) {
452  // For each entry in the block...
453  for (LO little_row = 0; little_row < blockSize; little_row++) {
454  offset_type point_row_offset = pointRowptr[i * blockSize + little_row];
455  for (LO little_col = 0; little_col < blockSize; little_col++) {
456  blockValues((blkBeg + block) * bs2 + little_row * blockSize + little_col) =
457  pointValues[point_row_offset + block * blockSize + little_col];
458  }
459  }
460  }
461  });
462  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
463  Kokkos::DefaultExecutionSpace().fence();
464  } else {
465  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::GID", convertToBlockCrsMatrix2);
466  auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
467  auto localPointColMap = pointMatrix.getColMap()->getLocalMap();
468 
469  values_type blockValues("values", meshCrsGraph->getLocalNumEntries() * bs2);
470  {
471  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
472  auto pointRowptr = pointLocalGraph.row_map;
473  auto pointColind = pointLocalGraph.entries;
474 
475  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0) - 1) / blockSize;
476  auto pointValues = pointMatrix.getLocalValuesDevice(Access::ReadOnly);
477  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
478  auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
479 
480  row_map_type pointGColind("pointGColind", pointColind.extent(0));
481 
482  Kokkos::parallel_for(
483  "computePointGColind", range_type(0, pointColind.extent(0)), KOKKOS_LAMBDA(const LO i) {
484  pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
485  });
486 
487  row_map_type blockGColind("blockGColind", blockColind.extent(0));
488 
489  Kokkos::parallel_for(
490  "computeBlockGColind", range_type(0, blockGColind.extent(0)), KOKKOS_LAMBDA(const LO i) {
491  blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
492  });
493 
494  Kokkos::parallel_for(
495  "copyblockValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
496  const offset_type blkBeg = blockRowptr[i];
497  const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
498 
499  for (offset_type point_i = 0; point_i < pointRowptr[i * blockSize + 1] - pointRowptr[i * blockSize]; point_i++) {
500  offset_type block_inv = static_cast<offset_type>(-1);
501  offset_type little_col_inv = static_cast<offset_type>(-1);
502  for (offset_type block_2 = 0; block_2 < numBlocks; block_2++) {
503  for (LO little_col_2 = 0; little_col_2 < blockSize; little_col_2++) {
504  if (blockGColind(blkBeg + block_2) * blockSize + little_col_2 == pointGColind(pointRowptr[i * blockSize] + point_i)) {
505  block_inv = block_2;
506  little_col_inv = little_col_2;
507  break;
508  }
509  }
510  if (block_inv != static_cast<offset_type>(-1))
511  break;
512  }
513 
514  for (LO little_row = 0; little_row < blockSize; little_row++) {
515  blockValues((blkBeg + block_inv) * bs2 + little_row * blockSize + little_col_inv) = pointValues[pointRowptr[i * blockSize + little_row] + point_i];
516  }
517  }
518  });
519  }
520  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
521  Kokkos::DefaultExecutionSpace().fence();
522  }
523 
524  return blockMatrix;
525 }
526 
527 template <class Scalar, class LO, class GO, class Node>
528 Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
529 fillLogicalBlocks(const Tpetra::CrsMatrix<Scalar, LO, GO, Node> &pointMatrix, const LO &blockSize) {
531  using execution_space = typename Node::execution_space;
532  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
533  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
534  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
535  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
536  using team_policy = Kokkos::TeamPolicy<execution_space>;
537  using member_type = typename team_policy::member_type;
538  using scratch_view = Kokkos::View<bool *, typename execution_space::scratch_memory_space>;
539  using Ordinal = typename dev_row_view_t::non_const_value_type;
540 
541  // Get structure / values
542  auto local = pointMatrix.getLocalMatrixDevice();
543  auto row_ptrs = local.graph.row_map;
544  auto col_inds = local.graph.entries;
545  auto values = local.values;
546  const auto nrows = pointMatrix.getLocalNumRows();
547 
548  //
549  // Populate all active blocks, they must be fully populated with entries so fill with zeroes
550  //
551 
552  // Make row workspace views
553  dev_row_view_t new_rowmap("new_rowmap", nrows + 1);
554  const auto blocks_per_row = nrows / blockSize; // assumes square matrix
555  dev_row_view_t active_block_row_map("active_block_row_map", blocks_per_row + 1);
556  const int max_threads = execution_space().concurrency();
557  assert(blockSize > 1);
558  assert(nrows % blockSize == 0);
559  const int mem_level = 1;
560  const int bytes = scratch_view::shmem_size(blocks_per_row);
561 
562  if (max_threads >= blockSize) {
563  // Prefer 1 team per block since this will require a lot less scratch memory
564  team_policy tp(blocks_per_row, blockSize);
565 
566  // Count active blocks
567  Kokkos::parallel_for(
568  "countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
569  Ordinal block_row = team.league_rank();
570 
571  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
572  Kokkos::single(
573  Kokkos::PerTeam(team), [&]() {
574  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
575  row_block_active(row_block_idx) = false;
576  }
577  });
578  team.team_barrier();
579 
580  // All threads in a team scan a blocks-worth of rows to see which
581  // blocks are active
582  Kokkos::parallel_for(
583  Kokkos::TeamThreadRange(team, blockSize), [&](Ordinal block_offset) {
584  Ordinal row = block_row * blockSize + block_offset;
585  Ordinal row_itr = row_ptrs(row);
586  Ordinal row_end = row_ptrs(row + 1);
587 
588  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
589  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
590  const Ordinal first_possible_col_in_next_block = (row_block_idx + 1) * blockSize;
591  Ordinal curr_nnz_col = col_inds(row_itr);
592  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
593  // This block has at least one nnz in this row
594  row_block_active(row_block_idx) = true;
595  ++row_itr;
596  if (row_itr == row_end) break;
597  curr_nnz_col = col_inds(row_itr);
598  }
599  }
600  });
601 
602  team.team_barrier();
603 
604  Kokkos::single(
605  Kokkos::PerTeam(team), [&]() {
606  Ordinal count = 0;
607  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
608  if (row_block_active(row_block_idx)) {
609  ++count;
610  }
611  }
612  active_block_row_map(block_row) = count;
613  });
614  });
615  } else {
616  // We don't have enough parallelism to make a thread team handle a block, so just
617  // have 1 thread per block
618  team_policy tp(blocks_per_row, 1);
619 
620  // Count active blocks
621  Kokkos::parallel_for(
622  "countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
623  Ordinal block_row = team.league_rank();
624 
625  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
626  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
627  row_block_active(row_block_idx) = false;
628  }
629 
630  // One thread scans a blocks-worth of rows to see which blocks are active
631  for (int block_offset = 0; block_offset < blockSize; ++block_offset) {
632  Ordinal row = block_row * blockSize + block_offset;
633  Ordinal row_itr = row_ptrs(row);
634  Ordinal row_end = row_ptrs(row + 1);
635 
636  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
637  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
638  const Ordinal first_possible_col_in_next_block = (row_block_idx + 1) * blockSize;
639  Ordinal curr_nnz_col = col_inds(row_itr);
640  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
641  // This block has at least one nnz in this row
642  row_block_active(row_block_idx) = true;
643  ++row_itr;
644  if (row_itr == row_end) break;
645  curr_nnz_col = col_inds(row_itr);
646  }
647  }
648  }
649 
650  Ordinal count = 0;
651  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
652  if (row_block_active(row_block_idx)) {
653  ++count;
654  }
655  }
656  active_block_row_map(block_row) = count;
657  });
658  }
659 
660  Ordinal nnz_block_count = 0;
661  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
662  execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
663  dev_col_view_t block_col_ids("block_col_ids", nnz_block_count);
664 
665  // Find active blocks
666  if (max_threads >= blockSize) {
667  // Prefer 1 team per block since this will require a lot less scratch memory
668  team_policy tp(blocks_per_row, blockSize);
669 
670  Kokkos::parallel_for(
671  "findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
672  Ordinal block_row = team.league_rank();
673 
674  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
675  Kokkos::single(
676  Kokkos::PerTeam(team), [&]() {
677  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
678  row_block_active(row_block_idx) = false;
679  }
680  });
681  team.team_barrier();
682 
683  // All threads in a team scan a blocks-worth of rows to see which
684  // blocks are active
685  Kokkos::parallel_for(
686  Kokkos::TeamThreadRange(team, blockSize), [&](Ordinal block_offset) {
687  Ordinal row = block_row * blockSize + block_offset;
688  Ordinal row_itr = row_ptrs(row);
689  Ordinal row_end = row_ptrs(row + 1);
690 
691  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
692  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
693  const Ordinal first_possible_col_in_next_block = (row_block_idx + 1) * blockSize;
694  Ordinal curr_nnz_col = col_inds(row_itr);
695  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
696  // This block has at least one nnz in this row
697  row_block_active(row_block_idx) = true;
698  ++row_itr;
699  if (row_itr == row_end) break;
700  curr_nnz_col = col_inds(row_itr);
701  }
702  }
703  });
704 
705  team.team_barrier();
706 
707  Kokkos::single(
708  Kokkos::PerTeam(team), [&]() {
709  Ordinal offset = active_block_row_map[block_row];
710  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
711  if (row_block_active(row_block_idx)) {
712  block_col_ids(offset) = row_block_idx;
713  ++offset;
714  }
715  }
716  });
717  });
718  } else {
719  team_policy tp(blocks_per_row, 1);
720 
721  Kokkos::parallel_for(
722  "findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type &team) {
723  Ordinal block_row = team.league_rank();
724 
725  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
726  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
727  row_block_active(row_block_idx) = false;
728  }
729 
730  // One thread scans a blocks-worth of rows to see which blocks are active
731  for (int block_offset = 0; block_offset < blockSize; ++block_offset) {
732  Ordinal row = block_row * blockSize + block_offset;
733  Ordinal row_itr = row_ptrs(row);
734  Ordinal row_end = row_ptrs(row + 1);
735 
736  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
737  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
738  const Ordinal first_possible_col_in_next_block = (row_block_idx + 1) * blockSize;
739  Ordinal curr_nnz_col = col_inds(row_itr);
740  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
741  // This block has at least one nnz in this row
742  row_block_active(row_block_idx) = true;
743  ++row_itr;
744  if (row_itr == row_end) break;
745  curr_nnz_col = col_inds(row_itr);
746  }
747  }
748  }
749 
750  Ordinal offset = active_block_row_map[block_row];
751  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
752  if (row_block_active(row_block_idx)) {
753  block_col_ids(offset) = row_block_idx;
754  ++offset;
755  }
756  }
757  });
758  }
759 
760  // Sizing
761  Kokkos::parallel_for(
762  "sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
763  const auto block_row = row / blockSize;
764  const Ordinal block_row_begin = active_block_row_map(block_row);
765  const Ordinal block_row_end = active_block_row_map(block_row + 1);
766  const Ordinal row_nnz = (block_row_end - block_row_begin) * blockSize;
767  new_rowmap(row) = row_nnz;
768  });
769 
770  Ordinal new_nnz_count = 0;
771  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
772  execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
773  // Now populate cols and vals
774  dev_col_view_t new_col_ids("new_col_ids", new_nnz_count);
775  dev_val_view_t new_vals("new_vals", new_nnz_count);
776  Kokkos::parallel_for(
777  "entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
778  Ordinal row_itr = row_ptrs(row);
779  Ordinal row_end = row_ptrs(row + 1);
780  Ordinal row_itr_new = new_rowmap(row);
781 
782  Ordinal block_row = row / blockSize;
783  Ordinal block_row_begin = active_block_row_map(block_row);
784  Ordinal block_row_end = active_block_row_map(block_row + 1);
785 
786  for (Ordinal row_block_idx = block_row_begin; row_block_idx < block_row_end; ++row_block_idx) {
787  const Ordinal block_col = block_col_ids(row_block_idx);
788  const Ordinal first_possible_col_in_block = block_col * blockSize;
789  const Ordinal first_possible_col_in_next_block = (block_col + 1) * blockSize;
790  for (Ordinal possible_col = first_possible_col_in_block; possible_col < first_possible_col_in_next_block; ++possible_col, ++row_itr_new) {
791  new_col_ids(row_itr_new) = possible_col;
792  Ordinal curr_nnz_col = col_inds(row_itr);
793  if (possible_col == curr_nnz_col && row_itr < row_end) {
794  // Already a non-zero entry
795  new_vals(row_itr_new) = values(row_itr);
796  ++row_itr;
797  }
798  }
799  }
800  });
801 
802  // Create new, filled CRS
803  auto crs_row_map = pointMatrix.getRowMap();
804  auto crs_col_map = pointMatrix.getColMap();
805  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
806  result->fillComplete();
807  return result;
808 }
809 
810 template <class Scalar, class LO, class GO, class Node>
811 Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
814  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
815  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
816  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
817 #if KOKKOS_VERSION >= 40799
818  using impl_scalar_t = typename KokkosKernels::ArithTraits<Scalar>::val_type;
819 #else
820  using impl_scalar_t = typename Kokkos::ArithTraits<Scalar>::val_type;
821 #endif
822 #if KOKKOS_VERSION >= 40799
823  using STS = KokkosKernels::ArithTraits<impl_scalar_t>;
824 #else
825  using STS = Kokkos::ArithTraits<impl_scalar_t>;
826 #endif
827  using Ordinal = typename dev_row_view_t::non_const_value_type;
828  using execution_space = typename Node::execution_space;
829  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
830 
831  // Get structure / values
832  auto local = pointMatrix.getLocalMatrixHost();
833  auto row_ptrs = local.graph.row_map;
834  auto col_inds = local.graph.entries;
835  auto values = local.values;
836  const auto nrows = pointMatrix.getLocalNumRows();
837 
838  // Sizing and rows
839  dev_row_view_t new_rowmap("new_rowmap", nrows + 1);
840  const impl_scalar_t zero = STS::zero();
841  Kokkos::parallel_for(
842  "sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
843  const Ordinal row_nnz_begin = row_ptrs(row);
844  Ordinal row_nnzs = 0;
845  for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row + 1); ++row_nnz) {
846  const impl_scalar_t value = values(row_nnz);
847  if (value != zero) {
848  ++row_nnzs;
849  }
850  }
851  new_rowmap(row) = row_nnzs;
852  });
853 
854  Ordinal real_nnzs = 0;
855  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
856  execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
857  // Now populate cols and vals
858  dev_col_view_t new_col_ids("new_col_ids", real_nnzs);
859  dev_val_view_t new_vals("new_vals", real_nnzs);
860  Kokkos::parallel_for(
861  "entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
862  Ordinal row_nnzs = 0;
863  const Ordinal old_row_nnz_begin = row_ptrs(row);
864  const Ordinal new_row_nnz_begin = new_rowmap(row);
865  for (Ordinal old_row_nnz = old_row_nnz_begin; old_row_nnz < row_ptrs(row + 1); ++old_row_nnz) {
866  const impl_scalar_t value = values(old_row_nnz);
867  if (value != zero) {
868  new_col_ids(new_row_nnz_begin + row_nnzs) = col_inds(old_row_nnz);
869  new_vals(new_row_nnz_begin + row_nnzs) = value;
870  ++row_nnzs;
871  }
872  }
873  });
874 
875  // Create new, unfilled CRS
876  auto crs_row_map = pointMatrix.getRowMap();
877  auto crs_col_map = pointMatrix.getColMap();
878  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
879  result->fillComplete();
880  return result;
881 }
882 
883 template <class LO, class GO, class Node>
884 Teuchos::RCP<const Tpetra::Map<LO, GO, Node>>
885 createMeshMap(const LO &blockSize, const Tpetra::Map<LO, GO, Node> &pointMap, bool use_LID) {
886  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
887  typedef Tpetra::Map<LO, GO, Node> map_type;
888 
889  if (use_LID) {
890  using execution_space = typename Node::execution_space;
891  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
892 
893  auto pointGlobalID = pointMap.getMyGlobalIndicesDevice();
894  LO block_rows = pointGlobalID.extent(0) / blockSize;
895  Kokkos::View<GO *, typename map_type::device_type> meshGlobalID("meshGlobalID", block_rows);
896 
897  Kokkos::parallel_for(
898  "fillMeshMap", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
899  meshGlobalID(i) = pointGlobalID(i * blockSize) / blockSize;
900  });
901 
902  Teuchos::RCP<const map_type> meshMap = Teuchos::rcp(new map_type(TOT::invalid(), meshGlobalID, 0, pointMap.getComm()));
903  return meshMap;
904  } else {
905  // calculate mesh GIDs
906  Teuchos::ArrayView<const GO> pointGids = pointMap.getLocalElementList();
907  Teuchos::Array<GO> meshGids;
908  GO indexBase = pointMap.getIndexBase();
909 
910  // Use hash table to track whether we've encountered this GID previously. This will happen
911  // when striding through the point DOFs in a block. It should not happen otherwise.
912  // I don't use sort/make unique because I don't want to change the ordering.
913  meshGids.reserve(pointGids.size());
914  Tpetra::Details::HashTable<GO, int> hashTable(pointGids.size());
915  for (int i = 0; i < pointGids.size(); ++i) {
916  GO meshGid = (pointGids[i] - indexBase) / blockSize + indexBase;
917  if (hashTable.get(meshGid) == -1) {
918  hashTable.add(meshGid, 1); //(key,value)
919  meshGids.push_back(meshGid);
920  }
921  }
922 
923  Teuchos::RCP<const map_type> meshMap = Teuchos::rcp(new map_type(TOT::invalid(), meshGids(), 0, pointMap.getComm()));
924  return meshMap;
925  }
926 }
927 
928 template <class LO, class GO, class Node>
929 Teuchos::RCP<const Tpetra::Map<LO, GO, Node>>
930 createPointMap(const LO &blockSize, const Tpetra::Map<LO, GO, Node> &blockMap) {
931  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
932  typedef Tpetra::Map<LO, GO, Node> map_type;
933 
934  // calculate mesh GIDs
935  Teuchos::ArrayView<const GO> blockGids = blockMap.getLocalElementList();
936  Teuchos::Array<GO> pointGids(blockGids.size() * blockSize);
937  GO indexBase = blockMap.getIndexBase();
938 
939  for (LO i = 0, ct = 0; i < (LO)blockGids.size(); i++) {
940  GO base = (blockGids[i] - indexBase) * blockSize + indexBase;
941  for (LO j = 0; j < blockSize; j++, ct++)
942  pointGids[i * blockSize + j] = base + j;
943  }
944 
945  Teuchos::RCP<const map_type> pointMap = Teuchos::rcp(new map_type(TOT::invalid(), pointGids(), 0, blockMap.getComm()));
946  return pointMap;
947 }
948 
949 template <class Scalar, class LO, class GO, class Node>
950 Teuchos::RCP<CrsMatrix<Scalar, LO, GO, Node>>
952  using Teuchos::Array;
953  using Teuchos::ArrayView;
954  using Teuchos::RCP;
955 
956  typedef Tpetra::BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
957  typedef Tpetra::Map<LO, GO, Node> map_type;
958  typedef Tpetra::CrsMatrix<Scalar, LO, GO, Node> crs_matrix_type;
959 
960  using crs_graph_type = typename block_crs_matrix_type::crs_graph_type;
961  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
962  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
963  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
964  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
965  using values_type = typename local_matrix_device_type::values_type::non_const_type;
966 
967  using row_map_type_const = typename local_graph_device_type::row_map_type;
968  using entries_type_const = typename local_graph_device_type::entries_type;
969 
970  using little_block_type = typename block_crs_matrix_type::const_little_block_type;
971  using offset_type = typename row_map_type::non_const_value_type;
972  using column_type = typename entries_type::non_const_value_type;
973 
974  using execution_space = typename Node::execution_space;
975  using range_type = Kokkos::RangePolicy<execution_space, LO>;
976 
977  LO blocksize = blockMatrix.getBlockSize();
978  const offset_type bs2 = blocksize * blocksize;
979  size_t block_nnz = blockMatrix.getLocalNumEntries();
980  size_t point_nnz = block_nnz * bs2;
981 
982  // We can get these from the blockMatrix directly
983  RCP<const map_type> pointDomainMap = blockMatrix.getDomainMap();
984  RCP<const map_type> pointRangeMap = blockMatrix.getRangeMap();
985 
986  // We need to generate the row/col Map ourselves.
987  RCP<const map_type> blockRowMap = blockMatrix.getRowMap();
988  RCP<const map_type> pointRowMap = createPointMap<LO, GO, Node>(blocksize, *blockRowMap);
989 
990  RCP<const map_type> blockColMap = blockMatrix.getColMap();
991  RCP<const map_type> pointColMap = createPointMap<LO, GO, Node>(blocksize, *blockColMap);
992 
993  // Get the last few things
994 
995  const crs_graph_type &blockGraph = blockMatrix.getCrsGraph();
996  LO point_rows = (LO)pointRowMap->getLocalNumElements();
997  LO block_rows = (LO)blockRowMap->getLocalNumElements();
998  auto blockValues = blockMatrix.getValuesDevice();
999  auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1000  row_map_type_const blockRowptr = blockLocalGraph.row_map;
1001  entries_type_const blockColind = blockLocalGraph.entries;
1002 
1003  // Generate the point matrix rowptr / colind / values
1004  row_map_type rowptr("row_map", point_rows + 1);
1005  entries_type colind("entries", point_nnz);
1006  values_type values("values", point_nnz);
1007 
1008  // Pre-fill the rowptr
1009  Kokkos::parallel_for(
1010  "fillRowPtr", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
1011  offset_type base = blockRowptr[i];
1012  offset_type diff = blockRowptr[i + 1] - base;
1013  for (LO j = 0; j < blocksize; j++) {
1014  rowptr[i * blocksize + j] = base * bs2 + j * diff * blocksize;
1015  }
1016 
1017  // Fill the last guy, if we're on the final entry
1018  if (i == block_rows - 1) {
1019  rowptr[block_rows * blocksize] = blockRowptr[i + 1] * bs2;
1020  }
1021  });
1022 
1023  Kokkos::parallel_for(
1024  "copyEntriesAndValues", range_type(0, block_rows), KOKKOS_LAMBDA(const LO i) {
1025  const offset_type blkBeg = blockRowptr[i];
1026  const offset_type numBlocks = blockRowptr[i + 1] - blkBeg;
1027 
1028  // For each block in the row...
1029  for (offset_type block = 0; block < numBlocks; block++) {
1030  column_type point_col_base = blockColind[blkBeg + block] * blocksize;
1031  little_block_type my_block(blockValues.data() + (blkBeg + block) * bs2, blocksize, blocksize);
1032 
1033  // For each entry in the block...
1034  for (LO little_row = 0; little_row < blocksize; little_row++) {
1035  offset_type point_row_offset = rowptr[i * blocksize + little_row];
1036  for (LO little_col = 0; little_col < blocksize; little_col++) {
1037  colind[point_row_offset + block * blocksize + little_col] = point_col_base + little_col;
1038  values[point_row_offset + block * blocksize + little_col] = my_block(little_row, little_col);
1039  }
1040  }
1041  }
1042  });
1043 
1044  // Generate the matrix
1045  RCP<crs_matrix_type> pointCrsMatrix = rcp(new crs_matrix_type(pointRowMap, pointColMap, 0));
1046  pointCrsMatrix->setAllValues(rowptr, colind, values);
1047 
1048  // FUTURE OPTIMIZATION: Directly compute import/export, rather than letting ESFC do it
1049  pointCrsMatrix->expertStaticFillComplete(pointDomainMap, pointRangeMap);
1050 
1051  return pointCrsMatrix;
1052 }
1053 
1054 } // namespace Tpetra
1055 
1056 //
1057 // Explicit instantiation macro for blockCrsMatrixWriter (various
1058 // overloads), writeMatrixStrip, and convertToBlockCrsMatrix.
1059 //
1060 // Must be expanded from within the Tpetra namespace!
1061 //
1062 #define TPETRA_BLOCKCRSMATRIX_HELPERS_INSTANT(S, LO, GO, NODE) \
1063  template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::string const &fileName); \
1064  template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::string const &fileName, Teuchos::ParameterList const &params); \
1065  template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os); \
1066  template void blockCrsMatrixWriter(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1067  template void writeMatrixStrip(BlockCrsMatrix<S, LO, GO, NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1068  template Teuchos::RCP<BlockCrsMatrix<S, LO, GO, NODE>> convertToBlockCrsMatrix(const CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize, bool use_LID); \
1069  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> fillLogicalBlocks(const CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize); \
1070  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> unfillFormerBlockCrs(const CrsMatrix<S, LO, GO, NODE> &pointMatrix); \
1071  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE>> convertToCrsMatrix(const BlockCrsMatrix<S, LO, GO, NODE> &blockMatrix); \
1072  template Teuchos::RCP<CrsGraph<LO, GO, NODE>> getBlockCrsGraph(const Tpetra::CrsMatrix<S, LO, GO, NODE> &pointMatrix, const LO &blockSize, bool use_local_ID);
1073 
1074 //
1075 // Explicit instantiation macro for createMeshMap / createPointMap
1076 //
1077 #define TPETRA_CREATEMESHMAP_INSTANT(LO, GO, NODE) \
1078  template Teuchos::RCP<const Map<LO, GO, NODE>> createMeshMap(const LO &blockSize, const Map<LO, GO, NODE> &pointMap, bool use_local_ID); \
1079  template Teuchos::RCP<const Map<LO, GO, NODE>> createPointMap(const LO &blockSize, const Map<LO, GO, NODE> &blockMap);
1080 
1081 #endif // TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createPointMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &blockMap)
Helper function to generate a point map from a block map (with a given block size) GIDs associated wi...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void blockCrsMatrixWriter(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::string const &fileName)
Helper function to write a BlockCrsMatrix. Calls the 3-argument version.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > unfillFormerBlockCrs(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix)
Unfill all point entries in a logical block of a CrsMatrix with zeroes. This can be called after conv...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createMeshMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &pointMap, bool use_local_ID=false)
Helper function to generate a mesh map from a point map. Important! It&#39;s assumed that point GIDs asso...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, over all processes in the graph&#39;s communicator...
global_indices_array_device_type getMyGlobalIndicesDevice() const
Return a view of the global indices owned by this process on the Map&#39;s device.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix&#39;s graph, as a CrsGraph.
size_t getLocalNumRows() const override
get the local number of block rows
void writeMatrixStrip(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::ostream &os, Teuchos::ParameterList const &params)
Helper function called by blockCrsMatrixWriter.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
Teuchos::ArrayView< const global_ordinal_type > getLocalElementList() const
Return a NONOWNING view of the global indices owned by this process.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
Insert new values that don&#39;t currently exist.
global_ordinal_type getIndexBase() const
The index base for this Map.
global_size_t getGlobalNumRows() const override
get the global number of block rows
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > fillLogicalBlocks(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize)
Fill all point entries in a logical block of a CrsMatrix with zeroes. This should be called before co...
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LO, GO, Node > > convertToCrsMatrix(const Tpetra::BlockCrsMatrix< Scalar, LO, GO, Node > &blockMatrix)
Non-member constructor that creates a point CrsMatrix from an existing BlockCrsMatrix.
Teuchos::RCP< CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > createCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, size_t maxNumEntriesPerRow=0, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember function to create an empty CrsGraph given a row Map and the max number of entries allowed ...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< Tpetra::CrsGraph< LO, GO, Node > > getBlockCrsGraph(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates the CrsGraph of a BlockCrsMatrix from an existing point CrsMatrix...
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > convertToBlockCrsMatrix(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.