Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_Helpers_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
11 #define TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
12 
14 
15 #include "Tpetra_BlockCrsMatrix.hpp"
16 #include "Tpetra_CrsMatrix.hpp"
17 #include "Tpetra_HashTable.hpp"
18 #include "Tpetra_Import.hpp"
19 #include "Tpetra_Map.hpp"
20 #include "Tpetra_MultiVector.hpp"
21 #include "Teuchos_ParameterList.hpp"
22 #include "Teuchos_ScalarTraits.hpp"
23 #include <ctime>
24 #include <fstream>
25 
26 namespace Tpetra {
27 
28  template<class Scalar, class LO, class GO, class Node>
29  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::string const &fileName) {
30  Teuchos::ParameterList pl;
31  std::ofstream out;
32  out.open(fileName.c_str());
33  blockCrsMatrixWriter(A, out, pl);
34  }
35 
36  template<class Scalar, class LO, class GO, class Node>
37  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::string const &fileName, Teuchos::ParameterList const &params) {
38  std::ofstream out;
39  out.open(fileName.c_str());
40  blockCrsMatrixWriter(A, out, params);
41  }
42 
43  template<class Scalar, class LO, class GO, class Node>
44  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os) {
45  Teuchos::ParameterList pl;
46  blockCrsMatrixWriter(A, os, pl);
47  }
48 
49  template<class Scalar, class LO, class GO, class Node>
50  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
51 
52  using Teuchos::RCP;
53  using Teuchos::rcp;
54 
55  typedef Teuchos::OrdinalTraits<GO> TOT;
56  typedef BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
57  typedef Tpetra::Import<LO, GO, Node> import_type;
58  typedef Tpetra::Map<LO, GO, Node> map_type;
60  typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
61 
62  RCP<const map_type> const rowMap = A.getRowMap(); //"mesh" map
63  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
64  const int myRank = comm->getRank();
65  const size_t numProcs = comm->getSize();
66 
67  // If true, force use of the import strip-mining infrastructure. This is useful for debugging on one process.
68  bool alwaysUseParallelAlgorithm = false;
69  if (params.isParameter("always use parallel algorithm"))
70  alwaysUseParallelAlgorithm = params.get<bool>("always use parallel algorithm");
71  bool printMatrixMarketHeader = true;
72  if (params.isParameter("print MatrixMarket header"))
73  printMatrixMarketHeader = params.get<bool>("print MatrixMarket header");
74 
75  if (printMatrixMarketHeader && myRank==0) {
76  std::time_t now = std::time(NULL);
77 
78  const std::string dataTypeStr =
79  Teuchos::ScalarTraits<Scalar>::isComplex ? "complex" : "real";
80 
81  // Explanation of first line of file:
82  // - "%%MatrixMarket" is the tag for Matrix Market format.
83  // - "matrix" is what we're printing.
84  // - "coordinate" means sparse (triplet format), rather than dense.
85  // - "real" / "complex" is the type (in an output format sense,
86  // not in a C++ sense) of each value of the matrix. (The
87  // other two possibilities are "integer" (not really necessary
88  // here) and "pattern" (no values, just graph).)
89  os << "%%MatrixMarket matrix coordinate " << dataTypeStr << " general" << std::endl;
90  os << "% time stamp: " << ctime(&now);
91  os << "% written from " << numProcs << " processes" << std::endl;
92  os << "% point representation of Tpetra::BlockCrsMatrix" << std::endl;
93  size_t numRows = A.getGlobalNumRows();
94  size_t numCols = A.getGlobalNumCols();
95  os << "% " << numRows << " block rows, " << numCols << " block columns" << std::endl;
96  const LO blockSize = A.getBlockSize();
97  os << "% block size " << blockSize << std::endl;
98  os << numRows*blockSize << " " << numCols*blockSize << " " << A.getGlobalNumEntries()*blockSize*blockSize << std::endl;
99  }
100 
101  if (numProcs==1 && !alwaysUseParallelAlgorithm) {
102  writeMatrixStrip(A,os,params);
103  } else {
104  size_t numRows = rowMap->getLocalNumElements();
105 
106  //Create source map
107  RCP<const map_type> allMeshGidsMap = rcp(new map_type(TOT::invalid(), numRows, A.getIndexBase(), comm));
108  //Create and populate vector of mesh GIDs corresponding to this pid's rows.
109  //This vector will be imported one pid's worth of information at a time to pid 0.
110  mv_type allMeshGids(allMeshGidsMap,1);
111  Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
112 
113  for (size_t i=0; i<numRows; i++)
114  allMeshGidsData[i] = rowMap->getGlobalElement(i);
115  allMeshGidsData = Teuchos::null;
116 
117  // Now construct a RowMatrix on PE 0 by strip-mining the rows of the input matrix A.
118  size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
119  size_t remainder = allMeshGids.getGlobalLength() % numProcs;
120  size_t curStart = 0;
121  size_t curStripSize = 0;
122  Teuchos::Array<GO> importMeshGidList;
123  for (size_t i=0; i<numProcs; i++) {
124  if (myRank==0) { // Only PE 0 does this part
125  curStripSize = stripSize;
126  if (i<remainder) curStripSize++; // handle leftovers
127  importMeshGidList.resize(curStripSize); // Set size of vector to max needed
128  for (size_t j=0; j<curStripSize; j++) importMeshGidList[j] = j + curStart + A.getIndexBase();
129  curStart += curStripSize;
130  }
131  // The following import map should be non-trivial only on PE 0.
132  TEUCHOS_TEST_FOR_EXCEPTION(myRank>0 && curStripSize!=0,
133  std::runtime_error, "Tpetra::blockCrsMatrixWriter: (pid "
134  << myRank << ") map size should be zero, but is " << curStripSize);
135  RCP<map_type> importMeshGidMap = rcp(new map_type(TOT::invalid(), importMeshGidList(), A.getIndexBase(), comm));
136  import_type gidImporter(allMeshGidsMap, importMeshGidMap);
137  mv_type importMeshGids(importMeshGidMap, 1);
138  importMeshGids.doImport(allMeshGids, gidImporter, INSERT);
139 
140  // importMeshGids now has a list of GIDs for the current strip of matrix rows.
141  // Use these values to build another importer that will get rows of the matrix.
142 
143  // The following import map will be non-trivial only on PE 0.
144  Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
145  Teuchos::Array<GO> importMeshGidsGO;
146  importMeshGidsGO.reserve(importMeshGidsData.size());
147  for (typename Teuchos::ArrayRCP<const GO>::size_type j=0; j<importMeshGidsData.size(); ++j)
148  importMeshGidsGO.push_back(importMeshGidsData[j]);
149  RCP<const map_type> importMap = rcp(new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm) );
150 
151  import_type importer(rowMap,importMap );
152  size_t numEntriesPerRow = A.getCrsGraph().getGlobalMaxNumRowEntries();
153  RCP<crs_graph_type> graph = createCrsGraph(importMap,numEntriesPerRow);
154  RCP<const map_type> domainMap = A.getCrsGraph().getDomainMap();
155  graph->doImport(A.getCrsGraph(), importer, INSERT);
156  graph->fillComplete(domainMap, importMap);
157 
158  block_crs_matrix_type importA(*graph, A.getBlockSize());
159  importA.doImport(A, importer, INSERT);
160 
161  // Finally we are ready to write this strip of the matrix
162  writeMatrixStrip(importA, os, params);
163  }
164  }
165  }
166 
167  template<class Scalar, class LO, class GO, class Node>
168  void writeMatrixStrip(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
169  using Teuchos::RCP;
170  using map_type = Tpetra::Map<LO, GO, Node>;
171  using bcrs_type = BlockCrsMatrix<Scalar,LO,GO,Node>;
172  using bcrs_local_inds_host_view_type = typename bcrs_type::local_inds_host_view_type;
173  using bcrs_values_host_view_type = typename bcrs_type::values_host_view_type;
174  using impl_scalar_type = typename bcrs_type::impl_scalar_type;
175 
176  size_t numRows = A.getGlobalNumRows();
177  RCP<const map_type> rowMap = A.getRowMap();
178  RCP<const map_type> colMap = A.getColMap();
179  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
180  const int myRank = comm->getRank();
181 
182  const size_t meshRowOffset = rowMap->getIndexBase();
183  const size_t meshColOffset = colMap->getIndexBase();
184  TEUCHOS_TEST_FOR_EXCEPTION(meshRowOffset != meshColOffset,
185  std::runtime_error, "Tpetra::writeMatrixStrip: "
186  "mesh row index base != mesh column index base");
187 
188  if (myRank !=0) {
189 
190  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumRows() != 0,
191  std::runtime_error, "Tpetra::writeMatrixStrip: pid "
192  << myRank << " should have 0 rows but has " << A.getLocalNumRows());
193  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumCols() != 0,
194  std::runtime_error, "Tpetra::writeMatrixStrip: pid "
195  << myRank << " should have 0 columns but has " << A.getLocalNumCols());
196 
197  } else {
198 
199  TEUCHOS_TEST_FOR_EXCEPTION(numRows != A.getLocalNumRows(),
200  std::runtime_error, "Tpetra::writeMatrixStrip: "
201  "number of rows on pid 0 does not match global number of rows");
202 
203 
204  int err = 0;
205  const LO blockSize = A.getBlockSize();
206  const size_t numLocalRows = A.getLocalNumRows();
207  bool precisionChanged=false;
208  int oldPrecision = 0; // avoid "unused variable" warning
209  if (params.isParameter("precision")) {
210  oldPrecision = os.precision(params.get<int>("precision"));
211  precisionChanged=true;
212  }
213  int pointOffset = 1;
214  if (params.isParameter("zero-based indexing")) {
215  if (params.get<bool>("zero-based indexing") == true)
216  pointOffset = 0;
217  }
218 
219  size_t localRowInd;
220  for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
221 
222  // Get a view of the current row.
223  bcrs_local_inds_host_view_type localColInds;
224  bcrs_values_host_view_type vals;
225  LO numEntries;
226  A.getLocalRowView (localRowInd, localColInds, vals); numEntries = localColInds.extent(0);
227  GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
228 
229  for (LO k = 0; k < numEntries; ++k) {
230  GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
231  const impl_scalar_type* curBlock = vals.data() + blockSize * blockSize * k;
232  // Blocks are stored in row-major format.
233  for (LO j = 0; j < blockSize; ++j) {
234  GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
235  for (LO i = 0; i < blockSize; ++i) {
236  GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
237  const impl_scalar_type curVal = curBlock[i + j * blockSize];
238 
239  os << globalPointRowID << " " << globalPointColID << " ";
240  if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
241  // Matrix Market format wants complex values to be
242  // written as space-delimited pairs. See Bug 6469.
243  os << Teuchos::ScalarTraits<impl_scalar_type>::real (curVal) << " "
244  << Teuchos::ScalarTraits<impl_scalar_type>::imag (curVal);
245  }
246  else {
247  os << curVal;
248  }
249  os << std::endl;
250  }
251  }
252  }
253  }
254  if (precisionChanged)
255  os.precision(oldPrecision);
256  TEUCHOS_TEST_FOR_EXCEPTION(err != 0,
257  std::runtime_error, "Tpetra::writeMatrixStrip: "
258  "error getting view of local row " << localRowInd);
259 
260  }
261 
262  }
263 
264  template<class Scalar, class LO, class GO, class Node>
265  Teuchos::RCP<Tpetra::CrsGraph<LO, GO, Node> >
266  getBlockCrsGraph(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize, bool use_LID)
267  {
268  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph", getBlockCrsGraph0);
269  /*
270  ASSUMPTIONS:
271 
272  1) In point matrix, all entries associated with a little block are present (even if they are zero).
273  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
274  3) Point column map and block column map are ordered consistently.
275  */
276 
277  using Teuchos::Array;
278  using Teuchos::ArrayView;
279  using Teuchos::RCP;
280 
281  typedef Tpetra::Map<LO,GO,Node> map_type;
282  typedef Tpetra::CrsGraph<LO,GO,Node> crs_graph_type;
283  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
284 
285  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
286  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
287  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
288 
289  using offset_type = typename row_map_type::non_const_value_type;
290 
291  using execution_space = typename Node::execution_space;
292  using range_type = Kokkos::RangePolicy<execution_space, LO>;
293 
294  const map_type &pointRowMap = *(pointMatrix.getRowMap());
295  RCP<const map_type> meshRowMap, meshColMap, meshDomainMap, meshRangeMap;
296 
297  const map_type &pointColMap = *(pointMatrix.getColMap());
298  const map_type &pointDomainMap = *(pointMatrix.getDomainMap());
299  const map_type &pointRangeMap = *(pointMatrix.getRangeMap());
300 
301  {
302  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::createMeshMaps", getBlockCrsGraph1);
303  meshRowMap = createMeshMap<LO,GO,Node>(blockSize, pointRowMap, use_LID);
304  meshColMap = createMeshMap<LO,GO,Node>(blockSize, pointColMap, use_LID);
305  meshDomainMap = createMeshMap<LO,GO,Node>(blockSize, pointDomainMap, use_LID);
306  meshRangeMap = createMeshMap<LO,GO,Node>(blockSize, pointRangeMap, use_LID);
307  Kokkos::DefaultExecutionSpace().fence();
308  }
309 
310  if(meshColMap.is_null()) throw std::runtime_error("ERROR: Cannot create mesh colmap");
311 
312  auto localMeshColMap = meshColMap->getLocalMap();
313  auto localPointColMap = pointColMap.getLocalMap();
314  auto localPointRowMap = pointRowMap.getLocalMap();
315 
316  RCP<crs_graph_type> meshCrsGraph;
317 
318  const offset_type bs2 = blockSize * blockSize;
319 
320  if (use_LID) {
321  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::LID", getBlockCrsGraph2);
322  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
323  auto pointRowptr = pointLocalGraph.row_map;
324  auto pointColind = pointLocalGraph.entries;
325 
326  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
327  std::runtime_error, "Tpetra::getBlockCrsGraph: "
328  "local number of non zero entries is not a multiple of blockSize^2 ");
329 
330  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
331  row_map_type blockRowptr("blockRowptr", block_rows+1);
332  entries_type blockColind("blockColind", pointColind.extent(0)/(bs2));
333 
334  Kokkos::parallel_for("fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) {
335 
336  const LO offset_b = pointRowptr(i*blockSize)/bs2;
337  const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
338 
339  if (i==block_rows-1)
340  blockRowptr(i+1) = offset_b_max;
341  blockRowptr(i) = offset_b;
342 
343  const LO offset_p = pointRowptr(i*blockSize);
344 
345  for (LO k=0; k<offset_b_max-offset_b; ++k) {
346  blockColind(offset_b + k) = pointColind(offset_p + k * blockSize)/blockSize;
347  }
348  });
349 
350  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
351  meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
352  Kokkos::DefaultExecutionSpace().fence();
353  }
354  else {
355  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::getBlockCrsGraph::GID", getBlockCrsGraph3);
356  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
357  auto pointRowptr = pointLocalGraph.row_map;
358  auto pointColind = pointLocalGraph.entries;
359 
360  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
361  std::runtime_error, "Tpetra::getBlockCrsGraph: "
362  "local number of non zero entries is not a multiple of blockSize^2 ");
363 
364  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
365  row_map_type blockRowptr("blockRowptr", block_rows+1);
366  entries_type blockColind("blockColind", pointColind.extent(0)/(bs2));
367 
368  Kokkos::parallel_for("fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) {
369 
370  const LO offset_b = pointRowptr(i*blockSize)/bs2;
371  const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
372 
373  if (i==block_rows-1)
374  blockRowptr(i+1) = offset_b_max;
375  blockRowptr(i) = offset_b;
376 
377  const LO offset_p = pointRowptr(i*blockSize);
378  const LO offset_p_max = pointRowptr((i+1)*blockSize);
379 
380  LO filled_block = 0;
381  for (LO p_i=0; p_i<offset_p_max-offset_p; ++p_i) {
382  auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i))/blockSize;
383  auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
384 
385  bool visited = false;
386  for (LO k=0; k<filled_block; ++k) {
387  if (blockColind(offset_b + k) == bcol_LID)
388  visited = true;
389  }
390  if (!visited) {
391  blockColind(offset_b + filled_block) = bcol_LID;
392  ++filled_block;
393  }
394  }
395  });
396 
397  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
398  meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
399  Kokkos::DefaultExecutionSpace().fence();
400  }
401 
402  return meshCrsGraph;
403  }
404 
405  template<class Scalar, class LO, class GO, class Node>
406  Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node> >
407  convertToBlockCrsMatrix(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize, bool use_LID)
408  {
409  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix", convertToBlockCrsMatrix0);
410  /*
411  ASSUMPTIONS:
412 
413  1) In point matrix, all entries associated with a little block are present (even if they are zero).
414  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
415  3) Point column map and block column map are ordered consistently.
416  */
417 
418  using Teuchos::Array;
419  using Teuchos::ArrayView;
420  using Teuchos::RCP;
421 
422  typedef Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node> block_crs_matrix_type;
423  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
424 
425  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
426  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
427  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
428  using values_type = typename local_matrix_device_type::values_type::non_const_type;
429 
430  using offset_type = typename row_map_type::non_const_value_type;
431 
432  using execution_space = typename Node::execution_space;
433  using range_type = Kokkos::RangePolicy<execution_space, LO>;
434 
435  RCP<block_crs_matrix_type> blockMatrix;
436 
437  const offset_type bs2 = blockSize * blockSize;
438 
439  auto meshCrsGraph = getBlockCrsGraph(pointMatrix, blockSize, use_LID);
440 
441  if (use_LID) {
442  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::LID", convertToBlockCrsMatrix1);
443  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
444  auto pointRowptr = pointLocalGraph.row_map;
445  auto pointColind = pointLocalGraph.entries;
446 
447  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
448  values_type blockValues("values", meshCrsGraph->getLocalNumEntries()*bs2);
449  auto pointValues = pointMatrix.getLocalValuesDevice (Access::ReadOnly);
450  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
451 
452  Kokkos::parallel_for("copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
453  const offset_type blkBeg = blockRowptr[i];
454  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
455 
456  // For each block in the row...
457  for (offset_type block=0; block < numBlocks; block++) {
458 
459  // For each entry in the block...
460  for(LO little_row=0; little_row<blockSize; little_row++) {
461  offset_type point_row_offset = pointRowptr[i*blockSize + little_row];
462  for(LO little_col=0; little_col<blockSize; little_col++) {
463  blockValues((blkBeg+block) * bs2 + little_row * blockSize + little_col) =
464  pointValues[point_row_offset + block*blockSize + little_col];
465  }
466  }
467 
468  }
469  });
470  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
471  Kokkos::DefaultExecutionSpace().fence();
472  }
473  else {
474  TEUCHOS_FUNC_TIME_MONITOR_DIFF("Tpetra::convertToBlockCrsMatrix::GID", convertToBlockCrsMatrix2);
475  auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
476  auto localPointColMap = pointMatrix.getColMap()->getLocalMap();
477 
478  values_type blockValues("values", meshCrsGraph->getLocalNumEntries()*bs2);
479  {
480  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
481  auto pointRowptr = pointLocalGraph.row_map;
482  auto pointColind = pointLocalGraph.entries;
483 
484  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
485  auto pointValues = pointMatrix.getLocalValuesDevice (Access::ReadOnly);
486  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
487  auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
488 
489  row_map_type pointGColind("pointGColind", pointColind.extent(0));
490 
491  Kokkos::parallel_for("computePointGColind",range_type(0,pointColind.extent(0)),KOKKOS_LAMBDA(const LO i) {
492  pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
493  });
494 
495  row_map_type blockGColind("blockGColind", blockColind.extent(0));
496 
497  Kokkos::parallel_for("computeBlockGColind",range_type(0,blockGColind.extent(0)),KOKKOS_LAMBDA(const LO i) {
498  blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
499  });
500 
501  Kokkos::parallel_for("copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
502  const offset_type blkBeg = blockRowptr[i];
503  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
504 
505  for (offset_type point_i=0; point_i < pointRowptr[i*blockSize + 1] - pointRowptr[i*blockSize]; point_i++) {
506 
507  offset_type block_inv=static_cast<offset_type>(-1);
508  offset_type little_col_inv=static_cast<offset_type>(-1);
509  for (offset_type block_2=0; block_2 < numBlocks; block_2++) {
510  for (LO little_col_2=0; little_col_2 < blockSize; little_col_2++) {
511  if (blockGColind(blkBeg+block_2)*blockSize + little_col_2 == pointGColind(pointRowptr[i*blockSize] + point_i)) {
512  block_inv = block_2;
513  little_col_inv = little_col_2;
514  break;
515  }
516  }
517  if (block_inv!=static_cast<offset_type>(-1))
518  break;
519  }
520 
521  for(LO little_row=0; little_row<blockSize; little_row++) {
522  blockValues((blkBeg+block_inv) * bs2 + little_row * blockSize + little_col_inv) = pointValues[pointRowptr[i*blockSize+little_row] + point_i];
523  }
524  }
525  });
526  }
527  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
528  Kokkos::DefaultExecutionSpace().fence();
529  }
530 
531  return blockMatrix;
532 
533  }
534 
535  template<class Scalar, class LO, class GO, class Node>
536  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
537  fillLogicalBlocks(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize)
538  {
540  using execution_space = typename Node::execution_space;
541  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
542  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
543  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
544  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
545  using team_policy = Kokkos::TeamPolicy<execution_space>;
546  using member_type = typename team_policy::member_type;
547  using scratch_view = Kokkos::View<bool*, typename execution_space::scratch_memory_space>;
548  using Ordinal = typename dev_row_view_t::non_const_value_type;
549 
550  // Get structure / values
551  auto local = pointMatrix.getLocalMatrixDevice();
552  auto row_ptrs = local.graph.row_map;
553  auto col_inds = local.graph.entries;
554  auto values = local.values;
555  const auto nrows = pointMatrix.getLocalNumRows();
556 
557  //
558  // Populate all active blocks, they must be fully populated with entries so fill with zeroes
559  //
560 
561  // Make row workspace views
562  dev_row_view_t new_rowmap("new_rowmap", nrows+1);
563  const auto blocks_per_row = nrows / blockSize; // assumes square matrix
564  dev_row_view_t active_block_row_map("active_block_row_map", blocks_per_row + 1);
565  const int max_threads = execution_space::concurrency();
566  assert(blockSize > 1);
567  assert(nrows % blockSize == 0);
568  const int mem_level = 1;
569  const int bytes = scratch_view::shmem_size(blocks_per_row);
570 
571  if (max_threads >= blockSize) {
572  // Prefer 1 team per block since this will require a lot less scratch memory
573  team_policy tp(blocks_per_row, blockSize);
574 
575  // Count active blocks
576  Kokkos::parallel_for("countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
577  Ordinal block_row = team.league_rank();
578 
579  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
580  Kokkos::single(
581  Kokkos::PerTeam(team), [&] () {
582  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
583  row_block_active(row_block_idx) = false;
584  }
585  });
586  team.team_barrier();
587 
588  // All threads in a team scan a blocks-worth of rows to see which
589  // blocks are active
590  Kokkos::parallel_for(
591  Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
592 
593  Ordinal row = block_row*blockSize + block_offset;
594  Ordinal row_itr = row_ptrs(row);
595  Ordinal row_end = row_ptrs(row+1);
596 
597  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
598  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
599  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
600  Ordinal curr_nnz_col = col_inds(row_itr);
601  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
602  // This block has at least one nnz in this row
603  row_block_active(row_block_idx) = true;
604  ++row_itr;
605  if (row_itr == row_end) break;
606  curr_nnz_col = col_inds(row_itr);
607  }
608  }
609  });
610 
611  team.team_barrier();
612 
613  Kokkos::single(
614  Kokkos::PerTeam(team), [&] () {
615  Ordinal count = 0;
616  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
617  if (row_block_active(row_block_idx)) {
618  ++count;
619  }
620  }
621  active_block_row_map(block_row) = count;
622  });
623  });
624  }
625  else {
626  // We don't have enough parallelism to make a thread team handle a block, so just
627  // have 1 thread per block
628  team_policy tp(blocks_per_row, 1);
629 
630  // Count active blocks
631  Kokkos::parallel_for("countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
632  Ordinal block_row = team.league_rank();
633 
634  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
635  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
636  row_block_active(row_block_idx) = false;
637  }
638 
639  // One thread scans a blocks-worth of rows to see which blocks are active
640  for (int block_offset=0; block_offset < blockSize; ++block_offset) {
641  Ordinal row = block_row*blockSize + block_offset;
642  Ordinal row_itr = row_ptrs(row);
643  Ordinal row_end = row_ptrs(row+1);
644 
645  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
646  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
647  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
648  Ordinal curr_nnz_col = col_inds(row_itr);
649  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
650  // This block has at least one nnz in this row
651  row_block_active(row_block_idx) = true;
652  ++row_itr;
653  if (row_itr == row_end) break;
654  curr_nnz_col = col_inds(row_itr);
655  }
656  }
657  }
658 
659  Ordinal count = 0;
660  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
661  if (row_block_active(row_block_idx)) {
662  ++count;
663  }
664  }
665  active_block_row_map(block_row) = count;
666  });
667  }
668 
669  Ordinal nnz_block_count = 0;
670 #if KOKKOSKERNELS_VERSION >= 40199
671  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
672  execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
673 #else
674  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
675  dev_row_view_t, execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
676 #endif
677  dev_col_view_t block_col_ids("block_col_ids", nnz_block_count);
678 
679  // Find active blocks
680  if (max_threads >= blockSize) {
681  // Prefer 1 team per block since this will require a lot less scratch memory
682  team_policy tp(blocks_per_row, blockSize);
683 
684  Kokkos::parallel_for("findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
685  Ordinal block_row = team.league_rank();
686 
687  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
688  Kokkos::single(
689  Kokkos::PerTeam(team), [&] () {
690  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
691  row_block_active(row_block_idx) = false;
692  }
693  });
694  team.team_barrier();
695 
696  // All threads in a team scan a blocks-worth of rows to see which
697  // blocks are active
698  Kokkos::parallel_for(
699  Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
700 
701  Ordinal row = block_row*blockSize + block_offset;
702  Ordinal row_itr = row_ptrs(row);
703  Ordinal row_end = row_ptrs(row+1);
704 
705  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
706  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
707  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
708  Ordinal curr_nnz_col = col_inds(row_itr);
709  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
710  // This block has at least one nnz in this row
711  row_block_active(row_block_idx) = true;
712  ++row_itr;
713  if (row_itr == row_end) break;
714  curr_nnz_col = col_inds(row_itr);
715  }
716  }
717  });
718 
719  team.team_barrier();
720 
721  Kokkos::single(
722  Kokkos::PerTeam(team), [&] () {
723  Ordinal offset = active_block_row_map[block_row];
724  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
725  if (row_block_active(row_block_idx)) {
726  block_col_ids(offset) = row_block_idx;
727  ++offset;
728  }
729  }
730  });
731  });
732  }
733  else {
734  team_policy tp(blocks_per_row, 1);
735 
736  Kokkos::parallel_for("findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
737  Ordinal block_row = team.league_rank();
738 
739  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
740  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
741  row_block_active(row_block_idx) = false;
742  }
743 
744  // One thread scans a blocks-worth of rows to see which blocks are active
745  for (int block_offset=0; block_offset < blockSize; ++block_offset) {
746  Ordinal row = block_row*blockSize + block_offset;
747  Ordinal row_itr = row_ptrs(row);
748  Ordinal row_end = row_ptrs(row+1);
749 
750  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
751  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
752  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
753  Ordinal curr_nnz_col = col_inds(row_itr);
754  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
755  // This block has at least one nnz in this row
756  row_block_active(row_block_idx) = true;
757  ++row_itr;
758  if (row_itr == row_end) break;
759  curr_nnz_col = col_inds(row_itr);
760  }
761  }
762  }
763 
764  Ordinal offset = active_block_row_map[block_row];
765  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
766  if (row_block_active(row_block_idx)) {
767  block_col_ids(offset) = row_block_idx;
768  ++offset;
769  }
770  }
771  });
772  }
773 
774  // Sizing
775  Kokkos::parallel_for("sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
776  const auto block_row = row / blockSize;
777  const Ordinal block_row_begin = active_block_row_map(block_row);
778  const Ordinal block_row_end = active_block_row_map(block_row+1);
779  const Ordinal row_nnz = (block_row_end - block_row_begin) * blockSize;
780  new_rowmap(row) = row_nnz;
781  });
782 
783  Ordinal new_nnz_count = 0;
784 #if KOKKOSKERNELS_VERSION >= 40199
785  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
786  execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
787 #else
788  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
789  dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
790 #endif
791  // Now populate cols and vals
792  dev_col_view_t new_col_ids("new_col_ids", new_nnz_count);
793  dev_val_view_t new_vals("new_vals", new_nnz_count);
794  Kokkos::parallel_for("entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
795  Ordinal row_itr = row_ptrs(row);
796  Ordinal row_end = row_ptrs(row+1);
797  Ordinal row_itr_new = new_rowmap(row);
798 
799  Ordinal block_row = row / blockSize;
800  Ordinal block_row_begin = active_block_row_map(block_row);
801  Ordinal block_row_end = active_block_row_map(block_row+1);
802 
803  for (Ordinal row_block_idx = block_row_begin; row_block_idx < block_row_end; ++row_block_idx) {
804  const Ordinal block_col = block_col_ids(row_block_idx);
805  const Ordinal first_possible_col_in_block = block_col * blockSize;
806  const Ordinal first_possible_col_in_next_block = (block_col+1) * blockSize;
807  for (Ordinal possible_col = first_possible_col_in_block; possible_col < first_possible_col_in_next_block; ++possible_col, ++row_itr_new) {
808  new_col_ids(row_itr_new) = possible_col;
809  Ordinal curr_nnz_col = col_inds(row_itr);
810  if (possible_col == curr_nnz_col && row_itr < row_end) {
811  // Already a non-zero entry
812  new_vals(row_itr_new) = values(row_itr);
813  ++row_itr;
814  }
815  }
816  }
817  });
818 
819  // Create new, filled CRS
820  auto crs_row_map = pointMatrix.getRowMap();
821  auto crs_col_map = pointMatrix.getColMap();
822  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
823  result->fillComplete();
824  return result;
825  }
826 
827  template<class Scalar, class LO, class GO, class Node>
828  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
830  {
832  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
833  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
834  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
835  using impl_scalar_t = typename Kokkos::ArithTraits<Scalar>::val_type;
836  using STS = Kokkos::ArithTraits<impl_scalar_t>;
837  using Ordinal = typename dev_row_view_t::non_const_value_type;
838  using execution_space = typename Node::execution_space;
839  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
840 
841  // Get structure / values
842  auto local = pointMatrix.getLocalMatrixHost();
843  auto row_ptrs = local.graph.row_map;
844  auto col_inds = local.graph.entries;
845  auto values = local.values;
846  const auto nrows = pointMatrix.getLocalNumRows();
847 
848  // Sizing and rows
849  dev_row_view_t new_rowmap("new_rowmap", nrows+1);
850  const impl_scalar_t zero = STS::zero();
851  Kokkos::parallel_for("sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
852  const Ordinal row_nnz_begin = row_ptrs(row);
853  Ordinal row_nnzs = 0;
854  for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row+1); ++row_nnz) {
855  const impl_scalar_t value = values(row_nnz);
856  if (value != zero) {
857  ++row_nnzs;
858  }
859  }
860  new_rowmap(row) = row_nnzs;
861  });
862 
863  Ordinal real_nnzs = 0;
864 #if KOKKOSKERNELS_VERSION >= 40199
865  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
866  execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
867 #else
868  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
869  dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
870 #endif
871  // Now populate cols and vals
872  dev_col_view_t new_col_ids("new_col_ids", real_nnzs);
873  dev_val_view_t new_vals("new_vals", real_nnzs);
874  Kokkos::parallel_for("entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
875  Ordinal row_nnzs = 0;
876  const Ordinal old_row_nnz_begin = row_ptrs(row);
877  const Ordinal new_row_nnz_begin = new_rowmap(row);
878  for (Ordinal old_row_nnz = old_row_nnz_begin; old_row_nnz < row_ptrs(row+1); ++old_row_nnz) {
879  const impl_scalar_t value = values(old_row_nnz);
880  if (value != zero) {
881  new_col_ids(new_row_nnz_begin + row_nnzs) = col_inds(old_row_nnz);
882  new_vals(new_row_nnz_begin + row_nnzs) = value;
883  ++row_nnzs;
884  }
885  }
886  });
887 
888  // Create new, unfilled CRS
889  auto crs_row_map = pointMatrix.getRowMap();
890  auto crs_col_map = pointMatrix.getColMap();
891  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
892  result->fillComplete();
893  return result;
894  }
895 
896  template<class LO, class GO, class Node>
897  Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
898  createMeshMap (const LO& blockSize, const Tpetra::Map<LO,GO,Node>& pointMap, bool use_LID)
899  {
900  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
901  typedef Tpetra::Map<LO,GO,Node> map_type;
902 
903  if(use_LID) {
904 
905  using execution_space = typename Node::execution_space;
906  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
907 
908  auto pointGlobalID = pointMap.getMyGlobalIndicesDevice();
909  LO block_rows = pointGlobalID.extent(0)/blockSize;
910  Kokkos::View<GO*, typename map_type::device_type> meshGlobalID("meshGlobalID", block_rows);
911 
912  Kokkos::parallel_for("fillMeshMap",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) {
913  meshGlobalID(i) = pointGlobalID(i*blockSize)/blockSize;
914  });
915 
916  Teuchos::RCP<const map_type> meshMap = Teuchos::rcp( new map_type(TOT::invalid(), meshGlobalID, 0, pointMap.getComm()) );
917  return meshMap;
918  }
919  else {
920  //calculate mesh GIDs
921  Teuchos::ArrayView<const GO> pointGids = pointMap.getLocalElementList();
922  Teuchos::Array<GO> meshGids;
923  GO indexBase = pointMap.getIndexBase();
924 
925  // Use hash table to track whether we've encountered this GID previously. This will happen
926  // when striding through the point DOFs in a block. It should not happen otherwise.
927  // I don't use sort/make unique because I don't want to change the ordering.
928  meshGids.reserve(pointGids.size());
929  Tpetra::Details::HashTable<GO,int> hashTable(pointGids.size());
930  for (int i=0; i<pointGids.size(); ++i) {
931  GO meshGid = (pointGids[i]-indexBase) / blockSize + indexBase;
932  if (hashTable.get(meshGid) == -1) {
933  hashTable.add(meshGid,1); //(key,value)
934  meshGids.push_back(meshGid);
935  }
936  }
937 
938  Teuchos::RCP<const map_type> meshMap = Teuchos::rcp( new map_type(TOT::invalid(), meshGids(), 0, pointMap.getComm()) );
939  return meshMap;
940  }
941  }
942 
943 
944  template<class LO, class GO, class Node>
945  Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
946  createPointMap (const LO& blockSize, const Tpetra::Map<LO,GO,Node>& blockMap)
947  {
948  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
949  typedef Tpetra::Map<LO,GO,Node> map_type;
950 
951  //calculate mesh GIDs
952  Teuchos::ArrayView<const GO> blockGids = blockMap.getLocalElementList();
953  Teuchos::Array<GO> pointGids(blockGids.size() * blockSize);
954  GO indexBase = blockMap.getIndexBase();
955 
956  for(LO i=0, ct=0; i<(LO)blockGids.size(); i++) {
957  GO base = (blockGids[i] - indexBase)* blockSize + indexBase;
958  for(LO j=0; j<blockSize; j++, ct++)
959  pointGids[i*blockSize+j] = base+j;
960  }
961 
962  Teuchos::RCP<const map_type> pointMap = Teuchos::rcp( new map_type(TOT::invalid(), pointGids(), 0, blockMap.getComm()) );
963  return pointMap;
964 
965  }
966 
967 
968  template<class Scalar, class LO, class GO, class Node>
969  Teuchos::RCP<CrsMatrix<Scalar, LO, GO, Node> >
971 
972  using Teuchos::Array;
973  using Teuchos::ArrayView;
974  using Teuchos::RCP;
975 
976  typedef Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node> block_crs_matrix_type;
977  typedef Tpetra::Map<LO,GO,Node> map_type;
978  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
979 
980  using crs_graph_type = typename block_crs_matrix_type::crs_graph_type;
981  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
982  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
983  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
984  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
985  using values_type = typename local_matrix_device_type::values_type::non_const_type;
986 
987  using row_map_type_const = typename local_graph_device_type::row_map_type;
988  using entries_type_const = typename local_graph_device_type::entries_type;
989 
990  using little_block_type = typename block_crs_matrix_type::const_little_block_type;
991  using offset_type = typename row_map_type::non_const_value_type;
992  using column_type = typename entries_type::non_const_value_type;
993 
994  using execution_space = typename Node::execution_space;
995  using range_type = Kokkos::RangePolicy<execution_space, LO>;
996 
997 
998  LO blocksize = blockMatrix.getBlockSize();
999  const offset_type bs2 = blocksize * blocksize;
1000  size_t block_nnz = blockMatrix.getLocalNumEntries();
1001  size_t point_nnz = block_nnz * bs2;
1002 
1003  // We can get these from the blockMatrix directly
1004  RCP<const map_type> pointDomainMap = blockMatrix.getDomainMap();
1005  RCP<const map_type> pointRangeMap = blockMatrix.getRangeMap();
1006 
1007  // We need to generate the row/col Map ourselves.
1008  RCP<const map_type> blockRowMap = blockMatrix.getRowMap();
1009  RCP<const map_type> pointRowMap = createPointMap<LO,GO,Node>(blocksize, *blockRowMap);
1010 
1011  RCP<const map_type> blockColMap = blockMatrix.getColMap();
1012  RCP<const map_type> pointColMap = createPointMap<LO,GO,Node>(blocksize, *blockColMap);
1013 
1014  // Get the last few things
1015 
1016  const crs_graph_type & blockGraph = blockMatrix.getCrsGraph();
1017  LO point_rows = (LO) pointRowMap->getLocalNumElements();
1018  LO block_rows = (LO) blockRowMap->getLocalNumElements();
1019  auto blockValues = blockMatrix.getValuesDevice();
1020  auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1021  row_map_type_const blockRowptr = blockLocalGraph.row_map;
1022  entries_type_const blockColind = blockLocalGraph.entries;
1023 
1024  // Generate the point matrix rowptr / colind / values
1025  row_map_type rowptr("row_map", point_rows+1);
1026  entries_type colind("entries", point_nnz);
1027  values_type values("values", point_nnz);
1028 
1029  // Pre-fill the rowptr
1030  Kokkos::parallel_for("fillRowPtr",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
1031  offset_type base = blockRowptr[i];
1032  offset_type diff = blockRowptr[i+1] - base;
1033  for(LO j=0; j<blocksize; j++) {
1034  rowptr[i*blocksize +j] = base*bs2 + j*diff*blocksize;
1035  }
1036 
1037  // Fill the last guy, if we're on the final entry
1038  if(i==block_rows-1) {
1039  rowptr[block_rows*blocksize] = blockRowptr[i+1] * bs2;
1040  }
1041  });
1042 
1043 
1044  Kokkos::parallel_for("copyEntriesAndValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
1045  const offset_type blkBeg = blockRowptr[i];
1046  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
1047 
1048  // For each block in the row...
1049  for (offset_type block=0; block < numBlocks; block++) {
1050  column_type point_col_base = blockColind[blkBeg + block] * blocksize;
1051  little_block_type my_block(blockValues.data () + (blkBeg+block) * bs2, blocksize, blocksize);
1052 
1053  // For each entry in the block...
1054  for(LO little_row=0; little_row<blocksize; little_row++) {
1055  offset_type point_row_offset = rowptr[i*blocksize + little_row];
1056  for(LO little_col=0; little_col<blocksize; little_col++) {
1057  colind[point_row_offset + block*blocksize + little_col] = point_col_base + little_col;
1058  values[point_row_offset + block*blocksize + little_col] = my_block(little_row,little_col);
1059  }
1060  }
1061 
1062  }
1063  });
1064 
1065  // Generate the matrix
1066  RCP<crs_matrix_type> pointCrsMatrix = rcp(new crs_matrix_type(pointRowMap, pointColMap, 0));
1067  pointCrsMatrix->setAllValues(rowptr,colind,values);
1068 
1069  // FUTURE OPTIMIZATION: Directly compute import/export, rather than letting ESFC do it
1070  pointCrsMatrix->expertStaticFillComplete(pointDomainMap,pointRangeMap);
1071 
1072  return pointCrsMatrix;
1073  }
1074 
1075 
1076 } // namespace Tpetra
1077 
1078 //
1079 // Explicit instantiation macro for blockCrsMatrixWriter (various
1080 // overloads), writeMatrixStrip, and convertToBlockCrsMatrix.
1081 //
1082 // Must be expanded from within the Tpetra namespace!
1083 //
1084 #define TPETRA_BLOCKCRSMATRIX_HELPERS_INSTANT(S,LO,GO,NODE) \
1085  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName); \
1086  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName, Teuchos::ParameterList const &params); \
1087  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os); \
1088  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1089  template void writeMatrixStrip(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1090  template Teuchos::RCP<BlockCrsMatrix<S, LO, GO, NODE> > convertToBlockCrsMatrix(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize, bool use_LID); \
1091  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > fillLogicalBlocks(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize); \
1092  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > unfillFormerBlockCrs(const CrsMatrix<S, LO, GO, NODE>& pointMatrix); \
1093  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > convertToCrsMatrix(const BlockCrsMatrix<S, LO, GO, NODE>& blockMatrix); \
1094  template Teuchos::RCP<CrsGraph<LO, GO, NODE>> getBlockCrsGraph(const Tpetra::CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize, bool use_local_ID);
1095 
1096 //
1097 // Explicit instantiation macro for createMeshMap / createPointMap
1098 //
1099 #define TPETRA_CREATEMESHMAP_INSTANT(LO,GO,NODE) \
1100  template Teuchos::RCP<const Map<LO,GO,NODE> > createMeshMap (const LO& blockSize, const Map<LO,GO,NODE>& pointMap, bool use_local_ID); \
1101  template Teuchos::RCP<const Map<LO,GO,NODE> > createPointMap (const LO& blockSize, const Map<LO,GO,NODE>& blockMap);
1102 
1103 #endif // TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createPointMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &blockMap)
Helper function to generate a point map from a block map (with a given block size) GIDs associated wi...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void blockCrsMatrixWriter(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::string const &fileName)
Helper function to write a BlockCrsMatrix. Calls the 3-argument version.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > unfillFormerBlockCrs(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix)
Unfill all point entries in a logical block of a CrsMatrix with zeroes. This can be called after conv...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createMeshMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &pointMap, bool use_local_ID=false)
Helper function to generate a mesh map from a point map. Important! It&#39;s assumed that point GIDs asso...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, over all processes in the graph&#39;s communicator...
global_indices_array_device_type getMyGlobalIndicesDevice() const
Return a view of the global indices owned by this process on the Map&#39;s device.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix&#39;s graph, as a CrsGraph.
size_t getLocalNumRows() const override
get the local number of block rows
void writeMatrixStrip(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::ostream &os, Teuchos::ParameterList const &params)
Helper function called by blockCrsMatrixWriter.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
Teuchos::ArrayView< const global_ordinal_type > getLocalElementList() const
Return a NONOWNING view of the global indices owned by this process.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
Insert new values that don&#39;t currently exist.
global_ordinal_type getIndexBase() const
The index base for this Map.
global_size_t getGlobalNumRows() const override
get the global number of block rows
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > fillLogicalBlocks(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize)
Fill all point entries in a logical block of a CrsMatrix with zeroes. This should be called before co...
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LO, GO, Node > > convertToCrsMatrix(const Tpetra::BlockCrsMatrix< Scalar, LO, GO, Node > &blockMatrix)
Non-member constructor that creates a point CrsMatrix from an existing BlockCrsMatrix.
Teuchos::RCP< CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > createCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, size_t maxNumEntriesPerRow=0, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember function to create an empty CrsGraph given a row Map and the max number of entries allowed ...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< Tpetra::CrsGraph< LO, GO, Node > > getBlockCrsGraph(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates the CrsGraph of a BlockCrsMatrix from an existing point CrsMatrix...
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > convertToBlockCrsMatrix(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.