Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_Helpers_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
41 #define TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
42 
44 
45 #include "Tpetra_BlockCrsMatrix.hpp"
46 #include "Tpetra_CrsMatrix.hpp"
47 #include "Tpetra_HashTable.hpp"
48 #include "Tpetra_Import.hpp"
49 #include "Tpetra_Map.hpp"
50 #include "Tpetra_MultiVector.hpp"
51 #include "Teuchos_ParameterList.hpp"
52 #include "Teuchos_ScalarTraits.hpp"
53 #include <ctime>
54 #include <fstream>
55 
56 namespace Tpetra {
57 
58  template<class Scalar, class LO, class GO, class Node>
59  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::string const &fileName) {
60  Teuchos::ParameterList pl;
61  std::ofstream out;
62  out.open(fileName.c_str());
63  blockCrsMatrixWriter(A, out, pl);
64  }
65 
66  template<class Scalar, class LO, class GO, class Node>
67  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::string const &fileName, Teuchos::ParameterList const &params) {
68  std::ofstream out;
69  out.open(fileName.c_str());
70  blockCrsMatrixWriter(A, out, params);
71  }
72 
73  template<class Scalar, class LO, class GO, class Node>
74  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os) {
75  Teuchos::ParameterList pl;
76  blockCrsMatrixWriter(A, os, pl);
77  }
78 
79  template<class Scalar, class LO, class GO, class Node>
80  void blockCrsMatrixWriter(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
81 
82  using Teuchos::RCP;
83  using Teuchos::rcp;
84 
85  typedef Teuchos::OrdinalTraits<GO> TOT;
86  typedef BlockCrsMatrix<Scalar, LO, GO, Node> block_crs_matrix_type;
87  typedef Tpetra::Import<LO, GO, Node> import_type;
88  typedef Tpetra::Map<LO, GO, Node> map_type;
90  typedef Tpetra::CrsGraph<LO, GO, Node> crs_graph_type;
91 
92  RCP<const map_type> const rowMap = A.getRowMap(); //"mesh" map
93  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
94  const int myRank = comm->getRank();
95  const size_t numProcs = comm->getSize();
96 
97  // If true, force use of the import strip-mining infrastructure. This is useful for debugging on one process.
98  bool alwaysUseParallelAlgorithm = false;
99  if (params.isParameter("always use parallel algorithm"))
100  alwaysUseParallelAlgorithm = params.get<bool>("always use parallel algorithm");
101  bool printMatrixMarketHeader = true;
102  if (params.isParameter("print MatrixMarket header"))
103  printMatrixMarketHeader = params.get<bool>("print MatrixMarket header");
104 
105  if (printMatrixMarketHeader && myRank==0) {
106  std::time_t now = std::time(NULL);
107 
108  const std::string dataTypeStr =
109  Teuchos::ScalarTraits<Scalar>::isComplex ? "complex" : "real";
110 
111  // Explanation of first line of file:
112  // - "%%MatrixMarket" is the tag for Matrix Market format.
113  // - "matrix" is what we're printing.
114  // - "coordinate" means sparse (triplet format), rather than dense.
115  // - "real" / "complex" is the type (in an output format sense,
116  // not in a C++ sense) of each value of the matrix. (The
117  // other two possibilities are "integer" (not really necessary
118  // here) and "pattern" (no values, just graph).)
119  os << "%%MatrixMarket matrix coordinate " << dataTypeStr << " general" << std::endl;
120  os << "% time stamp: " << ctime(&now);
121  os << "% written from " << numProcs << " processes" << std::endl;
122  os << "% point representation of Tpetra::BlockCrsMatrix" << std::endl;
123  size_t numRows = A.getGlobalNumRows();
124  size_t numCols = A.getGlobalNumCols();
125  os << "% " << numRows << " block rows, " << numCols << " block columns" << std::endl;
126  const LO blockSize = A.getBlockSize();
127  os << "% block size " << blockSize << std::endl;
128  os << numRows*blockSize << " " << numCols*blockSize << " " << A.getGlobalNumEntries()*blockSize*blockSize << std::endl;
129  }
130 
131  if (numProcs==1 && !alwaysUseParallelAlgorithm) {
132  writeMatrixStrip(A,os,params);
133  } else {
134  size_t numRows = rowMap->getLocalNumElements();
135 
136  //Create source map
137  RCP<const map_type> allMeshGidsMap = rcp(new map_type(TOT::invalid(), numRows, A.getIndexBase(), comm));
138  //Create and populate vector of mesh GIDs corresponding to this pid's rows.
139  //This vector will be imported one pid's worth of information at a time to pid 0.
140  mv_type allMeshGids(allMeshGidsMap,1);
141  Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
142 
143  for (size_t i=0; i<numRows; i++)
144  allMeshGidsData[i] = rowMap->getGlobalElement(i);
145  allMeshGidsData = Teuchos::null;
146 
147  // Now construct a RowMatrix on PE 0 by strip-mining the rows of the input matrix A.
148  size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
149  size_t remainder = allMeshGids.getGlobalLength() % numProcs;
150  size_t curStart = 0;
151  size_t curStripSize = 0;
152  Teuchos::Array<GO> importMeshGidList;
153  for (size_t i=0; i<numProcs; i++) {
154  if (myRank==0) { // Only PE 0 does this part
155  curStripSize = stripSize;
156  if (i<remainder) curStripSize++; // handle leftovers
157  importMeshGidList.resize(curStripSize); // Set size of vector to max needed
158  for (size_t j=0; j<curStripSize; j++) importMeshGidList[j] = j + curStart + A.getIndexBase();
159  curStart += curStripSize;
160  }
161  // The following import map should be non-trivial only on PE 0.
162  TEUCHOS_TEST_FOR_EXCEPTION(myRank>0 && curStripSize!=0,
163  std::runtime_error, "Tpetra::blockCrsMatrixWriter: (pid "
164  << myRank << ") map size should be zero, but is " << curStripSize);
165  RCP<map_type> importMeshGidMap = rcp(new map_type(TOT::invalid(), importMeshGidList(), A.getIndexBase(), comm));
166  import_type gidImporter(allMeshGidsMap, importMeshGidMap);
167  mv_type importMeshGids(importMeshGidMap, 1);
168  importMeshGids.doImport(allMeshGids, gidImporter, INSERT);
169 
170  // importMeshGids now has a list of GIDs for the current strip of matrix rows.
171  // Use these values to build another importer that will get rows of the matrix.
172 
173  // The following import map will be non-trivial only on PE 0.
174  Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
175  Teuchos::Array<GO> importMeshGidsGO;
176  importMeshGidsGO.reserve(importMeshGidsData.size());
177  for (typename Teuchos::ArrayRCP<const GO>::size_type j=0; j<importMeshGidsData.size(); ++j)
178  importMeshGidsGO.push_back(importMeshGidsData[j]);
179  RCP<const map_type> importMap = rcp(new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm) );
180 
181  import_type importer(rowMap,importMap );
182  size_t numEntriesPerRow = A.getCrsGraph().getGlobalMaxNumRowEntries();
183  RCP<crs_graph_type> graph = createCrsGraph(importMap,numEntriesPerRow);
184  RCP<const map_type> domainMap = A.getCrsGraph().getDomainMap();
185  graph->doImport(A.getCrsGraph(), importer, INSERT);
186  graph->fillComplete(domainMap, importMap);
187 
188  block_crs_matrix_type importA(*graph, A.getBlockSize());
189  importA.doImport(A, importer, INSERT);
190 
191  // Finally we are ready to write this strip of the matrix
192  writeMatrixStrip(importA, os, params);
193  }
194  }
195  }
196 
197  template<class Scalar, class LO, class GO, class Node>
198  void writeMatrixStrip(BlockCrsMatrix<Scalar,LO,GO,Node> const &A, std::ostream &os, Teuchos::ParameterList const &params) {
199  using Teuchos::RCP;
200  using map_type = Tpetra::Map<LO, GO, Node>;
201  using bcrs_type = BlockCrsMatrix<Scalar,LO,GO,Node>;
202  using bcrs_local_inds_host_view_type = typename bcrs_type::local_inds_host_view_type;
203  using bcrs_values_host_view_type = typename bcrs_type::values_host_view_type;
204  using impl_scalar_type = typename bcrs_type::impl_scalar_type;
205 
206  size_t numRows = A.getGlobalNumRows();
207  RCP<const map_type> rowMap = A.getRowMap();
208  RCP<const map_type> colMap = A.getColMap();
209  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
210  const int myRank = comm->getRank();
211 
212  const size_t meshRowOffset = rowMap->getIndexBase();
213  const size_t meshColOffset = colMap->getIndexBase();
214  TEUCHOS_TEST_FOR_EXCEPTION(meshRowOffset != meshColOffset,
215  std::runtime_error, "Tpetra::writeMatrixStrip: "
216  "mesh row index base != mesh column index base");
217 
218  if (myRank !=0) {
219 
220  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumRows() != 0,
221  std::runtime_error, "Tpetra::writeMatrixStrip: pid "
222  << myRank << " should have 0 rows but has " << A.getLocalNumRows());
223  TEUCHOS_TEST_FOR_EXCEPTION(A.getLocalNumCols() != 0,
224  std::runtime_error, "Tpetra::writeMatrixStrip: pid "
225  << myRank << " should have 0 columns but has " << A.getLocalNumCols());
226 
227  } else {
228 
229  TEUCHOS_TEST_FOR_EXCEPTION(numRows != A.getLocalNumRows(),
230  std::runtime_error, "Tpetra::writeMatrixStrip: "
231  "number of rows on pid 0 does not match global number of rows");
232 
233 
234  int err = 0;
235  const LO blockSize = A.getBlockSize();
236  const size_t numLocalRows = A.getLocalNumRows();
237  bool precisionChanged=false;
238  int oldPrecision = 0; // avoid "unused variable" warning
239  if (params.isParameter("precision")) {
240  oldPrecision = os.precision(params.get<int>("precision"));
241  precisionChanged=true;
242  }
243  int pointOffset = 1;
244  if (params.isParameter("zero-based indexing")) {
245  if (params.get<bool>("zero-based indexing") == true)
246  pointOffset = 0;
247  }
248 
249  size_t localRowInd;
250  for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
251 
252  // Get a view of the current row.
253  bcrs_local_inds_host_view_type localColInds;
254  bcrs_values_host_view_type vals;
255  LO numEntries;
256  A.getLocalRowView (localRowInd, localColInds, vals); numEntries = localColInds.extent(0);
257  GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
258 
259  for (LO k = 0; k < numEntries; ++k) {
260  GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
261  const impl_scalar_type* curBlock = vals.data() + blockSize * blockSize * k;
262  // Blocks are stored in row-major format.
263  for (LO j = 0; j < blockSize; ++j) {
264  GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
265  for (LO i = 0; i < blockSize; ++i) {
266  GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
267  const impl_scalar_type curVal = curBlock[i + j * blockSize];
268 
269  os << globalPointRowID << " " << globalPointColID << " ";
270  if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
271  // Matrix Market format wants complex values to be
272  // written as space-delimited pairs. See Bug 6469.
273  os << Teuchos::ScalarTraits<impl_scalar_type>::real (curVal) << " "
274  << Teuchos::ScalarTraits<impl_scalar_type>::imag (curVal);
275  }
276  else {
277  os << curVal;
278  }
279  os << std::endl;
280  }
281  }
282  }
283  }
284  if (precisionChanged)
285  os.precision(oldPrecision);
286  TEUCHOS_TEST_FOR_EXCEPTION(err != 0,
287  std::runtime_error, "Tpetra::writeMatrixStrip: "
288  "error getting view of local row " << localRowInd);
289 
290  }
291 
292  }
293 
294  template<class Scalar, class LO, class GO, class Node>
295  Teuchos::RCP<Tpetra::CrsGraph<LO, GO, Node> >
296  getBlockCrsGraph(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize, bool use_LID)
297  {
298 
299  /*
300  ASSUMPTIONS:
301 
302  1) In point matrix, all entries associated with a little block are present (even if they are zero).
303  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
304  3) Point column map and block column map are ordered consistently.
305  */
306 
307  using Teuchos::Array;
308  using Teuchos::ArrayView;
309  using Teuchos::RCP;
310 
311  typedef Tpetra::Map<LO,GO,Node> map_type;
312  typedef Tpetra::CrsGraph<LO,GO,Node> crs_graph_type;
313  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
314 
315  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
316  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
317  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
318 
319  using offset_type = typename row_map_type::non_const_value_type;
320 
321  using execution_space = typename Node::execution_space;
322  using range_type = Kokkos::RangePolicy<execution_space, LO>;
323 
324  const map_type &pointRowMap = *(pointMatrix.getRowMap());
325  RCP<const map_type> meshRowMap = createMeshMap<LO,GO,Node>(blockSize, pointRowMap);
326 
327  const map_type &pointColMap = *(pointMatrix.getColMap());
328  RCP<const map_type> meshColMap = createMeshMap<LO,GO,Node>(blockSize, pointColMap);
329  if(meshColMap.is_null()) throw std::runtime_error("ERROR: Cannot create mesh colmap");
330 
331  auto localMeshColMap = meshColMap->getLocalMap();
332  auto localPointColMap = pointColMap.getLocalMap();
333  auto localPointRowMap = pointRowMap.getLocalMap();
334 
335  const map_type &pointDomainMap = *(pointMatrix.getDomainMap());
336  RCP<const map_type> meshDomainMap = createMeshMap<LO,GO,Node>(blockSize, pointDomainMap);
337 
338  const map_type &pointRangeMap = *(pointMatrix.getRangeMap());
339  RCP<const map_type> meshRangeMap = createMeshMap<LO,GO,Node>(blockSize, pointRangeMap);
340 
341  RCP<crs_graph_type> meshCrsGraph;
342 
343  const offset_type bs2 = blockSize * blockSize;
344 
345  if (use_LID) {
346  TEUCHOS_FUNC_TIME_MONITOR("Tpetra::convertToBlockCrsMatrix::fillCrsGraph");
347  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
348  auto pointRowptr = pointLocalGraph.row_map;
349  auto pointColind = pointLocalGraph.entries;
350 
351  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
352  std::runtime_error, "Tpetra::getBlockCrsGraph: "
353  "local number of non zero entries is not a multiple of blockSize^2 ");
354 
355  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
356  row_map_type blockRowptr("blockRowptr", block_rows+1);
357  entries_type blockColind("blockColind", pointColind.extent(0)/(bs2));
358 
359  Kokkos::parallel_for("fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) {
360 
361  const LO offset_b = pointRowptr(i*blockSize)/bs2;
362  const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
363 
364  if (i==block_rows-1)
365  blockRowptr(i+1) = offset_b_max;
366  blockRowptr(i) = offset_b;
367 
368  const LO offset_p = pointRowptr(i*blockSize);
369 
370  for (LO k=0; k<offset_b_max-offset_b; ++k) {
371  blockColind(offset_b + k) = pointColind(offset_p + k * blockSize)/blockSize;
372  }
373  });
374 
375  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
376  meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
377  Kokkos::DefaultExecutionSpace().fence();
378  }
379  else {
380  TEUCHOS_FUNC_TIME_MONITOR("Tpetra::convertToBlockCrsMatrix::fillCrsGraph");
381  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
382  auto pointRowptr = pointLocalGraph.row_map;
383  auto pointColind = pointLocalGraph.entries;
384 
385  TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
386  std::runtime_error, "Tpetra::getBlockCrsGraph: "
387  "local number of non zero entries is not a multiple of blockSize^2 ");
388 
389  LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
390  row_map_type blockRowptr("blockRowptr", block_rows+1);
391  entries_type blockColind("blockColind", pointColind.extent(0)/(bs2));
392 
393  Kokkos::parallel_for("fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) {
394 
395  const LO offset_b = pointRowptr(i*blockSize)/bs2;
396  const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
397 
398  if (i==block_rows-1)
399  blockRowptr(i+1) = offset_b_max;
400  blockRowptr(i) = offset_b;
401 
402  const LO offset_p = pointRowptr(i*blockSize);
403  const LO offset_p_max = pointRowptr((i+1)*blockSize);
404 
405  LO filled_block = 0;
406  for (LO p_i=0; p_i<offset_p_max-offset_p; ++p_i) {
407  auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i))/blockSize;
408  auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
409 
410  bool visited = false;
411  for (LO k=0; k<filled_block; ++k) {
412  if (blockColind(offset_b + k) == bcol_LID)
413  visited = true;
414  }
415  if (!visited) {
416  blockColind(offset_b + filled_block) = bcol_LID;
417  ++filled_block;
418  }
419  }
420  });
421 
422  meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
423  meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
424  Kokkos::DefaultExecutionSpace().fence();
425  }
426 
427  return meshCrsGraph;
428  }
429 
430  template<class Scalar, class LO, class GO, class Node>
431  Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node> >
432  convertToBlockCrsMatrix(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize, bool use_LID)
433  {
434 
435  /*
436  ASSUMPTIONS:
437 
438  1) In point matrix, all entries associated with a little block are present (even if they are zero).
439  2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps.
440  3) Point column map and block column map are ordered consistently.
441  */
442 
443  using Teuchos::Array;
444  using Teuchos::ArrayView;
445  using Teuchos::RCP;
446 
447  typedef Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node> block_crs_matrix_type;
448  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
449 
450  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
451  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
452  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
453  using values_type = typename local_matrix_device_type::values_type::non_const_type;
454 
455  using offset_type = typename row_map_type::non_const_value_type;
456 
457  using execution_space = typename Node::execution_space;
458  using range_type = Kokkos::RangePolicy<execution_space, LO>;
459 
460  RCP<block_crs_matrix_type> blockMatrix;
461 
462  const offset_type bs2 = blockSize * blockSize;
463 
464  auto meshCrsGraph = getBlockCrsGraph(pointMatrix, blockSize, use_LID);
465 
466  if (use_LID) {
467  TEUCHOS_FUNC_TIME_MONITOR("Tpetra::convertToBlockCrsMatrix::fillBlockCrsMatrix");
468  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
469  auto pointRowptr = pointLocalGraph.row_map;
470  auto pointColind = pointLocalGraph.entries;
471 
472  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
473  values_type blockValues("values", meshCrsGraph->getLocalNumEntries()*bs2);
474  auto pointValues = pointMatrix.getLocalValuesDevice (Access::ReadOnly);
475  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
476 
477  Kokkos::parallel_for("copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
478  const offset_type blkBeg = blockRowptr[i];
479  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
480 
481  // For each block in the row...
482  for (offset_type block=0; block < numBlocks; block++) {
483 
484  // For each entry in the block...
485  for(LO little_row=0; little_row<blockSize; little_row++) {
486  offset_type point_row_offset = pointRowptr[i*blockSize + little_row];
487  for(LO little_col=0; little_col<blockSize; little_col++) {
488  blockValues((blkBeg+block) * bs2 + little_row * blockSize + little_col) =
489  pointValues[point_row_offset + block*blockSize + little_col];
490  }
491  }
492 
493  }
494  });
495  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
496  Kokkos::DefaultExecutionSpace().fence();
497  }
498  else {
499  TEUCHOS_FUNC_TIME_MONITOR("Tpetra::convertToBlockCrsMatrix::fillBlockCrsMatrix");
500  auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
501  auto localPointColMap = pointMatrix.getColMap()->getLocalMap();
502 
503  values_type blockValues("values", meshCrsGraph->getLocalNumEntries()*bs2);
504  {
505  auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice();
506  auto pointRowptr = pointLocalGraph.row_map;
507  auto pointColind = pointLocalGraph.entries;
508 
509  offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
510  auto pointValues = pointMatrix.getLocalValuesDevice (Access::ReadOnly);
511  auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
512  auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
513 
514  row_map_type pointGColind("pointGColind", pointColind.extent(0));
515 
516  Kokkos::parallel_for("computePointGColind",range_type(0,pointColind.extent(0)),KOKKOS_LAMBDA(const LO i) {
517  pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
518  });
519 
520  row_map_type blockGColind("blockGColind", blockColind.extent(0));
521 
522  Kokkos::parallel_for("computeBlockGColind",range_type(0,blockGColind.extent(0)),KOKKOS_LAMBDA(const LO i) {
523  blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
524  });
525 
526  Kokkos::parallel_for("copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
527  const offset_type blkBeg = blockRowptr[i];
528  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
529 
530  for (offset_type point_i=0; point_i < pointRowptr[i*blockSize + 1] - pointRowptr[i*blockSize]; point_i++) {
531 
532  offset_type block_inv=static_cast<offset_type>(-1);
533  offset_type little_col_inv=static_cast<offset_type>(-1);
534  for (offset_type block_2=0; block_2 < numBlocks; block_2++) {
535  for (LO little_col_2=0; little_col_2 < blockSize; little_col_2++) {
536  if (blockGColind(blkBeg+block_2)*blockSize + little_col_2 == pointGColind(pointRowptr[i*blockSize] + point_i)) {
537  block_inv = block_2;
538  little_col_inv = little_col_2;
539  break;
540  }
541  }
542  if (block_inv!=static_cast<offset_type>(-1))
543  break;
544  }
545 
546  for(LO little_row=0; little_row<blockSize; little_row++) {
547  blockValues((blkBeg+block_inv) * bs2 + little_row * blockSize + little_col_inv) = pointValues[pointRowptr[i*blockSize+little_row] + point_i];
548  }
549  }
550  });
551  }
552  blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
553  Kokkos::DefaultExecutionSpace().fence();
554  }
555 
556  return blockMatrix;
557 
558  }
559 
560  template<class Scalar, class LO, class GO, class Node>
561  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
562  fillLogicalBlocks(const Tpetra::CrsMatrix<Scalar, LO, GO, Node>& pointMatrix, const LO &blockSize)
563  {
565  using execution_space = typename Node::execution_space;
566  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
567  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
568  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
569  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
570  using team_policy = Kokkos::TeamPolicy<execution_space>;
571  using member_type = typename team_policy::member_type;
572  using scratch_view = Kokkos::View<bool*, typename execution_space::scratch_memory_space>;
573  using Ordinal = typename dev_row_view_t::non_const_value_type;
574 
575  // Get structure / values
576  auto local = pointMatrix.getLocalMatrixDevice();
577  auto row_ptrs = local.graph.row_map;
578  auto col_inds = local.graph.entries;
579  auto values = local.values;
580  const auto nrows = pointMatrix.getLocalNumRows();
581 
582  //
583  // Populate all active blocks, they must be fully populated with entries so fill with zeroes
584  //
585 
586  // Make row workspace views
587  dev_row_view_t new_rowmap("new_rowmap", nrows+1);
588  const auto blocks_per_row = nrows / blockSize; // assumes square matrix
589  dev_row_view_t active_block_row_map("active_block_row_map", blocks_per_row + 1);
590  const int max_threads = execution_space::concurrency();
591  assert(blockSize > 1);
592  assert(nrows % blockSize == 0);
593  const int mem_level = 1;
594  const int bytes = scratch_view::shmem_size(blocks_per_row);
595 
596  if (max_threads >= blockSize) {
597  // Prefer 1 team per block since this will require a lot less scratch memory
598  team_policy tp(blocks_per_row, blockSize);
599 
600  // Count active blocks
601  Kokkos::parallel_for("countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
602  Ordinal block_row = team.league_rank();
603 
604  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
605  Kokkos::single(
606  Kokkos::PerTeam(team), [&] () {
607  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
608  row_block_active(row_block_idx) = false;
609  }
610  });
611  team.team_barrier();
612 
613  // All threads in a team scan a blocks-worth of rows to see which
614  // blocks are active
615  Kokkos::parallel_for(
616  Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
617 
618  Ordinal row = block_row*blockSize + block_offset;
619  Ordinal row_itr = row_ptrs(row);
620  Ordinal row_end = row_ptrs(row+1);
621 
622  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
623  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
624  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
625  Ordinal curr_nnz_col = col_inds(row_itr);
626  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
627  // This block has at least one nnz in this row
628  row_block_active(row_block_idx) = true;
629  ++row_itr;
630  if (row_itr == row_end) break;
631  curr_nnz_col = col_inds(row_itr);
632  }
633  }
634  });
635 
636  team.team_barrier();
637 
638  Kokkos::single(
639  Kokkos::PerTeam(team), [&] () {
640  Ordinal count = 0;
641  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
642  if (row_block_active(row_block_idx)) {
643  ++count;
644  }
645  }
646  active_block_row_map(block_row) = count;
647  });
648  });
649  }
650  else {
651  // We don't have enough parallelism to make a thread team handle a block, so just
652  // have 1 thread per block
653  team_policy tp(blocks_per_row, 1);
654 
655  // Count active blocks
656  Kokkos::parallel_for("countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
657  Ordinal block_row = team.league_rank();
658 
659  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
660  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
661  row_block_active(row_block_idx) = false;
662  }
663 
664  // One thread scans a blocks-worth of rows to see which blocks are active
665  for (int block_offset=0; block_offset < blockSize; ++block_offset) {
666  Ordinal row = block_row*blockSize + block_offset;
667  Ordinal row_itr = row_ptrs(row);
668  Ordinal row_end = row_ptrs(row+1);
669 
670  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
671  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
672  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
673  Ordinal curr_nnz_col = col_inds(row_itr);
674  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
675  // This block has at least one nnz in this row
676  row_block_active(row_block_idx) = true;
677  ++row_itr;
678  if (row_itr == row_end) break;
679  curr_nnz_col = col_inds(row_itr);
680  }
681  }
682  }
683 
684  Ordinal count = 0;
685  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
686  if (row_block_active(row_block_idx)) {
687  ++count;
688  }
689  }
690  active_block_row_map(block_row) = count;
691  });
692  }
693 
694  Ordinal nnz_block_count = 0;
695 #if KOKKOSKERNELS_VERSION >= 40199
696  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
697  execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
698 #else
699  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
700  dev_row_view_t, execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
701 #endif
702  dev_col_view_t block_col_ids("block_col_ids", nnz_block_count);
703 
704  // Find active blocks
705  if (max_threads >= blockSize) {
706  // Prefer 1 team per block since this will require a lot less scratch memory
707  team_policy tp(blocks_per_row, blockSize);
708 
709  Kokkos::parallel_for("findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
710  Ordinal block_row = team.league_rank();
711 
712  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
713  Kokkos::single(
714  Kokkos::PerTeam(team), [&] () {
715  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
716  row_block_active(row_block_idx) = false;
717  }
718  });
719  team.team_barrier();
720 
721  // All threads in a team scan a blocks-worth of rows to see which
722  // blocks are active
723  Kokkos::parallel_for(
724  Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
725 
726  Ordinal row = block_row*blockSize + block_offset;
727  Ordinal row_itr = row_ptrs(row);
728  Ordinal row_end = row_ptrs(row+1);
729 
730  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
731  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
732  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
733  Ordinal curr_nnz_col = col_inds(row_itr);
734  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
735  // This block has at least one nnz in this row
736  row_block_active(row_block_idx) = true;
737  ++row_itr;
738  if (row_itr == row_end) break;
739  curr_nnz_col = col_inds(row_itr);
740  }
741  }
742  });
743 
744  team.team_barrier();
745 
746  Kokkos::single(
747  Kokkos::PerTeam(team), [&] () {
748  Ordinal offset = active_block_row_map[block_row];
749  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
750  if (row_block_active(row_block_idx)) {
751  block_col_ids(offset) = row_block_idx;
752  ++offset;
753  }
754  }
755  });
756  });
757  }
758  else {
759  team_policy tp(blocks_per_row, 1);
760 
761  Kokkos::parallel_for("findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(const member_type& team) {
762  Ordinal block_row = team.league_rank();
763 
764  scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
765  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
766  row_block_active(row_block_idx) = false;
767  }
768 
769  // One thread scans a blocks-worth of rows to see which blocks are active
770  for (int block_offset=0; block_offset < blockSize; ++block_offset) {
771  Ordinal row = block_row*blockSize + block_offset;
772  Ordinal row_itr = row_ptrs(row);
773  Ordinal row_end = row_ptrs(row+1);
774 
775  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
776  const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
777  const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
778  Ordinal curr_nnz_col = col_inds(row_itr);
779  while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
780  // This block has at least one nnz in this row
781  row_block_active(row_block_idx) = true;
782  ++row_itr;
783  if (row_itr == row_end) break;
784  curr_nnz_col = col_inds(row_itr);
785  }
786  }
787  }
788 
789  Ordinal offset = active_block_row_map[block_row];
790  for (size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
791  if (row_block_active(row_block_idx)) {
792  block_col_ids(offset) = row_block_idx;
793  ++offset;
794  }
795  }
796  });
797  }
798 
799  // Sizing
800  Kokkos::parallel_for("sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
801  const auto block_row = row / blockSize;
802  const Ordinal block_row_begin = active_block_row_map(block_row);
803  const Ordinal block_row_end = active_block_row_map(block_row+1);
804  const Ordinal row_nnz = (block_row_end - block_row_begin) * blockSize;
805  new_rowmap(row) = row_nnz;
806  });
807 
808  Ordinal new_nnz_count = 0;
809 #if KOKKOSKERNELS_VERSION >= 40199
810  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
811  execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
812 #else
813  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
814  dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
815 #endif
816  // Now populate cols and vals
817  dev_col_view_t new_col_ids("new_col_ids", new_nnz_count);
818  dev_val_view_t new_vals("new_vals", new_nnz_count);
819  Kokkos::parallel_for("entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
820  Ordinal row_itr = row_ptrs(row);
821  Ordinal row_end = row_ptrs(row+1);
822  Ordinal row_itr_new = new_rowmap(row);
823 
824  Ordinal block_row = row / blockSize;
825  Ordinal block_row_begin = active_block_row_map(block_row);
826  Ordinal block_row_end = active_block_row_map(block_row+1);
827 
828  for (Ordinal row_block_idx = block_row_begin; row_block_idx < block_row_end; ++row_block_idx) {
829  const Ordinal block_col = block_col_ids(row_block_idx);
830  const Ordinal first_possible_col_in_block = block_col * blockSize;
831  const Ordinal first_possible_col_in_next_block = (block_col+1) * blockSize;
832  for (Ordinal possible_col = first_possible_col_in_block; possible_col < first_possible_col_in_next_block; ++possible_col, ++row_itr_new) {
833  new_col_ids(row_itr_new) = possible_col;
834  Ordinal curr_nnz_col = col_inds(row_itr);
835  if (possible_col == curr_nnz_col && row_itr < row_end) {
836  // Already a non-zero entry
837  new_vals(row_itr_new) = values(row_itr);
838  ++row_itr;
839  }
840  }
841  }
842  });
843 
844  // Create new, filled CRS
845  auto crs_row_map = pointMatrix.getRowMap();
846  auto crs_col_map = pointMatrix.getColMap();
847  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
848  result->fillComplete();
849  return result;
850  }
851 
852  template<class Scalar, class LO, class GO, class Node>
853  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
855  {
857  using dev_row_view_t = typename crs_t::local_graph_device_type::row_map_type::non_const_type;
858  using dev_col_view_t = typename crs_t::local_graph_device_type::entries_type::non_const_type;
859  using dev_val_view_t = typename crs_t::local_matrix_device_type::values_type::non_const_type;
860  using impl_scalar_t = typename Kokkos::ArithTraits<Scalar>::val_type;
861  using STS = Kokkos::ArithTraits<impl_scalar_t>;
862  using Ordinal = typename dev_row_view_t::non_const_value_type;
863  using execution_space = typename Node::execution_space;
864  using range_type = Kokkos::RangePolicy<execution_space, size_t>;
865 
866  // Get structure / values
867  auto local = pointMatrix.getLocalMatrixHost();
868  auto row_ptrs = local.graph.row_map;
869  auto col_inds = local.graph.entries;
870  auto values = local.values;
871  const auto nrows = pointMatrix.getLocalNumRows();
872 
873  // Sizing and rows
874  dev_row_view_t new_rowmap("new_rowmap", nrows+1);
875  const impl_scalar_t zero = STS::zero();
876  Kokkos::parallel_for("sizing", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
877  const Ordinal row_nnz_begin = row_ptrs(row);
878  Ordinal row_nnzs = 0;
879  for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row+1); ++row_nnz) {
880  const impl_scalar_t value = values(row_nnz);
881  if (value != zero) {
882  ++row_nnzs;
883  }
884  }
885  new_rowmap(row) = row_nnzs;
886  });
887 
888  Ordinal real_nnzs = 0;
889 #if KOKKOSKERNELS_VERSION >= 40199
890  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
891  execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
892 #else
893  KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
894  dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
895 #endif
896  // Now populate cols and vals
897  dev_col_view_t new_col_ids("new_col_ids", real_nnzs);
898  dev_val_view_t new_vals("new_vals", real_nnzs);
899  Kokkos::parallel_for("entries", range_type(0, nrows), KOKKOS_LAMBDA(const size_t row) {
900  Ordinal row_nnzs = 0;
901  const Ordinal old_row_nnz_begin = row_ptrs(row);
902  const Ordinal new_row_nnz_begin = new_rowmap(row);
903  for (Ordinal old_row_nnz = old_row_nnz_begin; old_row_nnz < row_ptrs(row+1); ++old_row_nnz) {
904  const impl_scalar_t value = values(old_row_nnz);
905  if (value != zero) {
906  new_col_ids(new_row_nnz_begin + row_nnzs) = col_inds(old_row_nnz);
907  new_vals(new_row_nnz_begin + row_nnzs) = value;
908  ++row_nnzs;
909  }
910  }
911  });
912 
913  // Create new, unfilled CRS
914  auto crs_row_map = pointMatrix.getRowMap();
915  auto crs_col_map = pointMatrix.getColMap();
916  Teuchos::RCP<crs_t> result = Teuchos::rcp(new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
917  result->fillComplete();
918  return result;
919  }
920 
921  template<class LO, class GO, class Node>
922  Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
923  createMeshMap (const LO& blockSize, const Tpetra::Map<LO,GO,Node>& pointMap)
924  {
925  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
926  typedef Tpetra::Map<LO,GO,Node> map_type;
927 
928  //calculate mesh GIDs
929  Teuchos::ArrayView<const GO> pointGids = pointMap.getLocalElementList();
930  Teuchos::Array<GO> meshGids;
931  GO indexBase = pointMap.getIndexBase();
932 
933  // Use hash table to track whether we've encountered this GID previously. This will happen
934  // when striding through the point DOFs in a block. It should not happen otherwise.
935  // I don't use sort/make unique because I don't want to change the ordering.
936  meshGids.reserve(pointGids.size());
937  Tpetra::Details::HashTable<GO,int> hashTable(pointGids.size());
938  for (int i=0; i<pointGids.size(); ++i) {
939  GO meshGid = (pointGids[i]-indexBase) / blockSize + indexBase;
940  if (hashTable.get(meshGid) == -1) {
941  hashTable.add(meshGid,1); //(key,value)
942  meshGids.push_back(meshGid);
943  }
944  }
945 
946  Teuchos::RCP<const map_type> meshMap = Teuchos::rcp( new map_type(TOT::invalid(), meshGids(), 0, pointMap.getComm()) );
947  return meshMap;
948 
949  }
950 
951 
952  template<class LO, class GO, class Node>
953  Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
954  createPointMap (const LO& blockSize, const Tpetra::Map<LO,GO,Node>& blockMap)
955  {
956  typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
957  typedef Tpetra::Map<LO,GO,Node> map_type;
958 
959  //calculate mesh GIDs
960  Teuchos::ArrayView<const GO> blockGids = blockMap.getLocalElementList();
961  Teuchos::Array<GO> pointGids(blockGids.size() * blockSize);
962  GO indexBase = blockMap.getIndexBase();
963 
964  for(LO i=0, ct=0; i<(LO)blockGids.size(); i++) {
965  GO base = (blockGids[i] - indexBase)* blockSize + indexBase;
966  for(LO j=0; j<blockSize; j++, ct++)
967  pointGids[i*blockSize+j] = base+j;
968  }
969 
970  Teuchos::RCP<const map_type> pointMap = Teuchos::rcp( new map_type(TOT::invalid(), pointGids(), 0, blockMap.getComm()) );
971  return pointMap;
972 
973  }
974 
975 
976  template<class Scalar, class LO, class GO, class Node>
977  Teuchos::RCP<CrsMatrix<Scalar, LO, GO, Node> >
979 
980  using Teuchos::Array;
981  using Teuchos::ArrayView;
982  using Teuchos::RCP;
983 
984  typedef Tpetra::BlockCrsMatrix<Scalar,LO,GO,Node> block_crs_matrix_type;
985  typedef Tpetra::Map<LO,GO,Node> map_type;
986  typedef Tpetra::CrsMatrix<Scalar, LO,GO,Node> crs_matrix_type;
987 
988  using crs_graph_type = typename block_crs_matrix_type::crs_graph_type;
989  using local_graph_device_type = typename crs_matrix_type::local_graph_device_type;
990  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
991  using row_map_type = typename local_graph_device_type::row_map_type::non_const_type;
992  using entries_type = typename local_graph_device_type::entries_type::non_const_type;
993  using values_type = typename local_matrix_device_type::values_type::non_const_type;
994 
995  using row_map_type_const = typename local_graph_device_type::row_map_type;
996  using entries_type_const = typename local_graph_device_type::entries_type;
997 
998  using little_block_type = typename block_crs_matrix_type::const_little_block_type;
999  using offset_type = typename row_map_type::non_const_value_type;
1000  using column_type = typename entries_type::non_const_value_type;
1001 
1002  using execution_space = typename Node::execution_space;
1003  using range_type = Kokkos::RangePolicy<execution_space, LO>;
1004 
1005 
1006  LO blocksize = blockMatrix.getBlockSize();
1007  const offset_type bs2 = blocksize * blocksize;
1008  size_t block_nnz = blockMatrix.getLocalNumEntries();
1009  size_t point_nnz = block_nnz * bs2;
1010 
1011  // We can get these from the blockMatrix directly
1012  RCP<const map_type> pointDomainMap = blockMatrix.getDomainMap();
1013  RCP<const map_type> pointRangeMap = blockMatrix.getRangeMap();
1014 
1015  // We need to generate the row/col Map ourselves.
1016  RCP<const map_type> blockRowMap = blockMatrix.getRowMap();
1017  RCP<const map_type> pointRowMap = createPointMap<LO,GO,Node>(blocksize, *blockRowMap);
1018 
1019  RCP<const map_type> blockColMap = blockMatrix.getColMap();
1020  RCP<const map_type> pointColMap = createPointMap<LO,GO,Node>(blocksize, *blockColMap);
1021 
1022  // Get the last few things
1023 
1024  const crs_graph_type & blockGraph = blockMatrix.getCrsGraph();
1025  LO point_rows = (LO) pointRowMap->getLocalNumElements();
1026  LO block_rows = (LO) blockRowMap->getLocalNumElements();
1027  auto blockValues = blockMatrix.getValuesDevice();
1028  auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1029  row_map_type_const blockRowptr = blockLocalGraph.row_map;
1030  entries_type_const blockColind = blockLocalGraph.entries;
1031 
1032  // Generate the point matrix rowptr / colind / values
1033  row_map_type rowptr("row_map", point_rows+1);
1034  entries_type colind("entries", point_nnz);
1035  values_type values("values", point_nnz);
1036 
1037  // Pre-fill the rowptr
1038  Kokkos::parallel_for("fillRowPtr",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
1039  offset_type base = blockRowptr[i];
1040  offset_type diff = blockRowptr[i+1] - base;
1041  for(LO j=0; j<blocksize; j++) {
1042  rowptr[i*blocksize +j] = base*bs2 + j*diff*blocksize;
1043  }
1044 
1045  // Fill the last guy, if we're on the final entry
1046  if(i==block_rows-1) {
1047  rowptr[block_rows*blocksize] = blockRowptr[i+1] * bs2;
1048  }
1049  });
1050 
1051 
1052  Kokkos::parallel_for("copyEntriesAndValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) {
1053  const offset_type blkBeg = blockRowptr[i];
1054  const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
1055 
1056  // For each block in the row...
1057  for (offset_type block=0; block < numBlocks; block++) {
1058  column_type point_col_base = blockColind[blkBeg + block] * blocksize;
1059  little_block_type my_block(blockValues.data () + (blkBeg+block) * bs2, blocksize, blocksize);
1060 
1061  // For each entry in the block...
1062  for(LO little_row=0; little_row<blocksize; little_row++) {
1063  offset_type point_row_offset = rowptr[i*blocksize + little_row];
1064  for(LO little_col=0; little_col<blocksize; little_col++) {
1065  colind[point_row_offset + block*blocksize + little_col] = point_col_base + little_col;
1066  values[point_row_offset + block*blocksize + little_col] = my_block(little_row,little_col);
1067  }
1068  }
1069 
1070  }
1071  });
1072 
1073  // Generate the matrix
1074  RCP<crs_matrix_type> pointCrsMatrix = rcp(new crs_matrix_type(pointRowMap, pointColMap, 0));
1075  pointCrsMatrix->setAllValues(rowptr,colind,values);
1076 
1077  // FUTURE OPTIMIZATION: Directly compute import/export, rather than letting ESFC do it
1078  pointCrsMatrix->expertStaticFillComplete(pointDomainMap,pointRangeMap);
1079 
1080  return pointCrsMatrix;
1081  }
1082 
1083 
1084 } // namespace Tpetra
1085 
1086 //
1087 // Explicit instantiation macro for blockCrsMatrixWriter (various
1088 // overloads), writeMatrixStrip, and convertToBlockCrsMatrix.
1089 //
1090 // Must be expanded from within the Tpetra namespace!
1091 //
1092 #define TPETRA_BLOCKCRSMATRIX_HELPERS_INSTANT(S,LO,GO,NODE) \
1093  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName); \
1094  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName, Teuchos::ParameterList const &params); \
1095  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os); \
1096  template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1097  template void writeMatrixStrip(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const &params); \
1098  template Teuchos::RCP<BlockCrsMatrix<S, LO, GO, NODE> > convertToBlockCrsMatrix(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize, bool use_LID); \
1099  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > fillLogicalBlocks(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize); \
1100  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > unfillFormerBlockCrs(const CrsMatrix<S, LO, GO, NODE>& pointMatrix); \
1101  template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > convertToCrsMatrix(const BlockCrsMatrix<S, LO, GO, NODE>& blockMatrix);
1102 
1103 //
1104 // Explicit instantiation macro for createMeshMap / createPointMap
1105 //
1106 #define TPETRA_CREATEMESHMAP_INSTANT(LO,GO,NODE) \
1107  template Teuchos::RCP<const Map<LO,GO,NODE> > createMeshMap (const LO& blockSize, const Map<LO,GO,NODE>& pointMap); \
1108  template Teuchos::RCP<const Map<LO,GO,NODE> > createPointMap (const LO& blockSize, const Map<LO,GO,NODE>& blockMap);
1109 
1110 #endif // TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createPointMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &blockMap)
Helper function to generate a point map from a block map (with a given block size) GIDs associated wi...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void blockCrsMatrixWriter(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::string const &fileName)
Helper function to write a BlockCrsMatrix. Calls the 3-argument version.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > unfillFormerBlockCrs(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix)
Unfill all point entries in a logical block of a CrsMatrix with zeroes. This can be called after conv...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, over all processes in the graph&#39;s communicator...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix&#39;s graph, as a CrsGraph.
size_t getLocalNumRows() const override
get the local number of block rows
void writeMatrixStrip(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::ostream &os, Teuchos::ParameterList const &params)
Helper function called by blockCrsMatrixWriter.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createMeshMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &pointMap)
Helper function to generate a mesh map from a point map. Important! It&#39;s assumed that point GIDs asso...
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
Teuchos::ArrayView< const global_ordinal_type > getLocalElementList() const
Return a NONOWNING view of the global indices owned by this process.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
Insert new values that don&#39;t currently exist.
global_ordinal_type getIndexBase() const
The index base for this Map.
global_size_t getGlobalNumRows() const override
get the global number of block rows
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > fillLogicalBlocks(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize)
Fill all point entries in a logical block of a CrsMatrix with zeroes. This should be called before co...
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LO, GO, Node > > convertToCrsMatrix(const Tpetra::BlockCrsMatrix< Scalar, LO, GO, Node > &blockMatrix)
Non-member constructor that creates a point CrsMatrix from an existing BlockCrsMatrix.
Teuchos::RCP< CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > createCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, size_t maxNumEntriesPerRow=0, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember function to create an empty CrsGraph given a row Map and the max number of entries allowed ...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< Tpetra::CrsGraph< LO, GO, Node > > getBlockCrsGraph(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates the CrsGraph of a BlockCrsMatrix from an existing point CrsMatrix...
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > convertToBlockCrsMatrix(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.