40 #ifndef TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
41 #define TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
45 #include "Tpetra_BlockCrsMatrix.hpp"
46 #include "Tpetra_CrsMatrix.hpp"
47 #include "Tpetra_HashTable.hpp"
48 #include "Tpetra_Import.hpp"
49 #include "Tpetra_Map.hpp"
50 #include "Tpetra_MultiVector.hpp"
51 #include "Teuchos_ParameterList.hpp"
52 #include "Teuchos_ScalarTraits.hpp"
58 template<
class Scalar,
class LO,
class GO,
class Node>
60 Teuchos::ParameterList pl;
62 out.open(fileName.c_str());
66 template<
class Scalar,
class LO,
class GO,
class Node>
69 out.open(fileName.c_str());
73 template<
class Scalar,
class LO,
class GO,
class Node>
75 Teuchos::ParameterList pl;
79 template<
class Scalar,
class LO,
class GO,
class Node>
85 typedef Teuchos::OrdinalTraits<GO> TOT;
92 RCP<const map_type>
const rowMap = A.
getRowMap();
93 RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
94 const int myRank = comm->getRank();
95 const size_t numProcs = comm->getSize();
98 bool alwaysUseParallelAlgorithm =
false;
99 if (params.isParameter(
"always use parallel algorithm"))
100 alwaysUseParallelAlgorithm = params.get<
bool>(
"always use parallel algorithm");
101 bool printMatrixMarketHeader =
true;
102 if (params.isParameter(
"print MatrixMarket header"))
103 printMatrixMarketHeader = params.get<
bool>(
"print MatrixMarket header");
105 if (printMatrixMarketHeader && myRank==0) {
106 std::time_t now = std::time(NULL);
108 const std::string dataTypeStr =
109 Teuchos::ScalarTraits<Scalar>::isComplex ?
"complex" :
"real";
119 os <<
"%%MatrixMarket matrix coordinate " << dataTypeStr <<
" general" << std::endl;
120 os <<
"% time stamp: " << ctime(&now);
121 os <<
"% written from " << numProcs <<
" processes" << std::endl;
122 os <<
"% point representation of Tpetra::BlockCrsMatrix" << std::endl;
125 os <<
"% " << numRows <<
" block rows, " << numCols <<
" block columns" << std::endl;
127 os <<
"% block size " << blockSize << std::endl;
128 os << numRows*blockSize <<
" " << numCols*blockSize <<
" " << A.
getGlobalNumEntries()*blockSize*blockSize << std::endl;
131 if (numProcs==1 && !alwaysUseParallelAlgorithm) {
134 size_t numRows = rowMap->getLocalNumElements();
137 RCP<const map_type> allMeshGidsMap = rcp(
new map_type(TOT::invalid(), numRows, A.
getIndexBase(), comm));
140 mv_type allMeshGids(allMeshGidsMap,1);
141 Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
143 for (
size_t i=0; i<numRows; i++)
144 allMeshGidsData[i] = rowMap->getGlobalElement(i);
145 allMeshGidsData = Teuchos::null;
148 size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
149 size_t remainder = allMeshGids.getGlobalLength() % numProcs;
151 size_t curStripSize = 0;
152 Teuchos::Array<GO> importMeshGidList;
153 for (
size_t i=0; i<numProcs; i++) {
155 curStripSize = stripSize;
156 if (i<remainder) curStripSize++;
157 importMeshGidList.resize(curStripSize);
158 for (
size_t j=0; j<curStripSize; j++) importMeshGidList[j] = j + curStart + A.
getIndexBase();
159 curStart += curStripSize;
162 TEUCHOS_TEST_FOR_EXCEPTION(myRank>0 && curStripSize!=0,
163 std::runtime_error,
"Tpetra::blockCrsMatrixWriter: (pid "
164 << myRank <<
") map size should be zero, but is " << curStripSize);
165 RCP<map_type> importMeshGidMap = rcp(
new map_type(TOT::invalid(), importMeshGidList(), A.
getIndexBase(), comm));
166 import_type gidImporter(allMeshGidsMap, importMeshGidMap);
167 mv_type importMeshGids(importMeshGidMap, 1);
168 importMeshGids.doImport(allMeshGids, gidImporter,
INSERT);
174 Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
175 Teuchos::Array<GO> importMeshGidsGO;
176 importMeshGidsGO.reserve(importMeshGidsData.size());
177 for (
typename Teuchos::ArrayRCP<const GO>::size_type j=0; j<importMeshGidsData.size(); ++j)
178 importMeshGidsGO.push_back(importMeshGidsData[j]);
179 RCP<const map_type> importMap = rcp(
new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm) );
181 import_type importer(rowMap,importMap );
183 RCP<crs_graph_type> graph =
createCrsGraph(importMap,numEntriesPerRow);
184 RCP<const map_type> domainMap = A.getCrsGraph().
getDomainMap();
185 graph->doImport(A.getCrsGraph(), importer,
INSERT);
186 graph->fillComplete(domainMap, importMap);
188 block_crs_matrix_type importA(*graph, A.
getBlockSize());
189 importA.doImport(A, importer,
INSERT);
197 template<
class Scalar,
class LO,
class GO,
class Node>
202 using bcrs_local_inds_host_view_type =
typename bcrs_type::local_inds_host_view_type;
203 using bcrs_values_host_view_type =
typename bcrs_type::values_host_view_type;
204 using impl_scalar_type =
typename bcrs_type::impl_scalar_type;
207 RCP<const map_type> rowMap = A.
getRowMap();
208 RCP<const map_type> colMap = A.
getColMap();
209 RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
210 const int myRank = comm->getRank();
212 const size_t meshRowOffset = rowMap->getIndexBase();
213 const size_t meshColOffset = colMap->getIndexBase();
214 TEUCHOS_TEST_FOR_EXCEPTION(meshRowOffset != meshColOffset,
215 std::runtime_error,
"Tpetra::writeMatrixStrip: "
216 "mesh row index base != mesh column index base");
221 std::runtime_error,
"Tpetra::writeMatrixStrip: pid "
224 std::runtime_error,
"Tpetra::writeMatrixStrip: pid "
225 << myRank <<
" should have 0 columns but has " << A.
getLocalNumCols());
230 std::runtime_error,
"Tpetra::writeMatrixStrip: "
231 "number of rows on pid 0 does not match global number of rows");
237 bool precisionChanged=
false;
238 int oldPrecision = 0;
239 if (params.isParameter(
"precision")) {
240 oldPrecision = os.precision(params.get<
int>(
"precision"));
241 precisionChanged=
true;
244 if (params.isParameter(
"zero-based indexing")) {
245 if (params.get<
bool>(
"zero-based indexing") ==
true)
250 for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
253 bcrs_local_inds_host_view_type localColInds;
254 bcrs_values_host_view_type vals;
256 A.
getLocalRowView (localRowInd, localColInds, vals); numEntries = localColInds.extent(0);
257 GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
259 for (LO k = 0; k < numEntries; ++k) {
260 GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
261 const impl_scalar_type* curBlock = vals.data() + blockSize * blockSize * k;
263 for (LO j = 0; j < blockSize; ++j) {
264 GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
265 for (LO i = 0; i < blockSize; ++i) {
266 GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
267 const impl_scalar_type curVal = curBlock[i + j * blockSize];
269 os << globalPointRowID <<
" " << globalPointColID <<
" ";
270 if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
273 os << Teuchos::ScalarTraits<impl_scalar_type>::real (curVal) <<
" "
274 << Teuchos::ScalarTraits<impl_scalar_type>::imag (curVal);
284 if (precisionChanged)
285 os.precision(oldPrecision);
286 TEUCHOS_TEST_FOR_EXCEPTION(err != 0,
287 std::runtime_error,
"Tpetra::writeMatrixStrip: "
288 "error getting view of local row " << localRowInd);
294 template<
class Scalar,
class LO,
class GO,
class Node>
295 Teuchos::RCP<Tpetra::CrsGraph<LO, GO, Node> >
307 using Teuchos::Array;
308 using Teuchos::ArrayView;
315 using local_graph_device_type =
typename crs_matrix_type::local_graph_device_type;
316 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
317 using entries_type =
typename local_graph_device_type::entries_type::non_const_type;
319 using offset_type =
typename row_map_type::non_const_value_type;
321 using execution_space =
typename Node::execution_space;
322 using range_type = Kokkos::RangePolicy<execution_space, LO>;
324 const map_type &pointRowMap = *(pointMatrix.
getRowMap());
325 RCP<const map_type> meshRowMap = createMeshMap<LO,GO,Node>(blockSize, pointRowMap);
327 const map_type &pointColMap = *(pointMatrix.
getColMap());
328 RCP<const map_type> meshColMap = createMeshMap<LO,GO,Node>(blockSize, pointColMap);
329 if(meshColMap.is_null())
throw std::runtime_error(
"ERROR: Cannot create mesh colmap");
331 auto localMeshColMap = meshColMap->getLocalMap();
332 auto localPointColMap = pointColMap.getLocalMap();
333 auto localPointRowMap = pointRowMap.getLocalMap();
335 const map_type &pointDomainMap = *(pointMatrix.
getDomainMap());
336 RCP<const map_type> meshDomainMap = createMeshMap<LO,GO,Node>(blockSize, pointDomainMap);
338 const map_type &pointRangeMap = *(pointMatrix.
getRangeMap());
339 RCP<const map_type> meshRangeMap = createMeshMap<LO,GO,Node>(blockSize, pointRangeMap);
341 RCP<crs_graph_type> meshCrsGraph;
343 const offset_type bs2 = blockSize * blockSize;
346 TEUCHOS_FUNC_TIME_MONITOR(
"Tpetra::convertToBlockCrsMatrix::fillCrsGraph");
347 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
348 auto pointRowptr = pointLocalGraph.row_map;
349 auto pointColind = pointLocalGraph.entries;
351 TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
352 std::runtime_error,
"Tpetra::getBlockCrsGraph: "
353 "local number of non zero entries is not a multiple of blockSize^2 ");
355 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
356 row_map_type blockRowptr(
"blockRowptr", block_rows+1);
357 entries_type blockColind(
"blockColind", pointColind.extent(0)/(bs2));
359 Kokkos::parallel_for(
"fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(
const LO i) {
361 const LO offset_b = pointRowptr(i*blockSize)/bs2;
362 const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
365 blockRowptr(i+1) = offset_b_max;
366 blockRowptr(i) = offset_b;
368 const LO offset_p = pointRowptr(i*blockSize);
370 for (LO k=0; k<offset_b_max-offset_b; ++k) {
371 blockColind(offset_b + k) = pointColind(offset_p + k * blockSize)/blockSize;
375 meshCrsGraph = rcp(
new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
376 meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
377 Kokkos::DefaultExecutionSpace().fence();
380 TEUCHOS_FUNC_TIME_MONITOR(
"Tpetra::convertToBlockCrsMatrix::fillCrsGraph");
381 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
382 auto pointRowptr = pointLocalGraph.row_map;
383 auto pointColind = pointLocalGraph.entries;
385 TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
386 std::runtime_error,
"Tpetra::getBlockCrsGraph: "
387 "local number of non zero entries is not a multiple of blockSize^2 ");
389 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
390 row_map_type blockRowptr(
"blockRowptr", block_rows+1);
391 entries_type blockColind(
"blockColind", pointColind.extent(0)/(bs2));
393 Kokkos::parallel_for(
"fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(
const LO i) {
395 const LO offset_b = pointRowptr(i*blockSize)/bs2;
396 const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
399 blockRowptr(i+1) = offset_b_max;
400 blockRowptr(i) = offset_b;
402 const LO offset_p = pointRowptr(i*blockSize);
403 const LO offset_p_max = pointRowptr((i+1)*blockSize);
406 for (LO p_i=0; p_i<offset_p_max-offset_p; ++p_i) {
407 auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i))/blockSize;
408 auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
410 bool visited =
false;
411 for (LO k=0; k<filled_block; ++k) {
412 if (blockColind(offset_b + k) == bcol_LID)
416 blockColind(offset_b + filled_block) = bcol_LID;
422 meshCrsGraph = rcp(
new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
423 meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
424 Kokkos::DefaultExecutionSpace().fence();
430 template<
class Scalar,
class LO,
class GO,
class Node>
431 Teuchos::RCP<BlockCrsMatrix<Scalar, LO, GO, Node> >
443 using Teuchos::Array;
444 using Teuchos::ArrayView;
450 using local_graph_device_type =
typename crs_matrix_type::local_graph_device_type;
451 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
452 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
453 using values_type =
typename local_matrix_device_type::values_type::non_const_type;
455 using offset_type =
typename row_map_type::non_const_value_type;
457 using execution_space =
typename Node::execution_space;
458 using range_type = Kokkos::RangePolicy<execution_space, LO>;
460 RCP<block_crs_matrix_type> blockMatrix;
462 const offset_type bs2 = blockSize * blockSize;
467 TEUCHOS_FUNC_TIME_MONITOR(
"Tpetra::convertToBlockCrsMatrix::fillBlockCrsMatrix");
468 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
469 auto pointRowptr = pointLocalGraph.row_map;
470 auto pointColind = pointLocalGraph.entries;
472 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
473 values_type blockValues(
"values", meshCrsGraph->getLocalNumEntries()*bs2);
475 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
477 Kokkos::parallel_for(
"copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
478 const offset_type blkBeg = blockRowptr[i];
479 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
482 for (offset_type block=0; block < numBlocks; block++) {
485 for(LO little_row=0; little_row<blockSize; little_row++) {
486 offset_type point_row_offset = pointRowptr[i*blockSize + little_row];
487 for(LO little_col=0; little_col<blockSize; little_col++) {
488 blockValues((blkBeg+block) * bs2 + little_row * blockSize + little_col) =
489 pointValues[point_row_offset + block*blockSize + little_col];
495 blockMatrix = rcp(
new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
496 Kokkos::DefaultExecutionSpace().fence();
499 TEUCHOS_FUNC_TIME_MONITOR(
"Tpetra::convertToBlockCrsMatrix::fillBlockCrsMatrix");
500 auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
501 auto localPointColMap = pointMatrix.
getColMap()->getLocalMap();
503 values_type blockValues(
"values", meshCrsGraph->getLocalNumEntries()*bs2);
505 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
506 auto pointRowptr = pointLocalGraph.row_map;
507 auto pointColind = pointLocalGraph.entries;
509 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
511 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
512 auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
514 row_map_type pointGColind(
"pointGColind", pointColind.extent(0));
516 Kokkos::parallel_for(
"computePointGColind",range_type(0,pointColind.extent(0)),KOKKOS_LAMBDA(
const LO i) {
517 pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
520 row_map_type blockGColind(
"blockGColind", blockColind.extent(0));
522 Kokkos::parallel_for(
"computeBlockGColind",range_type(0,blockGColind.extent(0)),KOKKOS_LAMBDA(
const LO i) {
523 blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
526 Kokkos::parallel_for(
"copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
527 const offset_type blkBeg = blockRowptr[i];
528 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
530 for (offset_type point_i=0; point_i < pointRowptr[i*blockSize + 1] - pointRowptr[i*blockSize]; point_i++) {
532 offset_type block_inv=
static_cast<offset_type
>(-1);
533 offset_type little_col_inv=
static_cast<offset_type
>(-1);
534 for (offset_type block_2=0; block_2 < numBlocks; block_2++) {
535 for (LO little_col_2=0; little_col_2 < blockSize; little_col_2++) {
536 if (blockGColind(blkBeg+block_2)*blockSize + little_col_2 == pointGColind(pointRowptr[i*blockSize] + point_i)) {
538 little_col_inv = little_col_2;
542 if (block_inv!=static_cast<offset_type>(-1))
546 for(LO little_row=0; little_row<blockSize; little_row++) {
547 blockValues((blkBeg+block_inv) * bs2 + little_row * blockSize + little_col_inv) = pointValues[pointRowptr[i*blockSize+little_row] + point_i];
552 blockMatrix = rcp(
new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
553 Kokkos::DefaultExecutionSpace().fence();
560 template<
class Scalar,
class LO,
class GO,
class Node>
561 Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
565 using execution_space =
typename Node::execution_space;
566 using dev_row_view_t =
typename crs_t::local_graph_device_type::row_map_type::non_const_type;
567 using dev_col_view_t =
typename crs_t::local_graph_device_type::entries_type::non_const_type;
568 using dev_val_view_t =
typename crs_t::local_matrix_device_type::values_type::non_const_type;
569 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
570 using team_policy = Kokkos::TeamPolicy<execution_space>;
571 using member_type =
typename team_policy::member_type;
572 using scratch_view = Kokkos::View<bool*, typename execution_space::scratch_memory_space>;
573 using Ordinal =
typename dev_row_view_t::non_const_value_type;
577 auto row_ptrs = local.graph.row_map;
578 auto col_inds = local.graph.entries;
579 auto values = local.values;
587 dev_row_view_t new_rowmap(
"new_rowmap", nrows+1);
588 const auto blocks_per_row = nrows / blockSize;
589 dev_row_view_t active_block_row_map(
"active_block_row_map", blocks_per_row + 1);
590 const int max_threads = execution_space::concurrency();
591 assert(blockSize > 1);
592 assert(nrows % blockSize == 0);
593 const int mem_level = 1;
594 const int bytes = scratch_view::shmem_size(blocks_per_row);
596 if (max_threads >= blockSize) {
598 team_policy tp(blocks_per_row, blockSize);
601 Kokkos::parallel_for(
"countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
602 Ordinal block_row = team.league_rank();
604 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
606 Kokkos::PerTeam(team), [&] () {
607 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
608 row_block_active(row_block_idx) =
false;
615 Kokkos::parallel_for(
616 Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
618 Ordinal row = block_row*blockSize + block_offset;
619 Ordinal row_itr = row_ptrs(row);
620 Ordinal row_end = row_ptrs(row+1);
622 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
623 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
624 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
625 Ordinal curr_nnz_col = col_inds(row_itr);
626 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
628 row_block_active(row_block_idx) =
true;
630 if (row_itr == row_end)
break;
631 curr_nnz_col = col_inds(row_itr);
639 Kokkos::PerTeam(team), [&] () {
641 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
642 if (row_block_active(row_block_idx)) {
646 active_block_row_map(block_row) = count;
653 team_policy tp(blocks_per_row, 1);
656 Kokkos::parallel_for(
"countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
657 Ordinal block_row = team.league_rank();
659 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
660 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
661 row_block_active(row_block_idx) =
false;
665 for (
int block_offset=0; block_offset < blockSize; ++block_offset) {
666 Ordinal row = block_row*blockSize + block_offset;
667 Ordinal row_itr = row_ptrs(row);
668 Ordinal row_end = row_ptrs(row+1);
670 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
671 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
672 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
673 Ordinal curr_nnz_col = col_inds(row_itr);
674 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
676 row_block_active(row_block_idx) =
true;
678 if (row_itr == row_end)
break;
679 curr_nnz_col = col_inds(row_itr);
685 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
686 if (row_block_active(row_block_idx)) {
690 active_block_row_map(block_row) = count;
694 Ordinal nnz_block_count = 0;
695 #if KOKKOSKERNELS_VERSION >= 40199
696 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
697 execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
699 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
700 dev_row_view_t, execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
702 dev_col_view_t block_col_ids(
"block_col_ids", nnz_block_count);
705 if (max_threads >= blockSize) {
707 team_policy tp(blocks_per_row, blockSize);
709 Kokkos::parallel_for(
"findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
710 Ordinal block_row = team.league_rank();
712 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
714 Kokkos::PerTeam(team), [&] () {
715 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
716 row_block_active(row_block_idx) =
false;
723 Kokkos::parallel_for(
724 Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
726 Ordinal row = block_row*blockSize + block_offset;
727 Ordinal row_itr = row_ptrs(row);
728 Ordinal row_end = row_ptrs(row+1);
730 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
731 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
732 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
733 Ordinal curr_nnz_col = col_inds(row_itr);
734 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
736 row_block_active(row_block_idx) =
true;
738 if (row_itr == row_end)
break;
739 curr_nnz_col = col_inds(row_itr);
747 Kokkos::PerTeam(team), [&] () {
748 Ordinal offset = active_block_row_map[block_row];
749 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
750 if (row_block_active(row_block_idx)) {
751 block_col_ids(offset) = row_block_idx;
759 team_policy tp(blocks_per_row, 1);
761 Kokkos::parallel_for(
"findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
762 Ordinal block_row = team.league_rank();
764 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
765 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
766 row_block_active(row_block_idx) =
false;
770 for (
int block_offset=0; block_offset < blockSize; ++block_offset) {
771 Ordinal row = block_row*blockSize + block_offset;
772 Ordinal row_itr = row_ptrs(row);
773 Ordinal row_end = row_ptrs(row+1);
775 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
776 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
777 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
778 Ordinal curr_nnz_col = col_inds(row_itr);
779 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
781 row_block_active(row_block_idx) =
true;
783 if (row_itr == row_end)
break;
784 curr_nnz_col = col_inds(row_itr);
789 Ordinal offset = active_block_row_map[block_row];
790 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
791 if (row_block_active(row_block_idx)) {
792 block_col_ids(offset) = row_block_idx;
800 Kokkos::parallel_for(
"sizing", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
801 const auto block_row = row / blockSize;
802 const Ordinal block_row_begin = active_block_row_map(block_row);
803 const Ordinal block_row_end = active_block_row_map(block_row+1);
804 const Ordinal row_nnz = (block_row_end - block_row_begin) * blockSize;
805 new_rowmap(row) = row_nnz;
808 Ordinal new_nnz_count = 0;
809 #if KOKKOSKERNELS_VERSION >= 40199
810 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
811 execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
813 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
814 dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
817 dev_col_view_t new_col_ids(
"new_col_ids", new_nnz_count);
818 dev_val_view_t new_vals(
"new_vals", new_nnz_count);
819 Kokkos::parallel_for(
"entries", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
820 Ordinal row_itr = row_ptrs(row);
821 Ordinal row_end = row_ptrs(row+1);
822 Ordinal row_itr_new = new_rowmap(row);
824 Ordinal block_row = row / blockSize;
825 Ordinal block_row_begin = active_block_row_map(block_row);
826 Ordinal block_row_end = active_block_row_map(block_row+1);
828 for (Ordinal row_block_idx = block_row_begin; row_block_idx < block_row_end; ++row_block_idx) {
829 const Ordinal block_col = block_col_ids(row_block_idx);
830 const Ordinal first_possible_col_in_block = block_col * blockSize;
831 const Ordinal first_possible_col_in_next_block = (block_col+1) * blockSize;
832 for (Ordinal possible_col = first_possible_col_in_block; possible_col < first_possible_col_in_next_block; ++possible_col, ++row_itr_new) {
833 new_col_ids(row_itr_new) = possible_col;
834 Ordinal curr_nnz_col = col_inds(row_itr);
835 if (possible_col == curr_nnz_col && row_itr < row_end) {
837 new_vals(row_itr_new) = values(row_itr);
845 auto crs_row_map = pointMatrix.
getRowMap();
846 auto crs_col_map = pointMatrix.
getColMap();
847 Teuchos::RCP<crs_t> result = Teuchos::rcp(
new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
848 result->fillComplete();
852 template<
class Scalar,
class LO,
class GO,
class Node>
853 Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LO, GO, Node>>
857 using dev_row_view_t =
typename crs_t::local_graph_device_type::row_map_type::non_const_type;
858 using dev_col_view_t =
typename crs_t::local_graph_device_type::entries_type::non_const_type;
859 using dev_val_view_t =
typename crs_t::local_matrix_device_type::values_type::non_const_type;
860 using impl_scalar_t =
typename Kokkos::ArithTraits<Scalar>::val_type;
861 using STS = Kokkos::ArithTraits<impl_scalar_t>;
862 using Ordinal =
typename dev_row_view_t::non_const_value_type;
863 using execution_space =
typename Node::execution_space;
864 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
867 auto local = pointMatrix.getLocalMatrixHost();
868 auto row_ptrs = local.graph.row_map;
869 auto col_inds = local.graph.entries;
870 auto values = local.values;
874 dev_row_view_t new_rowmap(
"new_rowmap", nrows+1);
875 const impl_scalar_t zero = STS::zero();
876 Kokkos::parallel_for(
"sizing", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
877 const Ordinal row_nnz_begin = row_ptrs(row);
878 Ordinal row_nnzs = 0;
879 for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row+1); ++row_nnz) {
880 const impl_scalar_t value = values(row_nnz);
885 new_rowmap(row) = row_nnzs;
888 Ordinal real_nnzs = 0;
889 #if KOKKOSKERNELS_VERSION >= 40199
890 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
891 execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
893 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
894 dev_row_view_t, execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
897 dev_col_view_t new_col_ids(
"new_col_ids", real_nnzs);
898 dev_val_view_t new_vals(
"new_vals", real_nnzs);
899 Kokkos::parallel_for(
"entries", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
900 Ordinal row_nnzs = 0;
901 const Ordinal old_row_nnz_begin = row_ptrs(row);
902 const Ordinal new_row_nnz_begin = new_rowmap(row);
903 for (Ordinal old_row_nnz = old_row_nnz_begin; old_row_nnz < row_ptrs(row+1); ++old_row_nnz) {
904 const impl_scalar_t value = values(old_row_nnz);
906 new_col_ids(new_row_nnz_begin + row_nnzs) = col_inds(old_row_nnz);
907 new_vals(new_row_nnz_begin + row_nnzs) = value;
914 auto crs_row_map = pointMatrix.
getRowMap();
915 auto crs_col_map = pointMatrix.
getColMap();
916 Teuchos::RCP<crs_t> result = Teuchos::rcp(
new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
917 result->fillComplete();
921 template<
class LO,
class GO,
class Node>
922 Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
925 typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
930 Teuchos::Array<GO> meshGids;
936 meshGids.reserve(pointGids.size());
938 for (
int i=0; i<pointGids.size(); ++i) {
939 GO meshGid = (pointGids[i]-indexBase) / blockSize + indexBase;
940 if (hashTable.get(meshGid) == -1) {
941 hashTable.
add(meshGid,1);
942 meshGids.push_back(meshGid);
946 Teuchos::RCP<const map_type> meshMap = Teuchos::rcp(
new map_type(TOT::invalid(), meshGids(), 0, pointMap.
getComm()) );
952 template<
class LO,
class GO,
class Node>
953 Teuchos::RCP<const Tpetra::Map<LO,GO,Node> >
956 typedef Teuchos::OrdinalTraits<Tpetra::global_size_t> TOT;
961 Teuchos::Array<GO> pointGids(blockGids.size() * blockSize);
964 for(LO i=0, ct=0; i<(LO)blockGids.size(); i++) {
965 GO base = (blockGids[i] - indexBase)* blockSize + indexBase;
966 for(LO j=0; j<blockSize; j++, ct++)
967 pointGids[i*blockSize+j] = base+j;
970 Teuchos::RCP<const map_type> pointMap = Teuchos::rcp(
new map_type(TOT::invalid(), pointGids(), 0, blockMap.
getComm()) );
976 template<
class Scalar,
class LO,
class GO,
class Node>
977 Teuchos::RCP<CrsMatrix<Scalar, LO, GO, Node> >
980 using Teuchos::Array;
981 using Teuchos::ArrayView;
988 using crs_graph_type =
typename block_crs_matrix_type::crs_graph_type;
989 using local_graph_device_type =
typename crs_matrix_type::local_graph_device_type;
990 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
991 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
992 using entries_type =
typename local_graph_device_type::entries_type::non_const_type;
993 using values_type =
typename local_matrix_device_type::values_type::non_const_type;
995 using row_map_type_const =
typename local_graph_device_type::row_map_type;
996 using entries_type_const =
typename local_graph_device_type::entries_type;
998 using little_block_type =
typename block_crs_matrix_type::const_little_block_type;
999 using offset_type =
typename row_map_type::non_const_value_type;
1000 using column_type =
typename entries_type::non_const_value_type;
1002 using execution_space =
typename Node::execution_space;
1003 using range_type = Kokkos::RangePolicy<execution_space, LO>;
1007 const offset_type bs2 = blocksize * blocksize;
1009 size_t point_nnz = block_nnz * bs2;
1012 RCP<const map_type> pointDomainMap = blockMatrix.
getDomainMap();
1013 RCP<const map_type> pointRangeMap = blockMatrix.
getRangeMap();
1016 RCP<const map_type> blockRowMap = blockMatrix.
getRowMap();
1017 RCP<const map_type> pointRowMap = createPointMap<LO,GO,Node>(blocksize, *blockRowMap);
1019 RCP<const map_type> blockColMap = blockMatrix.
getColMap();
1020 RCP<const map_type> pointColMap = createPointMap<LO,GO,Node>(blocksize, *blockColMap);
1024 const crs_graph_type & blockGraph = blockMatrix.getCrsGraph();
1025 LO point_rows = (LO) pointRowMap->getLocalNumElements();
1026 LO block_rows = (LO) blockRowMap->getLocalNumElements();
1027 auto blockValues = blockMatrix.getValuesDevice();
1028 auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1029 row_map_type_const blockRowptr = blockLocalGraph.row_map;
1030 entries_type_const blockColind = blockLocalGraph.entries;
1033 row_map_type rowptr(
"row_map", point_rows+1);
1034 entries_type colind(
"entries", point_nnz);
1035 values_type values(
"values", point_nnz);
1038 Kokkos::parallel_for(
"fillRowPtr",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
1039 offset_type base = blockRowptr[i];
1040 offset_type diff = blockRowptr[i+1] - base;
1041 for(LO j=0; j<blocksize; j++) {
1042 rowptr[i*blocksize +j] = base*bs2 + j*diff*blocksize;
1046 if(i==block_rows-1) {
1047 rowptr[block_rows*blocksize] = blockRowptr[i+1] * bs2;
1052 Kokkos::parallel_for(
"copyEntriesAndValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
1053 const offset_type blkBeg = blockRowptr[i];
1054 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
1057 for (offset_type block=0; block < numBlocks; block++) {
1058 column_type point_col_base = blockColind[blkBeg + block] * blocksize;
1059 little_block_type my_block(blockValues.data () + (blkBeg+block) * bs2, blocksize, blocksize);
1062 for(LO little_row=0; little_row<blocksize; little_row++) {
1063 offset_type point_row_offset = rowptr[i*blocksize + little_row];
1064 for(LO little_col=0; little_col<blocksize; little_col++) {
1065 colind[point_row_offset + block*blocksize + little_col] = point_col_base + little_col;
1066 values[point_row_offset + block*blocksize + little_col] = my_block(little_row,little_col);
1074 RCP<crs_matrix_type> pointCrsMatrix = rcp(
new crs_matrix_type(pointRowMap, pointColMap, 0));
1075 pointCrsMatrix->setAllValues(rowptr,colind,values);
1078 pointCrsMatrix->expertStaticFillComplete(pointDomainMap,pointRangeMap);
1080 return pointCrsMatrix;
1092 #define TPETRA_BLOCKCRSMATRIX_HELPERS_INSTANT(S,LO,GO,NODE) \
1093 template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName); \
1094 template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::string const &fileName, Teuchos::ParameterList const ¶ms); \
1095 template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os); \
1096 template void blockCrsMatrixWriter(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const ¶ms); \
1097 template void writeMatrixStrip(BlockCrsMatrix<S,LO,GO,NODE> const &A, std::ostream &os, Teuchos::ParameterList const ¶ms); \
1098 template Teuchos::RCP<BlockCrsMatrix<S, LO, GO, NODE> > convertToBlockCrsMatrix(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize, bool use_LID); \
1099 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > fillLogicalBlocks(const CrsMatrix<S, LO, GO, NODE>& pointMatrix, const LO &blockSize); \
1100 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > unfillFormerBlockCrs(const CrsMatrix<S, LO, GO, NODE>& pointMatrix); \
1101 template Teuchos::RCP<CrsMatrix<S, LO, GO, NODE> > convertToCrsMatrix(const BlockCrsMatrix<S, LO, GO, NODE>& blockMatrix);
1106 #define TPETRA_CREATEMESHMAP_INSTANT(LO,GO,NODE) \
1107 template Teuchos::RCP<const Map<LO,GO,NODE> > createMeshMap (const LO& blockSize, const Map<LO,GO,NODE>& pointMap); \
1108 template Teuchos::RCP<const Map<LO,GO,NODE> > createPointMap (const LO& blockSize, const Map<LO,GO,NODE>& blockMap);
1110 #endif // TPETRA_BLOCKCRSMATRIX_HELPERS_DEF_HPP
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createPointMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &blockMap)
Helper function to generate a point map from a block map (with a given block size) GIDs associated wi...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void blockCrsMatrixWriter(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::string const &fileName)
Helper function to write a BlockCrsMatrix. Calls the 3-argument version.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > unfillFormerBlockCrs(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix)
Unfill all point entries in a logical block of a CrsMatrix with zeroes. This can be called after conv...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the graph, over all processes in the graph's communicator...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
size_t getLocalNumRows() const override
get the local number of block rows
void writeMatrixStrip(BlockCrsMatrix< Scalar, LO, GO, Node > const &A, std::ostream &os, Teuchos::ParameterList const ¶ms)
Helper function called by blockCrsMatrixWriter.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Teuchos::RCP< const Tpetra::Map< LO, GO, Node > > createMeshMap(LO const &blockSize, const Tpetra::Map< LO, GO, Node > &pointMap)
Helper function to generate a mesh map from a point map. Important! It's assumed that point GIDs asso...
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
Teuchos::ArrayView< const global_ordinal_type > getLocalElementList() const
Return a NONOWNING view of the global indices owned by this process.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
Insert new values that don't currently exist.
global_ordinal_type getIndexBase() const
The index base for this Map.
global_size_t getGlobalNumRows() const override
get the global number of block rows
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Teuchos::RCP< Tpetra::CrsMatrix< Scalar, LO, GO, Node > > fillLogicalBlocks(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize)
Fill all point entries in a logical block of a CrsMatrix with zeroes. This should be called before co...
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LO, GO, Node > > convertToCrsMatrix(const Tpetra::BlockCrsMatrix< Scalar, LO, GO, Node > &blockMatrix)
Non-member constructor that creates a point CrsMatrix from an existing BlockCrsMatrix.
Teuchos::RCP< CrsGraph< LocalOrdinal, GlobalOrdinal, Node > > createCrsGraph(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node >> &map, size_t maxNumEntriesPerRow=0, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Nonmember function to create an empty CrsGraph given a row Map and the max number of entries allowed ...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< Tpetra::CrsGraph< LO, GO, Node > > getBlockCrsGraph(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates the CrsGraph of a BlockCrsMatrix from an existing point CrsMatrix...
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > convertToBlockCrsMatrix(const Tpetra::CrsMatrix< Scalar, LO, GO, Node > &pointMatrix, const LO &blockSize, bool use_local_ID=true)
Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.