Panzer_Interpolation.cpp
// @HEADER
// *****************************************************************************
// Panzer: A partial differential equation assembly
// engine for strongly coupled complex multiphysics systems
//
// Copyright 2011 NTESS and the Panzer contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#include "Panzer_Interpolation.hpp"
#include "Intrepid2_OrientationTools.hpp"
#include "Intrepid2_LagrangianInterpolation.hpp"
#ifdef PANZER_HAVE_EPETRA_STACK
#include "Epetra_MpiComm.h"
#endif
// #define PANZER_INTERPOLATION_DEBUG_OUTPUT 1

namespace panzer {
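// removeSmallEntries drops near-zero entries from a CrsMatrix using the usual
// three-pass device-side compaction: (1) count the surviving entries per row,
// (2) prefix-sum the counts into a new row-pointer array, (3) copy the
// surviving column indices and values into the compacted storage.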
template <class Scalar,
          class LocalOrdinal,
          class GlobalOrdinal,
          class Node>
Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
removeSmallEntries(Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& A,
                   typename Teuchos::ScalarTraits<Scalar>::magnitudeType tol) {
  using crs_matrix   = Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
  using row_ptr_type = typename crs_matrix::local_graph_device_type::row_map_type::non_const_type;
  using col_idx_type = typename crs_matrix::local_graph_device_type::entries_type::non_const_type;
  using vals_type    = typename crs_matrix::local_matrix_device_type::values_type;

  using ATS      = Kokkos::ArithTraits<Scalar>;
  using impl_SC  = typename ATS::val_type;
  using impl_ATS = Kokkos::ArithTraits<impl_SC>;

  auto lclA = A->getLocalMatrixDevice();

  auto rowptr = row_ptr_type("rowptr", lclA.numRows() + 1);

  // Pass 1: count the entries per row whose magnitude exceeds tol.
  Kokkos::parallel_for(
      "removeSmallEntries::rowptr1",
      Kokkos::RangePolicy<LocalOrdinal>(0, lclA.numRows()),
      KOKKOS_LAMBDA(const LocalOrdinal rlid) {
        auto row = lclA.row(rlid);
        for (LocalOrdinal k = 0; k < row.length; ++k) {
          if (impl_ATS::magnitude(row.value(k)) > tol) {
            rowptr(rlid + 1) += 1;
          }
        }
      });
  // Pass 2: prefix-sum the per-row counts into row offsets; nnz is the total.
  LocalOrdinal nnz;
  Kokkos::parallel_scan(
      "removeSmallEntries::rowptr2",
      Kokkos::RangePolicy<LocalOrdinal>(0, lclA.numRows()),
      KOKKOS_LAMBDA(const LocalOrdinal rlid, LocalOrdinal& partial_nnz, bool is_final) {
        partial_nnz += rowptr(rlid + 1);
        if (is_final)
          rowptr(rlid + 1) = partial_nnz;
      },
      nnz);

  auto idx  = col_idx_type("idx", nnz);
  auto vals = vals_type("vals", nnz);

  // Pass 3: copy the surviving entries into the compacted storage.
  Kokkos::parallel_for(
      "removeSmallEntries::indicesValues",
      Kokkos::RangePolicy<LocalOrdinal>(0, lclA.numRows()),
      KOKKOS_LAMBDA(const LocalOrdinal rlid) {
        auto row = lclA.row(rlid);
        auto I = rowptr(rlid);
        for (LocalOrdinal k = 0; k < row.length; ++k) {
          if (impl_ATS::magnitude(row.value(k)) > tol) {
            idx(I)  = row.colidx(k);
            vals(I) = row.value(k);
            I += 1;
          }
        }
      });
  Kokkos::fence();

  auto newA = Teuchos::rcp(new crs_matrix(A->getRowMap(), A->getColMap(), rowptr, idx, vals));
  newA->fillComplete(A->getDomainMap(),
                     A->getRangeMap());
  return newA;
}
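// A minimal usage sketch (assuming "A" is any fill-complete Tpetra::CrsMatrix
// with default ordinal and node types):
//
//   Teuchos::RCP<Tpetra::CrsMatrix<double> > A = ...;
//   auto filtered = removeSmallEntries(A, 100 * Teuchos::ScalarTraits<double>::eps());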


Teuchos::RCP<Thyra::LinearOpBase<double> > buildInterpolation(const Teuchos::RCP<const panzer::LinearObjFactory<panzer::Traits> > &linObjFactory,
                                                              const std::string &domain_basis_name, const std::string &range_basis_name,
                                                              Intrepid2::EOperator op, size_t worksetSize,
                                                              const bool matrixFree)
{
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;

  using Scalar = double;

  using tpetraBlockedLinObjFactory = typename panzer::BlockedTpetraLinearObjFactory<panzer::Traits, Scalar, LocalOrdinal, GlobalOrdinal>;
#ifdef PANZER_HAVE_EPETRA_STACK
  using epetraBlockedLinObjFactory = typename panzer::BlockedEpetraLinearObjFactory<panzer::Traits, LocalOrdinal>;
#endif
  using UGI = panzer::GlobalIndexer;

  // must be able to cast to a blocked linear object factory
  RCP<const tpetraBlockedLinObjFactory> tblof = rcp_dynamic_cast<const tpetraBlockedLinObjFactory>(linObjFactory);
#ifdef PANZER_HAVE_EPETRA_STACK
  RCP<const epetraBlockedLinObjFactory> eblof = rcp_dynamic_cast<const epetraBlockedLinObjFactory>(linObjFactory);
#endif

  RCP<const panzer::BlockedDOFManager> blockedDOFMngr;
  if (tblof != Teuchos::null) {
    blockedDOFMngr = tblof->getGlobalIndexer();
#ifdef PANZER_HAVE_EPETRA_STACK
  } else if (eblof != Teuchos::null) {
    blockedDOFMngr = eblof->getGlobalIndexer();
#endif
  } else {
    TEUCHOS_ASSERT(false);
  }

  // get global indexers for domain and range dofs
  std::vector<RCP<UGI> > fieldDOFMngrs = blockedDOFMngr->getFieldDOFManagers();
  int domainFieldNum = blockedDOFMngr->getFieldNum(domain_basis_name);
  int rangeFieldNum = blockedDOFMngr->getFieldNum(range_basis_name);
  int domainBlockIndex = blockedDOFMngr->getFieldBlock(domainFieldNum);
  int rangeBlockIndex = blockedDOFMngr->getFieldBlock(rangeFieldNum);
  RCP<panzer::DOFManager> domain_ugi = rcp_dynamic_cast<panzer::DOFManager>(blockedDOFMngr->getFieldDOFManagers()[domainBlockIndex], true);
  RCP<panzer::DOFManager> range_ugi = rcp_dynamic_cast<panzer::DOFManager>(blockedDOFMngr->getFieldDOFManagers()[rangeBlockIndex], true);

  RCP<const panzer::ConnManager> conn = blockedDOFMngr->getConnManager();

  return buildInterpolation(conn, domain_ugi, range_ugi, domain_basis_name, range_basis_name, op, worksetSize,
                            /*forceVectorial=*/false,
                            /*useTpetra=*/tblof != Teuchos::null,
                            matrixFree);
}
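// A hedged usage sketch (field names are illustrative, not from this file):
// given a blocked linear object factory "lof" whose global indexer knows the
// fields "E_edge" and "B_face", a discrete curl could be requested as
//
//   auto curl = buildInterpolation(lof, "E_edge", "B_face",
//                                  Intrepid2::OPERATOR_CURL,
//                                  /*worksetSize=*/1000, /*matrixFree=*/false);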


Teuchos::RCP<Thyra::LinearOpBase<double> > buildInterpolation(const Teuchos::RCP<const panzer::ConnManager> &conn,
                                                              const Teuchos::RCP<panzer::DOFManager> &domain_ugi,
                                                              const Teuchos::RCP<panzer::DOFManager> &range_ugi,
                                                              const std::string& domain_basis_name,
                                                              const std::string& range_basis_name,
                                                              Intrepid2::EOperator op,
                                                              size_t worksetSize,
                                                              const bool force_vectorial,
                                                              const bool useTpetra,
                                                              const bool matrixFree)
{
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;

  using Scalar = double;

  using STS = Teuchos::ScalarTraits<Scalar>;
  using KAT = Kokkos::ArithTraits<Scalar>;
  using OT  = Teuchos::OrdinalTraits<GlobalOrdinal>;

  using DeviceSpace = PHX::Device;
  using HostSpace = Kokkos::HostSpace;
  using ots = Intrepid2::OrientationTools<DeviceSpace>;
  using li = Intrepid2::LagrangianInterpolation<DeviceSpace>;
  using DynRankDeviceView = Kokkos::DynRankView<double, DeviceSpace>;

  using tp_graph = Tpetra::CrsGraph<LocalOrdinal, GlobalOrdinal>;
  using tp_matrix = Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal>;
  using tp_map = Tpetra::Map<LocalOrdinal, GlobalOrdinal>;
#ifdef PANZER_HAVE_EPETRA_STACK
  using ep_linObjContainer = panzer::BlockedEpetraLinearObjContainer;
  using ep_matrix = Epetra_CrsMatrix;
  using ep_map = Epetra_Map;
#endif

  if (matrixFree) {
    TEUCHOS_ASSERT(useTpetra);
    TEUCHOS_ASSERT(!force_vectorial);
    auto mfOp = rcp(new MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal>(conn, domain_ugi, range_ugi, domain_basis_name, range_basis_name, op, worksetSize));
    return Thyra::tpetraLinearOp<Scalar,LocalOrdinal,GlobalOrdinal,typename tp_matrix::node_type>(Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(mfOp->getRangeMap()),
                                                                                                  Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(mfOp->getDomainMap()),
                                                                                                  mfOp);
  }

  // get the domain and range bases
  auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
  auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,true)->getIntrepidBasis();
  auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
  auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,true)->getIntrepidBasis();

  // cardinalities
  const size_t domainCardinality = domain_basis->getCardinality();
  const size_t rangeCardinality = range_basis->getCardinality();

  const int dim = range_basis->getBaseCellTopology().getDimension();

  if (op == Intrepid2::OPERATOR_VALUE) {
    TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
    TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
  }
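  // For OPERATOR_VALUE the two bases must be compatible: the range basis needs
  // at least as many degrees of freedom as the domain basis and must use the
  // same function space, e.g. injecting a first-order nodal (HGRAD) field into
  // a second-order nodal basis on the same mesh.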

  // Create the global interp matrix.
  RCP<const tp_map> tp_rangemap;
  RCP<const tp_map> tp_domainmap;
  RCP<const tp_map> tp_rowmap;
  RCP<const tp_map> tp_colmap;
  RCP<tp_matrix> tp_interp_matrix;
  typename tp_matrix::local_matrix_device_type lcl_tp_interp_matrix;
#ifdef PANZER_HAVE_EPETRA_STACK
  RCP<const ep_map> ep_rangemap;
  RCP<const ep_map> ep_domainmap;
  RCP<const ep_map> ep_rowmap;
  RCP<const ep_map> ep_colmap;
  RCP<ep_matrix> ep_interp_matrix;
#endif

  auto rangeElementLIDs_d = range_ugi->getLIDs();
  auto domainElementLIDs_d = domain_ugi->getLIDs();

  RCP<Thyra::LinearOpBase<Scalar> > thyra_interp;
  size_t maxNumElementsPerBlock = 0;
  LocalOrdinal minLocalIndex = 0;
  LocalOrdinal maxLocalIndex = 0;
  if (useTpetra) {
    // build maps
    std::vector<GlobalOrdinal> gids;
    range_ugi->getOwnedIndices(gids);
    tp_rowmap = rcp(new tp_map(OT::invalid(), gids.data(), static_cast<LocalOrdinal>(gids.size()), OT::zero(), range_ugi->getComm()));
    tp_rangemap = tp_rowmap;
    domain_ugi->getOwnedIndices(gids);
    tp_domainmap = rcp(new tp_map(OT::invalid(), gids.data(), static_cast<LocalOrdinal>(gids.size()), OT::zero(), domain_ugi->getComm()));
    domain_ugi->getOwnedAndGhostedIndices(gids);
    tp_colmap = rcp(new tp_map(OT::invalid(), gids.data(), static_cast<LocalOrdinal>(gids.size()), OT::zero(), domain_ugi->getComm()));

    minLocalIndex = tp_rowmap->getMinLocalIndex();
    maxLocalIndex = tp_rowmap->getMaxLocalIndex();

    // estimate number of entries per row
    // This is an upper bound, as we are counting dofs that are on shared nodes, edges, faces more than once.
    using dv = Kokkos::DualView<size_t*, typename tp_graph::device_type>;
    dv numEntriesPerRow("numEntriesPerRow", tp_rowmap->getLocalNumElements());
    {
      auto numEntriesPerRow_d = numEntriesPerRow.view_device();

      // loop over element blocks
      std::vector<std::string> elementBlockIds;
      range_ugi->getElementBlockIds(elementBlockIds);
      for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

        // loop over elements
        std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
        Kokkos::View<int *, HostSpace> elementIds_h(elementIds.data(), elementIds.size());
        Kokkos::View<int *, DeviceSpace> elementIds_d("elementIds_d", elementIds_h.extent(0));
        Kokkos::deep_copy(elementIds_d, elementIds_h);
        maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());

        Kokkos::parallel_for("MiniEM_Interpolation::numEntriesPerRow",
                             Kokkos::RangePolicy<size_t, typename tp_matrix::node_type::execution_space>(0, elementIds.size()),
                             KOKKOS_LAMBDA(const size_t elemIter) {
                               auto elemId = elementIds_d(elemIter);

                               // get IDs for range dofs
                               auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());

                               // loop over range LIDs
                               for(size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
                                 const LocalOrdinal range_row = rangeLIDs_d(rangeIter);
                                 const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
                                 if (isOwned)
                                   Kokkos::atomic_add(&numEntriesPerRow_d(range_row), domainCardinality);
                               } // end range LID loop
                             });
      } // blocks loop
      numEntriesPerRow.template modify<typename dv::t_dev>();
      numEntriesPerRow.template sync<typename dv::t_host>();
    }

    // Set up graph
    auto tp_interp_graph = rcp(new tp_graph(tp_rowmap, tp_colmap, numEntriesPerRow));

    { // This runs on host
      Kokkos::View<LocalOrdinal**, HostSpace> rangeElementLIDs_h("rangeElementLIDs_h", rangeElementLIDs_d.extent(0), rangeCardinality);
      Kokkos::View<LocalOrdinal**, HostSpace> domainElementLIDs_h("domainElementLIDs_h", domainElementLIDs_d.extent(0), domainCardinality);
      Kokkos::deep_copy(rangeElementLIDs_h, rangeElementLIDs_d);
      Kokkos::deep_copy(domainElementLIDs_h, domainElementLIDs_d);

      // loop over element blocks
      std::vector<std::string> elementBlockIds;
      range_ugi->getElementBlockIds(elementBlockIds);
      for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

        // loop over elements
        std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
        maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());
        for(std::size_t elemIter = 0; elemIter < elementIds.size(); ++elemIter) {
          auto elemId = elementIds[elemIter];

          // get IDs for range dofs
          auto rangeLIDs_h = Kokkos::subview(rangeElementLIDs_h, elemId, Kokkos::ALL());
          auto domainLIDs_h = Kokkos::subview(domainElementLIDs_h, elemId, Kokkos::ALL());

          // loop over range LIDs
          for(size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
            const LocalOrdinal range_row = rangeLIDs_h(rangeIter);
            const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
            if (isOwned) {
              Teuchos::ArrayView<LocalOrdinal> domainLIDs_av = Teuchos::ArrayView<LocalOrdinal>(domainLIDs_h.data(), domainLIDs_h.extent_int(0));
              tp_interp_graph->insertLocalIndices(range_row, domainLIDs_av);
            }
          } // end range LID loop
        } // elements loop
      } // blocks loop
    }

    RCP<Teuchos::ParameterList> pl = rcp(new Teuchos::ParameterList());
    pl->set("Optimize Storage", true);
    tp_interp_graph->fillComplete(tp_domainmap, tp_rangemap, pl);

    tp_interp_matrix = rcp(new tp_matrix(tp_interp_graph));
    lcl_tp_interp_matrix = tp_interp_matrix->getLocalMatrixDevice();
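    // Assembly is two-phase: the graph above fixes the sparsity pattern, and
    // the value fill below only calls replaceValues() on this local matrix
    // view, which is safe to do from device kernels.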
  }
#ifdef PANZER_HAVE_EPETRA_STACK
  else {

    const RCP<const Teuchos::MpiComm<int> > mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(range_ugi->getComm());
    auto ep_comm = Teuchos::rcp(new Epetra_MpiComm(*mpiComm->getRawMpiComm()));
    std::vector<GlobalOrdinal> gids;
    range_ugi->getOwnedIndices(gids);
    ep_rowmap = rcp(new ep_map(OT::invalid(), static_cast<LocalOrdinal>(gids.size()), gids.data(), OT::zero(), *ep_comm));
    ep_rangemap = ep_rowmap;
    domain_ugi->getOwnedIndices(gids);
    ep_domainmap = rcp(new ep_map(OT::invalid(), static_cast<LocalOrdinal>(gids.size()), gids.data(), OT::zero(), *ep_comm));
    domain_ugi->getOwnedAndGhostedIndices(gids);
    ep_colmap = rcp(new ep_map(OT::invalid(), static_cast<LocalOrdinal>(gids.size()), gids.data(), OT::zero(), *ep_comm));

    {
      // loop over element blocks
      std::vector<std::string> elementBlockIds;
      range_ugi->getElementBlockIds(elementBlockIds);
      for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

        // loop over elements
        std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
        maxNumElementsPerBlock = std::max(maxNumElementsPerBlock, elementIds.size());
      }
    }

    // TODO: Fix this.
    size_t nnzPerRowEstimate = 25*domainCardinality;

    ep_interp_matrix = rcp(new ep_matrix(Copy, *ep_rowmap, *ep_colmap, static_cast<int>(nnzPerRowEstimate), /*StaticProfile=*/true));

    RCP<const Thyra::LinearOpBase<double> > th_ep_interp = Thyra::epetraLinearOp(ep_interp_matrix,
                                                                                 Thyra::NOTRANS,
                                                                                 Thyra::EPETRA_OP_APPLY_APPLY,
                                                                                 Thyra::EPETRA_OP_ADJOINT_SUPPORTED,
                                                                                 Thyra::create_VectorSpace(ep_rangemap),
                                                                                 Thyra::create_VectorSpace(ep_domainmap));
    thyra_interp = Teuchos::rcp_const_cast<Thyra::LinearOpBase<double> >(th_ep_interp);
  }
#endif

  // allocate some views
  int numCells;
  if (maxNumElementsPerBlock > 0)
    numCells = static_cast<int>(std::min(maxNumElementsPerBlock, worksetSize));
  else
    numCells = static_cast<int>(worksetSize);

  DynRankDeviceView range_dofCoords_d("range_dofCoords_d", rangeCardinality, dim);
  DynRankDeviceView basisCoeffsLIOriented_d("basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);

  // the ranks of these depend on dimension
  DynRankDeviceView valuesAtDofCoordsNonOriented_d;
  DynRankDeviceView valuesAtDofCoordsOriented_d;

  if (!force_vectorial) {
    // Let Intrepid2 give us the correctly dimensioned view, then build one with +1 rank and extent(0) == numCells
    auto temp = domain_basis->allocateOutputView(static_cast<int>(rangeCardinality), op);

    // These views have dimensions
    // (numCells, numFields=domainCardinality, numPoints=rangeCardinality, (spatialDim))
    if (temp.rank() == 3) {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
    } else {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
    }
  } else {
    valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", domainCardinality, rangeCardinality, dim);
    valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, domainCardinality, rangeCardinality, dim);
  }

  int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
  TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));

  // tolerance below which interpolation weights are treated as zero
  const typename STS::magnitudeType entryFilterTol = 100 * STS::eps();

  // range dof coordinates
  range_basis->getDofCoords(range_dofCoords_d);

  // compute values of op * (domain basis) at range dof coords on reference element
  domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);

  // get block ids
  std::vector<std::string> elementBlockIds;
  range_ugi->getElementBlockIds(elementBlockIds);

  // get orientations for all blocks
  std::map<std::string, std::vector<Intrepid2::Orientation> > orientations;
  buildIntrepidOrientations(elementBlockIds, *conn, orientations);

  // loop over element blocks
  for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

    auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
    auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
    std::cout << "rangeOffsets_d" << std::endl;
    for (size_t i = 0; i < rangeCardinality; i++)
      std::cout << rangeOffsets_d(i) << " ";
    std::cout << std::endl;
    std::cout << "domainOffsets_d" << std::endl;
    for (size_t i = 0; i < domainCardinality; i++)
      std::cout << domainOffsets_d(i) << " ";
    std::cout << std::endl;
#endif

    // get element ids
    Kokkos::View<int *, DeviceSpace> elementIds_d;
    {
      std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
      Kokkos::View<int *, HostSpace> elementIds_h(elementIds.data(), elementIds.size());
      elementIds_d = Kokkos::View<int *, DeviceSpace>("elementIds_d", elementIds_h.extent(0));
      Kokkos::deep_copy(elementIds_d, elementIds_h);
    }

    // get element orientations
    typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace> elemOrts_d ("elemOrts_d", elementIds_d.extent(0));
    {
      // copy orientations to device
      auto blockOrientations = orientations[elementBlockIds[blockIter]];
      Kokkos::View<Intrepid2::Orientation*, HostSpace> elemOrts_h(blockOrientations.data(), blockOrientations.size());
      Kokkos::deep_copy(elemOrts_d, elemOrts_h);
    }

    // loop over element worksets
    for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {

      // Last workset might be shorter.
      int endCellRange =
          std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
                                 Teuchos::as<int>(elemIter));

      // get subviews on workset
      auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
      auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
      auto worksetRange = Kokkos::make_pair(0, endCellRange);
      auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
      auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());

      // apply orientations for domain basis
      // shuffles things in the second dimension, i.e. wrt domain basis
      ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                    valuesAtDofCoordsNonOriented_d,
                                    elemOrtsWorkset_d,
                                    domain_basis.get());

      // get basis coefficients of domain basis functions wrt range basis
      for(size_t domainIter=0; domainIter<domainCardinality; domainIter++)
        // Get basis coeffs wrt range basis on reference element.
        // basisCoeffsLI has dimensions (numCells, numFields=rangeCardinality, domainCardinality)
        li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), domainIter),
                           Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), domainIter, Kokkos::ALL(), Kokkos::ALL()),
                           range_basis.get(),
                           elemOrtsWorkset_d);

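      // At this point basisCoeffsLIOriented_d(cell, i, j) holds the weight of
      // domain basis function j in the interpolant of range dof i, i.e. one
      // row of the local interpolation matrix per range dof on this cell.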
#ifdef PANZER_HAVE_EPETRA_STACK
      if (!useTpetra) { // Epetra fill

        Kokkos::View<LocalOrdinal*,DeviceSpace> indices_d("indices", domainCardinality);
        Kokkos::View<Scalar*, DeviceSpace> values_d ("values", domainCardinality);

        for (int cellNo = 0; cellNo < endCellRange; cellNo++) {
          auto elemId = elementIds_d(elemIter+cellNo);

          // get IDs for range and domain dofs
          auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());
          auto domainLIDs_d = Kokkos::subview(domainElementLIDs_d, elemId, Kokkos::ALL());

          // loop over range LIDs
          for(size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
            const LocalOrdinal range_row = rangeLIDs_d(rangeOffsets_d(rangeIter));
            const bool isOwned = ep_rowmap->MyLID(range_row);
            if (isOwned) {
              // filter entries for zeros
              LocalOrdinal rowNNZ = 0;
              for(size_t domainIter = 0; domainIter < domainCardinality; domainIter++) {
                Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
                if (KAT::magnitude(val) > entryFilterTol) {
                  indices_d(rowNNZ) = domainLIDs_d(domainOffsets_d(domainIter));
                  values_d(rowNNZ) = val;
                  ++rowNNZ;
                }
              }

              int ret = ep_interp_matrix->ReplaceMyValues(range_row, rowNNZ, values_d.data(), indices_d.data());
              if (ret != 0) {
                ret = ep_interp_matrix->InsertMyValues(range_row, rowNNZ, values_d.data(), indices_d.data());
                TEUCHOS_ASSERT(ret == 0);
              }
            } // end if owned
          } // end range LID loop
        } // end cell loop
      } // Epetra fill
      else
#endif
      { // Tpetra fill
        Kokkos::parallel_for(
            "MiniEM_Interpolation::worksetLoop",
            Kokkos::RangePolicy<int, typename tp_matrix::node_type::execution_space>(0, endCellRange),
            KOKKOS_LAMBDA(const int cellNo) {
              auto elemId = elementIds_d(elemIter+cellNo);

              // get IDs for range and domain dofs
              auto rangeLIDs_d = Kokkos::subview(rangeElementLIDs_d, elemId, Kokkos::ALL());
              auto domainLIDs_d = Kokkos::subview(domainElementLIDs_d, elemId, Kokkos::ALL());

#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
              std::cout << "\n" << elemOrts_d(elemIter+cellNo).to_string() << std::endl;
              std::cout << "rangeLIDs" << std::endl;
              for (size_t i = 0; i < rangeCardinality; i++)
                std::cout << rangeLIDs_d(i) << " ";
              std::cout << std::endl << "domainLIDs" << std::endl;
              for (size_t i = 0; i < domainCardinality; i++)
                std::cout << domainLIDs_d(i) << " ";
              std::cout << std::endl;
#endif
              // loop over range LIDs
              for(size_t rangeIter = 0; rangeIter < rangeCardinality; ++rangeIter) {
                const LocalOrdinal range_row = rangeLIDs_d(rangeOffsets_d(rangeIter));
                const bool isOwned = ((minLocalIndex <= range_row) && (range_row <= maxLocalIndex));
                if (isOwned) {
                  // filter entries for zeros
                  for(size_t domainIter = 0; domainIter < domainCardinality; domainIter++) {
                    Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
                    if (KAT::magnitude(val) > entryFilterTol) {

#if defined(PANZER_INTERPOLATION_DEBUG_OUTPUT) || defined(PANZER_DEBUG)
                      {
                        // Check that there is no entry yet or that we are overwriting it with the same value
                        auto row = lcl_tp_interp_matrix.rowConst(range_row);
                        for(LocalOrdinal kk = 0; kk<row.length; ++kk)
                          if (row.colidx(kk) == domainLIDs_d(domainOffsets_d(domainIter)))
                            if (!(KAT::magnitude(row.value(kk)-val) < entryFilterTol || KAT::magnitude(row.value(kk)) < entryFilterTol)) {
#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
                              std::cout << "Replacing (" << range_row << "," << row.colidx(kk) << ") = " << row.value(kk) << " with " << val << std::endl;
#endif
#ifdef PANZER_DEBUG
                              TEUCHOS_ASSERT(false);
#endif
                            }
                      }
#endif
#ifdef PANZER_INTERPOLATION_DEBUG_OUTPUT
                      std::cout << "Setting (" << range_row << "," << domainLIDs_d(domainOffsets_d(domainIter)) << ") = " << val << std::endl;
#endif
                      lcl_tp_interp_matrix.replaceValues(range_row, &(domainLIDs_d(domainOffsets_d(domainIter))), 1, &val, /*is_sorted=*/false, /*force_atomic=*/true);
                    }
                  } // end domain LID loop
                } // end if owned
              } // end range LID loop
            }); // end cell loop
      } // Tpetra fill
    } // end workset loop
  } // end element block loop

  if (useTpetra) {
    tp_interp_matrix->fillComplete(tp_domainmap, tp_rangemap);

    if (op != Intrepid2::OPERATOR_VALUE) {
      // Discrete gradient, curl, etc. actually live on edges, faces, etc., so we have a lot of zeros in the matrix.
      tp_interp_matrix = removeSmallEntries(tp_interp_matrix, entryFilterTol);
    }

    thyra_interp = Thyra::tpetraLinearOp<Scalar,LocalOrdinal,GlobalOrdinal,typename tp_matrix::node_type>(Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(tp_rangemap),
                                                                                                          Thyra::createVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal>(tp_domainmap),
                                                                                                          tp_interp_matrix);
  }
#ifdef PANZER_HAVE_EPETRA_STACK
  else
    ep_interp_matrix->FillComplete(*ep_domainmap, *ep_rangemap);
#endif

  return thyra_interp;
}
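// With matrixFree == true the same call returns a Thyra wrapper around the
// MatrixFreeInterpolationOp defined below instead of an assembled matrix;
// the operator's action is then recomputed cell-by-cell on every apply().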

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
MatrixFreeInterpolationOp(const Teuchos::RCP<const panzer::ConnManager> &conn,
                          const Teuchos::RCP<panzer::DOFManager> &_domain_ugi,
                          const Teuchos::RCP<panzer::DOFManager> &_range_ugi,
                          const std::string& _domain_basis_name,
                          const std::string& _range_basis_name,
                          Intrepid2::EOperator _op,
                          size_t _worksetSize) :
  name(""),
  domain_basis_name(_domain_basis_name),
  range_basis_name(_range_basis_name),
  op(_op),
  worksetSize(_worksetSize),
  domain_ugi(_domain_ugi),
  range_ugi(_range_ugi)
{

  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;

  using OT = Teuchos::OrdinalTraits<GlobalOrdinal>;

  // typedef Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal> tp_matrix;
  typedef Tpetra::Map<LocalOrdinal,GlobalOrdinal> tp_map;

  RCP<const tp_map> tp_rangemap;
  RCP<const tp_map> tp_domainmap;
  RCP<const tp_map> tp_rowmap;
  RCP<const tp_map> tp_colmap;

  {
    // build maps
    std::vector<GlobalOrdinal> gids;
    range_ugi->getOwnedIndices(gids);
    tp_rowmap = rcp(new tp_map(OT::invalid(), gids.data(), gids.size(), OT::zero(), range_ugi->getComm()));
    tp_rangemap = tp_rowmap;
    domain_ugi->getOwnedIndices(gids);
    tp_domainmap = rcp(new tp_map(OT::invalid(), gids.data(), gids.size(), OT::zero(), domain_ugi->getComm()));
    domain_ugi->getOwnedAndGhostedIndices(gids);
    tp_colmap = rcp(new tp_map(OT::invalid(), gids.data(), gids.size(), OT::zero(), domain_ugi->getComm()));
  }

  domainMap_ = tp_domainmap;
  rangeMap_ = tp_rangemap;
  columnMap_ = tp_colmap;
  import_ = rcp(new Tpetra::Import<LocalOrdinal,GlobalOrdinal,Node>(domainMap_, columnMap_));

  allocateColumnMapVector(1);

  precomputeOwnersAndOrientations(conn);

}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
allocateColumnMapVector(size_t numVectors) {
  colmapMV_ = rcp(new Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>(columnMap_, numVectors));
}
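// colmapMV_ is the ghosted (column-map) work vector shared by both apply
// paths; it must have as many columns as the multivectors passed to apply().
// The constructor sizes it for a single column.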

// Pre-compute the element that owns each range DoF.
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
precomputeOwnersAndOrientations(const Teuchos::RCP<const panzer::ConnManager> &conn) {
  size_t maxNumElementsPerBlock = 0;
  int numCells;
  if (maxNumElementsPerBlock > 0)
    numCells = static_cast<int>(std::min(maxNumElementsPerBlock, worksetSize));
  else
    numCells = static_cast<int>(worksetSize);

  LocalOrdinal lclTargetSize = getRangeMap()->getLocalNumElements();
  owner_d_ = Kokkos::View<LocalOrdinal*,DeviceSpace>("owner", lclTargetSize);

  auto rangeLIDs_d = range_ugi->getLIDs();

  auto owner_d = owner_d_;

  // loop over element blocks
  std::vector<std::string> elementBlockIds;
  range_ugi->getElementBlockIds(elementBlockIds);
  for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

    // loop over element worksets
    std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
    Kokkos::View<int*,DeviceSpace>::HostMirror elementIds_h(elementIds.data(), elementIds.size());
    Kokkos::View<int*,DeviceSpace> elementIds_d("elementIds_d", elementIds_h.extent(0));
    Kokkos::deep_copy(elementIds_d, elementIds_h);
    for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {
      using range_type = Kokkos::RangePolicy<LocalOrdinal, DeviceSpace>;
      Kokkos::parallel_for("miniEM::MatrixFreeInterpolationOp::cellLoop",
                           range_type(elemIter, std::min(elemIter+numCells,
                                                         elementIds_d.extent(0))),
                           KOKKOS_LAMBDA(const LocalOrdinal cellNo2) {
                             auto elemId = elementIds_d(cellNo2);

                             // loop over range LIDs
                             for(size_t rangeIter = 0; rangeIter < rangeLIDs_d.extent(1); ++rangeIter) {
                               LocalOrdinal range_row = rangeLIDs_d(elemId, rangeIter);
                               if (range_row < lclTargetSize)
                                 owner_d(range_row) = elemId;
                             }
                           });
    }
  }

  // get orientations for all blocks
  std::map<std::string, std::vector<Intrepid2::Orientation> > orientations;
  buildIntrepidOrientations(elementBlockIds, *conn, orientations);

  using HostSpace = Kokkos::HostSpace;

  // copy orientations to device
  for (auto const& orientation : orientations) {
    auto blockOrientations = orientation.second;
    orientations_[orientation.first] = typename Kokkos::DynRankView<Intrepid2::Orientation,DeviceSpace>("elemOrts_d", blockOrientations.size());
    Kokkos::View<Intrepid2::Orientation*, HostSpace> elemOrts_h(blockOrientations.data(), blockOrientations.size());
    Kokkos::deep_copy(orientations_[orientation.first], elemOrts_h);
  }
}
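// Note: elements are visited in block order, so for a DoF shared by several
// elements the last writer wins; owner_d_ thereby picks exactly one owning
// element per locally owned range DoF, which the apply routines use to make
// sure each row of the result is written only once.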

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
apply (const Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &X,
       Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &Y,
       Teuchos::ETransp mode,
       Scalar alpha,
       Scalar beta) const {

  if (mode == Teuchos::NO_TRANS) {
    applyNonTransposed(X, Y, alpha, beta);
    return;
  } else if (mode == Teuchos::TRANS) {
    applyTransposed(X, Y, alpha, beta);
    return;
  } else
    TEUCHOS_ASSERT(false);
}
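// Following the usual Tpetra::Operator contract, both branches compute
// Y = beta*Y + alpha*op(M)*X, where op(M) is the interpolation operator or
// its transpose; conjugate-transpose mode is not supported.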

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
applyNonTransposed (const Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &X,
                    Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &Y,
                    Scalar alpha,
                    Scalar beta) const {

  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;
  using range_type = Kokkos::RangePolicy<LocalOrdinal, DeviceSpace>;

  using ots = Intrepid2::OrientationTools<DeviceSpace>;
  using li = Intrepid2::LagrangianInterpolation<DeviceSpace>;
  using DynRankDeviceView = Kokkos::DynRankView<Scalar,DeviceSpace>;
  using view_t = typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev;
  using const_view_t = typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev::const_type;

  Teuchos::TimeMonitor tm(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: matrix-free apply no_trans ") + name));

  using TST = Teuchos::ScalarTraits<Scalar>;
  const Scalar ZERO = TST::zero();
  if (beta == ZERO)
    Y.putScalar(ZERO);
  // pull in the ghosted entries of X
  colmapMV_->doImport(X, *import_, Tpetra::INSERT);

  const_view_t lclX = colmapMV_->getLocalViewDevice(Tpetra::Access::ReadOnly);
  view_t lclY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
  size_t numVectors = lclY.extent(1);
  LocalOrdinal lclTargetSize = getRangeMap()->getLocalNumElements();

  // get the domain and range bases
  auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
  auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,true)->getIntrepidBasis();
  auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
  auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,true)->getIntrepidBasis();

  // cardinalities
  const size_t domainCardinality = domain_basis->getCardinality();
  const size_t rangeCardinality = range_basis->getCardinality();

  size_t maxNumElementsPerBlock = 0;

  const int dim = range_basis->getBaseCellTopology().getDimension();

  if (op == Intrepid2::OPERATOR_VALUE) {
    TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
    TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
  }

  // allocate some views
  int numCells;
  if (maxNumElementsPerBlock > 0)
    numCells = static_cast<int>(std::min(maxNumElementsPerBlock, worksetSize));
  else
    numCells = static_cast<int>(worksetSize);
  DynRankDeviceView range_dofCoords_d("range_dofCoords_d", rangeCardinality, dim);
  DynRankDeviceView basisCoeffsLIOriented_d("basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);

  // the ranks of these depend on dimension
  DynRankDeviceView valuesAtDofCoordsNonOriented_d;
  DynRankDeviceView valuesAtDofCoordsOriented_d;
  DynRankDeviceView reducedValuesAtDofCoordsOriented_d;

  {
    // Let Intrepid2 give us the correctly dimensioned view, then build one with +1 rank and extent(0) == numCells
    auto temp = domain_basis->allocateOutputView(rangeCardinality, op);

    // These views have dimensions
    // (numCells, numFields=domainCardinality, numPoints=rangeCardinality, (spatialDim))
    if (temp.rank() == 3) {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
      reducedValuesAtDofCoordsOriented_d = DynRankDeviceView("reducedValuesAtDofCoordsOriented_d", numCells, temp.extent(1), temp.extent(2), numVectors);
    } else {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
      reducedValuesAtDofCoordsOriented_d = DynRankDeviceView("reducedValuesAtDofCoordsOriented_d", numCells, temp.extent(1), numVectors);
    }
  }

  int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
  TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));

  auto rangeLIDs_d = range_ugi->getLIDs();
  auto domainLIDs_d = domain_ugi->getLIDs();

  // range dof coordinates
  range_basis->getDofCoords(range_dofCoords_d);

  // compute values of op * (domain basis) at range dof coords on reference element
  domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);

  // loop over element blocks
  std::vector<std::string> elementBlockIds;
  range_ugi->getElementBlockIds(elementBlockIds);
  for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

    auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
    auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);

    // get element ids
    Kokkos::View<int *, DeviceSpace> elementIds_d;
    {
      std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
      Kokkos::View<int *, Kokkos::HostSpace> elementIds_h(elementIds.data(), elementIds.size());
      elementIds_d = Kokkos::View<int *, DeviceSpace>("elementIds_d", elementIds_h.extent(0));
      Kokkos::deep_copy(elementIds_d, elementIds_h);
    }

    // get element orientations
    auto elemOrts_d = orientations_.at(elementBlockIds[blockIter]);

    for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {

      // Last workset might be shorter.
      int endCellRange =
          std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
                                 Teuchos::as<int>(elemIter));

      // get subviews on workset
      auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
      auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
      auto worksetRange = Kokkos::make_pair(0, endCellRange);
      auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
      auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());

      // apply orientations for domain basis
      // shuffles things in the second dimension, i.e. wrt domain basis
      ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                    valuesAtDofCoordsNonOriented_d,
                                    elemOrtsWorkset_d,
                                    domain_basis.get());

      Kokkos::deep_copy(reducedValuesAtDofCoordsOriented_d, 0.0);

      if (reducedValuesAtDofCoordsOriented_d.rank() == 4) {
        // contract the domain dofs of X into the oriented basis values
        Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp:cellLoop1",
                             range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
                             KOKKOS_LAMBDA(const LocalOrdinal cellNo) {
                               LocalOrdinal cellNo2 = elemIter+cellNo;
                               LocalOrdinal elemId = elementIds_d(cellNo2);
                               for(size_t domainIter=0; domainIter<domainCardinality; domainIter++) {
                                 LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
                                 for(size_t rangeIter=0; rangeIter<rangeCardinality; rangeIter++) {
                                   for(size_t d=0; d<valuesAtDofCoordsOriented_d.extent(3); d++) {
                                     Scalar val = valuesAtDofCoordsOriented_d(cellNo, domainIter, rangeIter, d);
                                     for (size_t j = 0; j<numVectors; ++j)
                                       reducedValuesAtDofCoordsOriented_d(cellNo, rangeIter, d, j) += val * lclX(J, j);
                                   }
                                 }
                               }
                             });

        for (size_t j = 0; j<numVectors; ++j)
          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), j),
                             Kokkos::subview(reducedValuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), j),
                             range_basis.get(),
                             elemOrtsWorkset_d);
      } else {
        Kokkos::parallel_for("miniEM:MatrixFreeInterpolationOp:cellLoop1",
                             range_type(0, std::min(numCells, elementIds_d.extent_int(0)-Teuchos::as<int>(elemIter))),
                             KOKKOS_LAMBDA(const LocalOrdinal cellNo) {
                               LocalOrdinal cellNo2 = elemIter+cellNo;
                               LocalOrdinal elemId = elementIds_d(cellNo2);
                               for(size_t domainIter=0; domainIter<domainCardinality; domainIter++) {
                                 LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
                                 for(size_t rangeIter=0; rangeIter<rangeCardinality; rangeIter++) {
                                   Scalar val = valuesAtDofCoordsOriented_d(cellNo, domainIter, rangeIter);
                                   for (size_t j = 0; j<numVectors; ++j)
                                     reducedValuesAtDofCoordsOriented_d(cellNo, rangeIter, j) += val * lclX(J, j);
                                 }
                               }
                             });

        for (size_t j = 0; j<numVectors; ++j)
          li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), j),
                             Kokkos::subview(reducedValuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), j),
                             range_basis.get(),
                             elemOrtsWorkset_d);
      }

      auto owner_d = owner_d_;

      Kokkos::parallel_for("miniEM::MatrixFreeInterpolationOp::cellLoop2",
                           range_type(elemIter, std::min(elemIter+numCells,
                                                         elementIds_d.extent(0))),
                           KOKKOS_LAMBDA(const LocalOrdinal cellNo2) {
                             LocalOrdinal cellNo = cellNo2-elemIter;
                             LocalOrdinal elemId = elementIds_d(cellNo2);

                             // loop over range LIDs
                             for(size_t rangeIter = 0; rangeIter < rangeLIDs_d.extent(1); ++rangeIter) {
                               LocalOrdinal range_row = rangeLIDs_d(elemId, rangeOffsets_d(rangeIter));

                               // if owned
                               if ((range_row < lclTargetSize) && (owner_d(range_row) == elemId)) {

                                 for (size_t j = 0; j<numVectors; ++j) {
                                   Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, j);
                                   lclY(range_row,j) = beta*lclY(range_row,j) + alpha*val;
                                 }
                               } // end if owned
                             } // end range LID loop
                           }); // end element loop
    } // end workset loop
  } // end element block loop

}
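// The non-transposed apply never forms matrix entries: X is imported into the
// ghosted column-map vector, each workset contracts the oriented basis values
// against the local X entries, and getBasisCoeffs turns the result directly
// into range-basis coefficients that are written into Y.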

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
MatrixFreeInterpolationOp<Scalar,LocalOrdinal,GlobalOrdinal,Node>::
applyTransposed (const Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &X,
                 Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> &Y,
                 Scalar alpha,
                 Scalar beta) const {

  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_dynamic_cast;
  using range_type = Kokkos::RangePolicy<LocalOrdinal, DeviceSpace>;

  typedef Intrepid2::OrientationTools<DeviceSpace> ots;
  typedef Intrepid2::LagrangianInterpolation<DeviceSpace> li;
  typedef Kokkos::DynRankView<Scalar,DeviceSpace> DynRankDeviceView;
  using TST = Teuchos::ScalarTraits<Scalar>;
  const Scalar ZERO = TST::zero();
  using view_t = typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev;
  using const_view_t = typename Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>::dual_view_type::t_dev::const_type;

  Teuchos::TimeMonitor tm(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: matrix-free apply trans ") + name));

  const_view_t lclX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
  colmapMV_->putScalar(ZERO);
  view_t lclYtemp = colmapMV_->getLocalViewDevice(Tpetra::Access::ReadWrite);

  // get the domain and range bases
  auto domain_fieldPattern = domain_ugi->getFieldPattern(domain_basis_name);
  auto domain_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(domain_fieldPattern,true)->getIntrepidBasis();
  auto range_fieldPattern = range_ugi->getFieldPattern(range_basis_name);
  auto range_basis = rcp_dynamic_cast<const panzer::Intrepid2FieldPattern>(range_fieldPattern,true)->getIntrepidBasis();

  // cardinalities
  const size_t domainCardinality = domain_basis->getCardinality();
  const size_t rangeCardinality = range_basis->getCardinality();

  size_t maxNumElementsPerBlock = 0;

  const int dim = range_basis->getBaseCellTopology().getDimension();

  if (op == Intrepid2::OPERATOR_VALUE) {
    TEUCHOS_ASSERT(rangeCardinality >= domainCardinality);
    TEUCHOS_ASSERT_EQUALITY(domain_basis->getFunctionSpace(), range_basis->getFunctionSpace());
  }

  // allocate some views
  int numCells;
  if (maxNumElementsPerBlock > 0)
    numCells = static_cast<int>(std::min(maxNumElementsPerBlock, worksetSize));
  else
    numCells = static_cast<int>(worksetSize);
  DynRankDeviceView range_dofCoords_d("range_dofCoords_d", rangeCardinality, dim);
  DynRankDeviceView basisCoeffsLIOriented_d("basisCoeffsLIOriented_d", numCells, rangeCardinality, domainCardinality);

  // the ranks of these depend on dimension
  DynRankDeviceView valuesAtDofCoordsNonOriented_d;
  DynRankDeviceView valuesAtDofCoordsOriented_d;

  {
    // Let Intrepid2 give us the correctly dimensioned view, then build one with +1 rank and extent(0) == numCells
    auto temp = domain_basis->allocateOutputView(rangeCardinality, op);

    // These views have dimensions
    // (numCells, numFields=domainCardinality, numPoints=rangeCardinality, (spatialDim))
    if (temp.rank() == 3) {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1), temp.extent(2));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1), temp.extent(2));
    } else {
      valuesAtDofCoordsNonOriented_d = DynRankDeviceView("valuesAtDofCoordsNonOriented_d", temp.extent(0), temp.extent(1));
      valuesAtDofCoordsOriented_d = DynRankDeviceView("valuesAtDofCoordsOriented_d", numCells, temp.extent(0), temp.extent(1));
    }
  }

  int fieldRank = Intrepid2::getFieldRank(range_basis->getFunctionSpace());
  TEUCHOS_ASSERT((fieldRank == 0) || (fieldRank == 1));

  auto rangeLIDs_d = range_ugi->getLIDs();
  auto domainLIDs_d = domain_ugi->getLIDs();
  Kokkos::fence();

  // range dof coordinates
  range_basis->getDofCoords(range_dofCoords_d);

  // compute values of op * (domain basis) at range dof coords on reference element
  domain_basis->getValues(valuesAtDofCoordsNonOriented_d, range_dofCoords_d, op);

  // loop over element blocks
  std::vector<std::string> elementBlockIds;
  range_ugi->getElementBlockIds(elementBlockIds);
  for(std::size_t blockIter = 0; blockIter < elementBlockIds.size(); ++blockIter) {

    auto rangeOffsets_d = range_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);
    auto domainOffsets_d = domain_ugi->getGIDFieldOffsetsKokkos(elementBlockIds[blockIter], 0);

    // get element ids
    Kokkos::View<int *, DeviceSpace> elementIds_d;
    {
      std::vector<int> elementIds = range_ugi->getElementBlock(elementBlockIds[blockIter]);
      Kokkos::View<int *, Kokkos::HostSpace> elementIds_h(elementIds.data(), elementIds.size());
      elementIds_d = Kokkos::View<int *, DeviceSpace>("elementIds_d", elementIds_h.extent(0));
      Kokkos::deep_copy(elementIds_d, elementIds_h);
    }

    // get element orientations
    auto elemOrts_d = orientations_.at(elementBlockIds[blockIter]);

    for(std::size_t elemIter = 0; elemIter < elementIds_d.extent(0); elemIter += numCells) {

      // Last workset might be shorter.
      int endCellRange =
          std::min(numCells, Teuchos::as<int>(elementIds_d.extent(0)) -
                                 Teuchos::as<int>(elemIter));

      // get subviews on workset
      auto ortsRange = Kokkos::make_pair(elemIter, std::min(elemIter + numCells, elemOrts_d.extent(0)));
      auto elemOrtsWorkset_d = Kokkos::subview(elemOrts_d, ortsRange);
      auto worksetRange = Kokkos::make_pair(0, endCellRange);
      auto valuesAtDofCoordsOrientedWorkset_d = Kokkos::subview(valuesAtDofCoordsOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
      auto basisCoeffsLIOrientedWorkset_d = Kokkos::subview(basisCoeffsLIOriented_d, worksetRange, Kokkos::ALL(), Kokkos::ALL());

      // apply orientations for domain basis
      // shuffles things in the second dimension, i.e. wrt domain basis
      ots::modifyBasisByOrientation(valuesAtDofCoordsOrientedWorkset_d,
                                    valuesAtDofCoordsNonOriented_d,
                                    elemOrtsWorkset_d,
                                    domain_basis.get());
      Kokkos::fence();

      // get basis coefficients of domain basis functions wrt range basis
      for(size_t domainIter=0; domainIter<domainCardinality; domainIter++)
        // Get basis coeffs wrt range basis on reference element.
        // basisCoeffsLI has dimensions (numCells, numFields=rangeCardinality, domainCardinality)
        li::getBasisCoeffs(Kokkos::subview(basisCoeffsLIOrientedWorkset_d, Kokkos::ALL(), Kokkos::ALL(), domainIter),
                           Kokkos::subview(valuesAtDofCoordsOrientedWorkset_d, Kokkos::ALL(), domainIter, Kokkos::ALL(), Kokkos::ALL()),
                           range_basis.get(), elemOrtsWorkset_d);
      Kokkos::fence();

      auto owner_d = owner_d_;

      Kokkos::parallel_for("miniEM::MatrixFreeInterpolationOp::cellLoop",
                           range_type(elemIter, std::min(elemIter+numCells,
                                                         elementIds_d.extent(0))),
                           KOKKOS_LAMBDA(const LocalOrdinal cellNo2) {
                             LocalOrdinal cellNo = cellNo2-elemIter;
                             LocalOrdinal elemId = elementIds_d(cellNo2);

                             // loop over range LIDs
                             for(size_t rangeIter = 0; rangeIter < rangeLIDs_d.extent(1); ++rangeIter) {
                               LocalOrdinal range_row = rangeLIDs_d(elemId, rangeOffsets_d(rangeIter));

                               // if owned
                               if ((range_row < static_cast<LocalOrdinal>(lclX.extent(0))) && (owner_d(range_row) == elemId)) {

                                 for(size_t domainIter = 0; domainIter < domainLIDs_d.extent(1); domainIter++) {
                                   LocalOrdinal J = domainLIDs_d(elemId, domainOffsets_d(domainIter));
                                   Scalar val = basisCoeffsLIOriented_d(cellNo, rangeIter, domainIter);
                                   for (size_t j = 0; j<lclYtemp.extent(1); ++j)
                                     Kokkos::atomic_add(&lclYtemp(J,j), alpha*val*lclX(range_row,j));
                                 }
                               } // end if owned
                             } // end range LID loop
                           }); // end element loop
    } // end workset loop
  } // end element block loop
  Kokkos::fence();

  if (beta == ZERO)
    Y.putScalar(ZERO);
  else
    Y.scale(beta);
  Y.doExport(*colmapMV_, *import_, Tpetra::ADD_ASSIGN);
  Kokkos::fence();
}
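// applyTransposed mirrors the forward path: per-cell contributions are
// accumulated atomically into the ghosted column-map vector and then combined
// across ranks with doExport(..., Tpetra::ADD_ASSIGN), the dual of the
// doImport used in applyNonTransposed.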


} // namespace panzer