Compadre_LinearAlgebra.cpp
#include "Compadre_Functors.hpp"
//#include <KokkosBlas.hpp>
//#include <KokkosBatched_LU_Decl.hpp>
//#include <KokkosBatched_LU_Serial_Impl.hpp>
//#include <KokkosBatched_LU_Team_Impl.hpp>
//#include <KokkosBatched_Trsv_Decl.hpp>
//#include <KokkosBatched_Trsv_Serial_Impl.hpp>
//#include <KokkosBatched_Trsv_Team_Impl.hpp>

namespace Compadre {
namespace GMLS_LinearAlgebra {

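// batchQRFactorize solves the least-squares problem min ||P x - RHS|| for a
// batch of num_matrices problems, using cublasDgelsBatched on CUDA builds and
// LAPACK's dgels otherwise; each problem's solution is returned in the leading
// rows of its RHS block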
void batchQRFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int * neighbor_list_sizes) {

    // P was constructed layout right, while LAPACK and CUDA expect layout left
    // P is not square and not symmetric, so we must convert it to layout left
    // RHS is symmetric and square, so no conversion is necessary
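    // (e.g., a 2x3 matrix [a b c; d e f] sits in memory as a,b,c,d,e,f in
    // layout right, but as a,d,b,e,c,f in layout left)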
    ConvertLayoutRightToLeft crl(pm, lda, nda, P);
    int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
    pm.clearScratchSizes();
    pm.setTeamScratchSize(1, scratch_size);
    pm.CallFunctorWithTeamThreads(num_matrices, crl);
    Kokkos::fence();

#ifdef COMPADRE_USE_CUDA

    Kokkos::Profiling::pushRegion("QR::Setup(Pointers)");
    Kokkos::View<size_t*> array_P_RHS("P and RHS matrix pointers on device", 2*num_matrices);

    // get pointers to device data
    Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(const int i) {
        array_P_RHS(i) = reinterpret_cast<size_t>(P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda));
        array_P_RHS(i + num_matrices) = reinterpret_cast<size_t>(RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb));
    });
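    // cuBLAS batched routines take a device array of device pointers, so the
    // per-matrix pointers are computed on the device, stored as size_t, and
    // reinterpreted as double** when passed to cuBLAS below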

    Kokkos::View<int*> devInfo("devInfo", num_matrices);
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("QR::Setup(Handle)");
    // Create cublas instance
    cublasHandle_t cublas_handle;
    cublasStatus_t cublas_stat;
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("QR::Setup(Create)");
    cublasCreate(&cublas_handle);
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("QR::Execution");
    int info;

    // solve the batch of least-squares problems via QR (gels); each P block is
    // overwritten with its QR factors and each RHS block with its solution
    cublas_stat=cublasDgelsBatched(cublas_handle, CUBLAS_OP_N,
                                   M, N, NRHS,
                                   reinterpret_cast<double**>(array_P_RHS.data()), lda,
                                   reinterpret_cast<double**>(array_P_RHS.data() + TO_GLOBAL(num_matrices)), ldb,
                                   &info, devInfo.data(), num_matrices );

    compadre_assert_release(cublas_stat==CUBLAS_STATUS_SUCCESS && "cublasDgelsBatched failed");
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

#elif defined(COMPADRE_USE_LAPACK)

    // requires column-major input

    Kokkos::Profiling::pushRegion("QR::Setup(Create)");

    // workspace query: calling dgels with lwork = -1 only computes the optimal
    // workspace size, which is returned in wkopt
    int lwork = -1, info = 0; double wkopt = 0;

    if (num_matrices > 0) {
        dgels_( (char *)"N", &M, &N, &NRHS,
                (double *)NULL, &lda,
                (double *)NULL, &ldb,
                &wkopt, &lwork, &info );
    }
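    // (a workspace-size query does not dereference the matrix arguments, which
    // is why the NULL pointers above are safe)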

    // workspace size needed for each problem
    lwork = (int)wkopt;

    Kokkos::Profiling::popRegion();

    std::string transpose_or_no = "N";

    #ifdef LAPACK_DECLARED_THREADSAFE

    int scratch_space_size = scratch_vector_type::shmem_size( lwork ); // work space

    Kokkos::parallel_for(
            team_policy(num_matrices, Kokkos::AUTO)
            .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
            KOKKOS_LAMBDA (const member_type& teamMember) {

        scratch_vector_type scratch_work(teamMember.team_scratch(0), lwork);

        int i_info = 0;

        const int i = teamMember.league_rank();

        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        // use a custom # of neighbors for each problem, if possible
        const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; // assumes M is some positive integer scaling of max_neighbors
        int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
        int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
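        // e.g., with max_neighbors = 20 and M = 40 (two rows sampled per
        // neighbor), multiplier = 2, so a problem with 15 neighbors uses 30 rows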

        Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
            dgels_( const_cast<char *>(transpose_or_no.c_str()),
                    const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
                    p_offset, const_cast<int*>(&lda),
                    rhs_offset, const_cast<int*>(&ldb),
                    scratch_work.data(), const_cast<int*>(&lwork), &i_info);
        });

        compadre_assert_release(i_info==0 && "dgels failed");

    }, "QR Execution");

    #else

    Kokkos::View<double*> scratch_work("scratch_work", lwork);
    for (int i=0; i<num_matrices; ++i) {

        int i_info = 0;

        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        // use a custom # of neighbors for each problem, if possible
        const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; // assumes M is some positive integer scaling of max_neighbors
        int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
        int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;

        dgels_( const_cast<char *>(transpose_or_no.c_str()),
                const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
                p_offset, const_cast<int*>(&lda),
                rhs_offset, const_cast<int*>(&ldb),
                scratch_work.data(), const_cast<int*>(&lwork), &i_info);

        compadre_assert_release(i_info==0 && "dgels failed");
    }

    #endif // LAPACK is not threadsafe

#endif

    // Results are written layout left, so they need to be converted to layout right
    ConvertLayoutLeftToRight clr(pm, ldb, ndb, RHS);
    scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
    pm.clearScratchSizes();
    pm.setTeamScratchSize(1, scratch_size);
    pm.CallFunctorWithTeamThreads(num_matrices, clr);
    Kokkos::fence();

}

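// batchSVDFactorize solves the same style of batched least-squares problem via
// an SVD: cuSOLVER's Jacobi-based gesvdj on CUDA builds (with the pseudoinverse
// applied manually below), and LAPACK's divide-and-conquer dgelsd otherwise;
// unlike QR, the SVD remains well defined for rank-deficient P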
void batchSVDFactorize(ParallelManager pm, bool swap_layout_P, double *P, int lda, int nda, bool swap_layout_RHS, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int * neighbor_list_sizes) {

    if (swap_layout_P) {
        // P was constructed layout right, while LAPACK and CUDA expect layout left
        // P is not square and not symmetric, so we must convert it to layout left
        // RHS is symmetric and square, so no conversion is necessary
        ConvertLayoutRightToLeft crl(pm, lda, nda, P);
        int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
        pm.clearScratchSizes();
        pm.setTeamScratchSize(1, scratch_size);
        pm.CallFunctorWithTeamThreads(num_matrices, crl);
        Kokkos::fence();
    }

#ifdef COMPADRE_USE_CUDA

    const int NUM_STREAMS = 64;

    Kokkos::Profiling::pushRegion("SVD::Setup(Allocate)");

    int ldu = M;
    int ldv = N;
    int min_mn = (M<N) ? M : N;
    int max_mn = (M>N) ? M : N;
    // local U, S, and V must be allocated
    Kokkos::View<double*> U("U", TO_GLOBAL(num_matrices)*TO_GLOBAL(ldu)*TO_GLOBAL(M));
    Kokkos::View<double*> V("V", TO_GLOBAL(num_matrices)*TO_GLOBAL(ldv)*TO_GLOBAL(N));
    Kokkos::View<double*> S("S", TO_GLOBAL(num_matrices)*TO_GLOBAL(min_mn));
    Kokkos::View<int*> devInfo("device info", num_matrices);

    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("SVD::Setup(Handle)");
    cudaError_t cudaStat1 = cudaSuccess;
    std::vector<cusolverDnHandle_t> cusolver_handles(NUM_STREAMS);
    cusolverStatus_t cusolver_stat = CUSOLVER_STATUS_SUCCESS;
    std::vector<cudaStream_t> streams(NUM_STREAMS);
    gesvdjInfo_t gesvdj_params = NULL;
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("SVD::Setup(Create)");

    for (int i=0; i<NUM_STREAMS; ++i) {
        cusolver_stat = cusolverDnCreate(&cusolver_handles[i]);
        compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "DN Create");
        cudaStat1 = cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
        compadre_assert_release(cudaSuccess == cudaStat1 && "Cuda Stream Create");
        cusolver_stat = cusolverDnSetStream(cusolver_handles[i], streams[i]);
        compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Cuda Set Stream");
    }

    cusolver_stat = cusolverDnCreateGesvdjInfo(&gesvdj_params);
    compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Create GesvdjInfo");

    const double tol = 1.e-14;
    const int max_sweeps = 15;
    const int sort_svd = 1; /* sort singular values */

    cusolver_stat = cusolverDnXgesvdjSetTolerance(gesvdj_params, tol);
    compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Set Tolerance");

    cusolver_stat = cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps);
    compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Set Sweeps");

    cusolver_stat = cusolverDnXgesvdjSetSortEig(gesvdj_params, sort_svd);
    compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Sort SVD");
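    // gesvdj computes the SVD by Jacobi rotations; tol and max_sweeps bound the
    // iteration, and sorting returns singular values in descending order so
    // that S_(0) below is the largest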

    const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */

    Kokkos::Profiling::popRegion();

    if (max_mn <= 32) { // batched version only works on small matrices (https://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-gesvdjbatch)

        Kokkos::Profiling::pushRegion("SVD::Workspace");

        int lwork = 0;

        cusolver_stat = cusolverDnDgesvdjBatched_bufferSize(
            cusolver_handles[0], jobz,
            M, N,
            P, lda,        // P
            S.data(),      // S
            U.data(), ldu, // U
            V.data(), ldv, // V
            &lwork, gesvdj_params, num_matrices
        );
        compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Get Work Size");

        Kokkos::View<double*> work("work for cusolver", lwork);
        cudaStat1 = cudaDeviceSynchronize();
        compadre_assert_release(cudaSuccess == cudaStat1 && "Device Sync 1");
        Kokkos::Profiling::popRegion();

        Kokkos::Profiling::pushRegion("SVD::Execution");
        cusolver_stat = cusolverDnDgesvdjBatched(
            cusolver_handles[0], jobz,
            M, N,
            P, lda,
            S.data(),
            U.data(), ldu,
            V.data(), ldv,
            work.data(), lwork,
            devInfo.data(), gesvdj_params, num_matrices
        );
        cudaStat1 = cudaDeviceSynchronize();
        compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Solver Didn't Succeed");
        compadre_assert_release(cudaSuccess == cudaStat1 && "Device Sync 2");
        Kokkos::Profiling::popRegion();

    } else { // bigger than 32 (batched dgesvdj can only handle 32x32)

        Kokkos::Profiling::pushRegion("SVD::Workspace");

        int lwork = 0;
        int econ = 1;

        //cusolver_stat = cusolverDnDgesvd_bufferSize(
        //    cusolver_handles[0], M, N, &lwork );
        //compadre_assert_release (cusolver_stat == CUSOLVER_STATUS_SUCCESS && "dgesvd Work Size Failed");

        cusolver_stat = cusolverDnDgesvdj_bufferSize(
            cusolver_handles[0], jobz, econ,
            M, N,
            P, lda,        // P
            S.data(),      // S
            U.data(), ldu, // U
            V.data(), ldv, // V
            &lwork, gesvdj_params
        );
        compadre_assert_release(CUSOLVER_STATUS_SUCCESS == cusolver_stat && "Get Work Size");

        Kokkos::View<double*> work("CUDA work space", lwork*num_matrices);
        //Kokkos::View<double*> rwork("CUDA work space", (min_mn-1)*num_matrices);
        cudaDeviceSynchronize();
        Kokkos::Profiling::popRegion();

        //signed char jobu = 'A';
        //signed char jobv = 'A';

        Kokkos::Profiling::pushRegion("SVD::Execution");
        for (int i=0; i<num_matrices; ++i) {
            const int my_stream = i%NUM_STREAMS;

            cusolverDnDgesvdj(
                cusolver_handles[my_stream], jobz, econ,
                M, N,
                P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda), lda,
                S.data() + TO_GLOBAL(i)*TO_GLOBAL(min_mn),
                U.data() + TO_GLOBAL(i)*TO_GLOBAL(ldu)*TO_GLOBAL(M), ldu,
                V.data() + TO_GLOBAL(i)*TO_GLOBAL(ldv)*TO_GLOBAL(N), ldv,
                work.data() + TO_GLOBAL(i)*TO_GLOBAL(lwork), lwork,
                devInfo.data() + TO_GLOBAL(i), gesvdj_params
            );

            //cusolverDnDgesvd (
            //    cusolver_handles[my_stream],
            //    jobu,
            //    jobv,
            //    M,
            //    N,
            //    P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda),
            //    lda,
            //    S.data() + TO_GLOBAL(i)*TO_GLOBAL(min_mn),
            //    U.data() + TO_GLOBAL(i)*TO_GLOBAL(ldu)*TO_GLOBAL(M),
            //    ldu,
            //    V.data() + TO_GLOBAL(i)*TO_GLOBAL(ldv)*TO_GLOBAL(N),
            //    ldv,
            //    work.data() + TO_GLOBAL(i)*TO_GLOBAL(lwork),
            //    lwork,
            //    rwork.data() + TO_GLOBAL(i)*TO_GLOBAL((min_mn-1)),
            //    devInfo.data() + TO_GLOBAL(i) );

        }
        Kokkos::Profiling::popRegion();
    }

    cudaDeviceSynchronize();
    for (int i=0; i<NUM_STREAMS; ++i) {
        cusolverDnDestroy(cusolver_handles[i]);
    }

    Kokkos::Profiling::pushRegion("SVD::S Copy");
    // deep copy neighbor list sizes over to gpu
    Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_neighbor_list_sizes(neighbor_list_sizes + initial_index_of_batch, num_matrices);
    Kokkos::View<int*> d_neighbor_list_sizes("neighbor list sizes on device", num_matrices);
    Kokkos::deep_copy(d_neighbor_list_sizes, h_neighbor_list_sizes);

    int scratch_space_level;
    int scratch_space_size = 0;
    scratch_space_size += scratch_vector_type::shmem_size( min_mn ); // s
    if (swap_layout_P) {
        // Less memory is required if the right hand side matrix is diagonal
        scratch_space_level = 0;
    } else {
        // More memory is required to perform full matrix multiplication
        scratch_space_size += scratch_matrix_left_type::shmem_size(N, M);
        scratch_space_size += scratch_matrix_left_type::shmem_size(N, NRHS);
        scratch_space_level = 1;
    }
    Kokkos::Profiling::popRegion();

    Kokkos::parallel_for(
            team_policy(num_matrices, Kokkos::AUTO)
            .set_scratch_size(scratch_space_level, Kokkos::PerTeam(scratch_space_size)), // shared memory
            KOKKOS_LAMBDA (const member_type& teamMember) {

        const int target_index = teamMember.league_rank();

        // use a custom # of neighbors for each problem, if possible
        const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; // assumes M is some positive integer scaling of max_neighbors
        int my_num_rows = d_neighbor_list_sizes(target_index)*multiplier;

        scratch_vector_type s(teamMember.team_scratch(scratch_space_level), min_mn ); // shared memory

        // data is actually layout left
        scratch_matrix_left_type RHS_(RHS + TO_GLOBAL(target_index)*TO_GLOBAL(ldb)*TO_GLOBAL(NRHS), ldb, NRHS);
        scratch_matrix_left_type U_(U.data() + TO_GLOBAL(target_index)*TO_GLOBAL(ldu)*TO_GLOBAL(M), ldu, M);
        scratch_matrix_left_type V_(V.data() + TO_GLOBAL(target_index)*TO_GLOBAL(ldv)*TO_GLOBAL(N), ldv, N);
        scratch_vector_type S_(S.data() + TO_GLOBAL(target_index)*TO_GLOBAL(min_mn), min_mn);

        // threshold for dropping singular values
        double S0 = S_(0);
        double eps = 1e-14;
        double abs_threshold = eps*S0;
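        // singular values at or below eps*S_(0) are treated as zero when
        // inverting, the usual relative cutoff for a pseudoinverse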

        // s takes on the role of S inverse
        Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, min_mn), [=] (const int i) {
            s(i) = ( std::abs(S_(i)) > abs_threshold ) ? 1./S_(i) : 0;
        });
        teamMember.team_barrier();

        if (swap_layout_P) {
            // When solving PsqrtW against sqrtW, since there are two
            // diagonal matrices (s and the right hand side), the matrix
            // multiplication can be written more efficiently
            for (int i=0; i<my_num_rows; ++i) {
                double sqrt_w_i_m = RHS_(i,i);
                Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,N),
                        [=] (const int j) {

                    double bdataj = 0;
                    for (int k=0; k<min_mn; ++k) {
                        bdataj += V_(j,k)*s(k)*U_(i,k);
                    }
                    // RHS_ is where we can find std::sqrt(w)
                    bdataj *= sqrt_w_i_m;
                    RHS_(j,i) = bdataj;
                });
                teamMember.team_barrier();
            }
        } else {
            // Otherwise, you need to perform a full matrix multiplication
            scratch_matrix_left_type temp_VSU_left_matrix(teamMember.team_scratch(scratch_space_level), N, M);
            scratch_matrix_left_type temp_x_left_matrix(teamMember.team_scratch(scratch_space_level), N, NRHS);
            // Multiply V s U^T and store into temp_VSU_left_matrix
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
                    [=] (const int i) {
                for (int j=0; j<M; j++) {
                    double temp = 0.0;
                    for (int k=0; k<min_mn; k++) {
                        temp += V_(i,k)*s(k)*U_(j,k);
                    }
                    temp_VSU_left_matrix(i, j) = temp;
                }
            });
            teamMember.team_barrier();

            // Multiply temp_VSU_left_matrix with RHS_ and store in temp_x_left_matrix
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
                    [=] (const int i) {
                for (int j=0; j<NRHS; j++) {
                    double temp = 0.0;
                    for (int k=0; k<M; k++) {
                        temp += temp_VSU_left_matrix(i, k)*RHS_(k, j);
                    }
                    temp_x_left_matrix(i, j) = temp;
                }
            });
            teamMember.team_barrier();
            // Copy the matrix back to RHS
            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
                    [=] (const int i) {
                for (int j=0; j<NRHS; j++) {
                    RHS_(i, j) = temp_x_left_matrix(i, j);
                }
            });
        }
    }, "SVD: USV*RHS Multiply");

#elif defined(COMPADRE_USE_LAPACK)

    // a later improvement could be to send in an optional view with the neighbor list size for each target to reduce work

    Kokkos::Profiling::pushRegion("SVD::Setup(Create)");
    double rcond = 1e-14; // rcond*S(0) is the cutoff for small singular values in dgelsd

    const int smlsiz = 25;
    const int nlvl = std::max(0, int( std::log2( std::min(M,N) / (smlsiz+1) ) + 1));
    const int liwork = 3*std::min(M,N)*nlvl + 11*std::min(M,N);
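    // smlsiz, nlvl, and liwork follow the integer-workspace formulas given in
    // LAPACK's dgelsd documentation for its divide-and-conquer algorithm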

    std::vector<int> iwork(liwork);
    std::vector<double> s(std::max(M,N));

    int lwork = -1;
    double wkopt = 0;
    int rank = 0;
    int info = 0;

    if (num_matrices > 0) {
        dgelsd_( &M, &N, &NRHS,
                 P, &lda,
                 RHS, &ldb,
                 s.data(), &rcond, &rank,
                 &wkopt, &lwork, iwork.data(), &info);
    }
    lwork = (int)wkopt;

    Kokkos::Profiling::popRegion();

    #ifdef LAPACK_DECLARED_THREADSAFE

    int scratch_space_size = 0;
    scratch_space_size += scratch_vector_type::shmem_size( lwork ); // work space
    scratch_space_size += scratch_vector_type::shmem_size( std::max(M,N) ); // s
    scratch_space_size += scratch_local_index_type::shmem_size( liwork ); // iwork space

    Kokkos::parallel_for(
            team_policy(num_matrices, Kokkos::AUTO)
            .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
            KOKKOS_LAMBDA (const member_type& teamMember) {

        scratch_vector_type scratch_work(teamMember.team_scratch(0), lwork);
        scratch_vector_type scratch_s(teamMember.team_scratch(0), std::max(M,N) );
        scratch_local_index_type scratch_iwork(teamMember.team_scratch(0), liwork);

        int i_rank = 0;
        int i_info = 0;

        const int i = teamMember.league_rank();

        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        // use a custom # of neighbors for each problem, if possible
        const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; // assumes M is some positive integer scaling of max_neighbors
        int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
        int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;

        Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
            dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
                     p_offset, const_cast<int*>(&lda),
                     rhs_offset, const_cast<int*>(&ldb),
                     scratch_s.data(), const_cast<double*>(&rcond), &i_rank,
                     scratch_work.data(), const_cast<int*>(&lwork), scratch_iwork.data(), &i_info);
        });

        compadre_assert_release(i_info==0 && "dgelsd failed");

    }, "SVD Execution");

    #else

    std::vector<double> scratch_work(lwork);
    std::vector<double> scratch_s(std::max(M,N));
    std::vector<int> scratch_iwork(liwork);

    for (int i=0; i<num_matrices; ++i) {

        int i_rank = 0;
        int i_info = 0;

        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        // use a custom # of neighbors for each problem, if possible
        const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; // assumes M is some positive integer scaling of max_neighbors
        int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
        int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;

        dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
                 p_offset, const_cast<int*>(&lda),
                 rhs_offset, const_cast<int*>(&ldb),
                 scratch_s.data(), const_cast<double*>(&rcond), &i_rank,
                 scratch_work.data(), const_cast<int*>(&lwork), scratch_iwork.data(), &i_info);

        compadre_assert_release(i_info==0 && "dgelsd failed");

    }

    #endif // LAPACK is not threadsafe

#endif

    // Results are written layout left, so they need to be converted to layout right
    if (swap_layout_RHS) {
        ConvertLayoutLeftToRight clr(pm, ldb, ndb, RHS);
        int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
        pm.clearScratchSizes();
        pm.setTeamScratchSize(1, scratch_size);
        pm.CallFunctorWithTeamThreads(num_matrices, clr);
    }
    Kokkos::fence();

}

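// batchLUFactorize solves a batch of square systems P x = RHS via LU with
// partial pivoting: cublasDgetrfBatched + cublasDgetrsBatched on CUDA builds,
// and LAPACK's dgetrf + dgetrs otherwise; LU is the cheapest of the three
// factorizations used in this file, but it requires P to be square and
// nonsingular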
void batchLUFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int * neighbor_list_sizes) {

    // P was constructed layout right, while LAPACK and CUDA expect layout left
    // P is square and symmetric, so no layout conversion is necessary
    // RHS is not square and not symmetric. However, the implicit cast to layout
    // left is equivalent to transposing the matrix while staying consistent
    // with layout left. Essentially, two operations for free.
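    // (reading a row-major M x N block as column-major yields its N x M
    // transpose without moving any data)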

#ifdef COMPADRE_USE_CUDA

    Kokkos::Profiling::pushRegion("LU::Setup(Pointers)");
    Kokkos::View<size_t*> array_P_RHS("P and RHS matrix pointers on device", 2*num_matrices);
    Kokkos::View<int*> ipiv_device("ipiv space on device", num_matrices*M);

    // get pointers to device data
    Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(const int i) {
        array_P_RHS(i) = reinterpret_cast<size_t>(P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda));
        array_P_RHS(i + num_matrices) = reinterpret_cast<size_t>(RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb));
    });

    Kokkos::View<int*> devInfo("devInfo", num_matrices);
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("LU::Setup(Handle)");
    // Create cublas instance
    cublasHandle_t cublas_handle;
    cublasStatus_t cublas_stat;
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    Kokkos::Profiling::pushRegion("LU::Setup(Create)");
    cublasCreate(&cublas_handle);
    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();

    int info = 0;

    Kokkos::Profiling::pushRegion("LU::Execution");

    // call batched LU factorization
    cublas_stat=cublasDgetrfBatched(cublas_handle,
                                    M,
                                    reinterpret_cast<double**>(array_P_RHS.data()), lda,
                                    reinterpret_cast<int*>(ipiv_device.data()),
                                    devInfo.data(), num_matrices );
    compadre_assert_release(cublas_stat==CUBLAS_STATUS_SUCCESS && "cublasDgetrfBatched failed");

    // apply the LU factors to the right-hand sides (forward/back substitution)
    cublas_stat=cublasDgetrsBatched(cublas_handle, CUBLAS_OP_N,
                                    M, NRHS,
                                    reinterpret_cast<const double**>(array_P_RHS.data()), lda,
                                    reinterpret_cast<int*>(ipiv_device.data()),
                                    reinterpret_cast<double**>(array_P_RHS.data() + TO_GLOBAL(num_matrices)), ldb,
                                    &info, num_matrices );
    compadre_assert_release(cublas_stat==CUBLAS_STATUS_SUCCESS && "cublasDgetrsBatched failed");

    cudaDeviceSynchronize();
    Kokkos::Profiling::popRegion();


#elif defined(COMPADRE_USE_LAPACK)

//    Kokkos::View<double***> AA("AA", num_matrices, M, N); /// element matrices
//    Kokkos::View<double**>  BB("BB", num_matrices, N, N); /// load vector and would be overwritten by a solution
//
//    using namespace KokkosBatched;
//#if 0 // range policy
//    Kokkos::parallel_for(N, KOKKOS_LAMBDA(const int i) {
//        auto A = Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL()); /// ith matrix
//        auto B = Kokkos::subview(BB, i, Kokkos::ALL()); /// ith load/solution vector
//
//        SerialLU<Algo::LU::Unblocked>
//            ::invoke(A);
//        SerialTrsv<Uplo::Lower,Trans::NoTranspose,Diag::Unit,Algo::Trsv::Unblocked>
//            ::invoke(1, A, B);
//    });
//#endif
//
//#if 1 // team policy
//    {
//        typedef Kokkos::TeamPolicy<device_execution_space> team_policy_type;
//        typedef typename team_policy_type::member_type member_type;
//        const int team_size = 32, vector_size = 1;
//        team_policy_type policy(num_matrices, team_size, vector_size);
//        Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const member_type &member) {
//            const int i = member.league_rank();
//            auto A = Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL()); /// ith matrix
//            auto B = Kokkos::subview(BB, i, Kokkos::ALL()); /// ith load/solution vector
//
//            TeamLU<member_type,Algo::LU::Unblocked>
//                ::invoke(member, A);
//            TeamTrsv<member_type,Uplo::Lower,Trans::NoTranspose,Diag::Unit,Algo::Trsv::Unblocked>
//                ::invoke(member, 1, A, B);
//        });
//    }
//#endif
//
//#endif

    // a later improvement could be to send in an optional view with the neighbor list size for each target to reduce work

    Kokkos::Profiling::pushRegion("LU::Setup(Create)");
    Kokkos::Profiling::popRegion();

    std::string transpose_or_no = "N";

    #ifdef LAPACK_DECLARED_THREADSAFE

    int scratch_space_size = 0;
    scratch_space_size += scratch_local_index_type::shmem_size( std::min(M, N) ); // ipiv

    Kokkos::parallel_for(
            team_policy(num_matrices, Kokkos::AUTO)
            .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
            KOKKOS_LAMBDA (const member_type& teamMember) {

        scratch_local_index_type scratch_ipiv(teamMember.team_scratch(0), std::min(M, N));
        int i_info = 0;

        const int i = teamMember.league_rank();
        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
            dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
                     p_offset, const_cast<int*>(&lda),
                     scratch_ipiv.data(),
                     &i_info);
        });
        teamMember.team_barrier();

        compadre_assert_release(i_info==0 && "dgetrf failed");

        Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
            dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
                     const_cast<int*>(&N), const_cast<int*>(&NRHS),
                     p_offset, const_cast<int*>(&lda),
                     scratch_ipiv.data(),
                     rhs_offset, const_cast<int*>(&ldb),
                     &i_info);
        });

        compadre_assert_release(i_info==0 && "dgetrs failed");

    }, "LU Execution");

    #else

    Kokkos::View<int*> scratch_ipiv("scratch_ipiv", std::min(M, N));

    for (int i=0; i<num_matrices; ++i) {

        int i_info = 0;

        double * p_offset = P + TO_GLOBAL(i)*TO_GLOBAL(lda)*TO_GLOBAL(nda);
        double * rhs_offset = RHS + TO_GLOBAL(i)*TO_GLOBAL(ldb)*TO_GLOBAL(ndb);

        dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
                 p_offset, const_cast<int*>(&lda),
                 scratch_ipiv.data(),
                 &i_info);

        compadre_assert_release(i_info==0 && "dgetrf failed");

        dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
                 const_cast<int*>(&N), const_cast<int*>(&NRHS),
                 p_offset, const_cast<int*>(&lda),
                 scratch_ipiv.data(),
                 rhs_offset, const_cast<int*>(&ldb),
                 &i_info);

        compadre_assert_release(i_info==0 && "dgetrs failed");

    }

    #endif // LAPACK is not threadsafe

#endif

    // Results are written layout left, so they need to be converted to layout right
    ConvertLayoutLeftToRight clr(pm, ldb, ndb, RHS);
    int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
    pm.clearScratchSizes();
    pm.setTeamScratchSize(1, scratch_size);
    pm.CallFunctorWithTeamThreads(num_matrices, clr);
    Kokkos::fence();

}

} // GMLS_LinearAlgebra
} // Compadre