12 namespace GMLS_LinearAlgebra {
14 void batchQRFactorize(
ParallelManager pm,
double *P,
int lda,
int nda,
double *RHS,
int ldb,
int ndb,
int M,
int N,
int NRHS,
const int num_matrices,
const size_t max_neighbors,
const int initial_index_of_batch,
int * neighbor_list_sizes) {
20 int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
26 #ifdef COMPADRE_USE_CUDA
28 Kokkos::Profiling::pushRegion(
"QR::Setup(Pointers)");
29 Kokkos::View<size_t*> array_P_RHS(
"P and RHS matrix pointers on device", 2*num_matrices);
32 Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(
const int i) {
37 Kokkos::View<int*> devInfo(
"devInfo", num_matrices);
38 cudaDeviceSynchronize();
39 Kokkos::Profiling::popRegion();
41 Kokkos::Profiling::pushRegion(
"QR::Setup(Handle)");
43 cublasHandle_t cublas_handle;
44 cublasStatus_t cublas_stat;
45 cudaDeviceSynchronize();
46 Kokkos::Profiling::popRegion();
48 Kokkos::Profiling::pushRegion(
"QR::Setup(Create)");
49 cublasCreate(&cublas_handle);
50 cudaDeviceSynchronize();
51 Kokkos::Profiling::popRegion();
53 Kokkos::Profiling::pushRegion(
"QR::Execution");
57 cublas_stat=cublasDgelsBatched(cublas_handle, CUBLAS_OP_N,
59 reinterpret_cast<double**>(array_P_RHS.data()), lda,
60 reinterpret_cast<double**>(array_P_RHS.data() +
TO_GLOBAL(num_matrices)), ldb,
61 &info, devInfo.data(), num_matrices );
64 cudaDeviceSynchronize();
65 Kokkos::Profiling::popRegion();
68 #elif defined(COMPADRE_USE_LAPACK)
72 Kokkos::Profiling::pushRegion(
"QR::Setup(Create)");
75 int lwork = -1, info = 0;
double wkopt = 0;
77 if (num_matrices > 0) {
78 dgels_( (
char *)
"N", &M, &N, &NRHS,
81 &wkopt, &lwork, &info );
87 Kokkos::Profiling::popRegion();
89 std::string transpose_or_no =
"N";
91 #ifdef LAPACK_DECLARED_THREADSAFE
93 int scratch_space_size = scratch_vector_type::shmem_size( lwork );
97 .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
104 const int i = teamMember.league_rank();
110 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1;
111 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
112 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
114 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
115 dgels_( const_cast<char *>(transpose_or_no.c_str()),
116 const_cast<int*>(&my_num_rows),
const_cast<int*
>(&N), const_cast<int*>(&my_num_rhs),
117 p_offset,
const_cast<int*
>(&lda),
118 rhs_offset, const_cast<int*>(&ldb),
119 scratch_work.data(),
const_cast<int*
>(&lwork), &i_info);
128 Kokkos::View<double*> scratch_work(
"scratch_work", lwork);
129 for (
int i=0; i<num_matrices; ++i) {
137 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1;
138 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
139 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
141 dgels_( const_cast<char *>(transpose_or_no.c_str()),
142 const_cast<int*>(&my_num_rows),
const_cast<int*
>(&N), const_cast<int*>(&my_num_rhs),
143 p_offset,
const_cast<int*
>(&lda),
144 rhs_offset, const_cast<int*>(&ldb),
145 scratch_work.data(),
const_cast<int*
>(&lwork), &i_info);
150 #endif // LAPACK is not threadsafe
156 scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
164 void batchSVDFactorize(
ParallelManager pm,
bool swap_layout_P,
double *P,
int lda,
int nda,
bool swap_layout_RHS,
double *RHS,
int ldb,
int ndb,
int M,
int N,
int NRHS,
const int num_matrices,
const size_t max_neighbors,
const int initial_index_of_batch,
int * neighbor_list_sizes) {
171 int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
178 #ifdef COMPADRE_USE_CUDA
180 const int NUM_STREAMS = 64;
182 Kokkos::Profiling::pushRegion(
"SVD::Setup(Allocate)");
186 int min_mn = (M<N) ? M : N;
187 int max_mn = (M>N) ? M : N;
192 Kokkos::View<int*> devInfo(
"device info", num_matrices);
194 Kokkos::Profiling::popRegion();
196 Kokkos::Profiling::pushRegion(
"SVD::Setup(Handle)");
197 cudaError_t cudaStat1 = cudaSuccess;
198 std::vector<cusolverDnHandle_t> cusolver_handles(NUM_STREAMS);
199 cusolverStatus_t cusolver_stat = CUSOLVER_STATUS_SUCCESS;
200 std::vector<cudaStream_t> streams(NUM_STREAMS);
201 gesvdjInfo_t gesvdj_params = NULL;
202 Kokkos::Profiling::popRegion();
204 Kokkos::Profiling::pushRegion(
"SVD::Setup(Create)");
206 for (
int i=0; i<NUM_STREAMS; ++i) {
207 cusolver_stat = cusolverDnCreate(&cusolver_handles[i]);
209 cudaStat1 = cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
211 cusolver_stat = cusolverDnSetStream(cusolver_handles[i], streams[i]);
216 cusolver_stat = cusolverDnCreateGesvdjInfo(&gesvdj_params);
219 const double tol = 1.e-14;
220 const int max_sweeps = 15;
221 const int sort_svd = 1;
223 cusolver_stat = cusolverDnXgesvdjSetTolerance(gesvdj_params, tol);
226 cusolver_stat = cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps);
229 cusolver_stat = cusolverDnXgesvdjSetSortEig(gesvdj_params, sort_svd);
232 const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
234 Kokkos::Profiling::popRegion();
238 Kokkos::Profiling::pushRegion(
"SVD::Workspace");
242 cusolver_stat = cusolverDnDgesvdjBatched_bufferSize(
243 cusolver_handles[0], jobz,
249 &lwork, gesvdj_params, num_matrices
253 Kokkos::View<double*> work(
"work for cusolver", lwork);
254 cudaStat1 = cudaDeviceSynchronize();
256 Kokkos::Profiling::popRegion();
258 Kokkos::Profiling::pushRegion(
"SVD::Execution");
259 cusolver_stat = cusolverDnDgesvdjBatched(
260 cusolver_handles[0], jobz,
267 devInfo.data(), gesvdj_params, num_matrices
269 cudaStat1 = cudaDeviceSynchronize();
272 Kokkos::Profiling::popRegion();
277 Kokkos::Profiling::pushRegion(
"SVD::Workspace");
286 cusolver_stat = cusolverDnDgesvdj_bufferSize(
287 cusolver_handles[0], jobz, econ,
293 &lwork, gesvdj_params
297 Kokkos::View<double*> work(
"CUDA work space", lwork*num_matrices);
299 cudaDeviceSynchronize();
300 Kokkos::Profiling::popRegion();
305 Kokkos::Profiling::pushRegion(
"SVD::Execution");
306 for (
int i=0; i<num_matrices; ++i) {
307 const int my_stream = i%NUM_STREAMS;
310 cusolver_handles[my_stream], jobz, econ,
317 devInfo.data() +
TO_GLOBAL(i), gesvdj_params
339 Kokkos::Profiling::popRegion();
342 cudaDeviceSynchronize();
343 for (
int i=0; i<NUM_STREAMS; ++i) {
344 cusolverDnDestroy(cusolver_handles[i]);
348 Kokkos::Profiling::pushRegion(
"SVD::S Copy");
350 Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_neighbor_list_sizes(neighbor_list_sizes + initial_index_of_batch, num_matrices);
351 Kokkos::View<int*> d_neighbor_list_sizes(
"neighbor list sizes on device", num_matrices);
352 Kokkos::deep_copy(d_neighbor_list_sizes, h_neighbor_list_sizes);
354 int scratch_space_level;
355 int scratch_space_size = 0;
356 scratch_space_size += scratch_vector_type::shmem_size( min_mn );
359 scratch_space_level = 0;
362 scratch_space_size += scratch_matrix_left_type::shmem_size(N, M);
363 scratch_space_size += scratch_matrix_left_type::shmem_size(N, NRHS);
364 scratch_space_level = 1;
366 Kokkos::Profiling::popRegion();
368 Kokkos::parallel_for(
370 .set_scratch_size(scratch_space_level, Kokkos::PerTeam(scratch_space_size)),
373 const int target_index = teamMember.league_rank();
376 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1;
377 int my_num_rows = d_neighbor_list_sizes(target_index)*multiplier;
395 double abs_threshold = eps*S0;
398 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, min_mn), [=] (
const int i) {
399 s(i) = ( std::abs(S_(i)) > abs_threshold ) ? 1./S_(i) : 0;
401 teamMember.team_barrier();
407 for (
int i=0; i<my_num_rows; ++i) {
408 double sqrt_w_i_m = RHS_(i,i);
409 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,N),
413 for (
int k=0; k<min_mn; ++k) {
414 bdataj += V_(j,k)*s(k)*U_(i,k);
417 bdataj *= sqrt_w_i_m;
420 teamMember.team_barrier();
427 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
429 for (
int j=0; j<M; j++) {
431 for (
int k=0; k<min_mn; k++) {
432 temp += V_(i,k)*s(k)*U_(j,k);
434 temp_VSU_left_matrix(i, j) = temp;
437 teamMember.team_barrier();
440 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
442 for (
int j=0; j<NRHS; j++) {
444 for (
int k=0; k<M; k++) {
445 temp += temp_VSU_left_matrix(i, k)*RHS_(k, j);
447 temp_x_left_matrix(i, j) = temp;
450 teamMember.team_barrier();
452 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
454 for (
int j=0; j<NRHS; j++) {
455 RHS_(i, j) = temp_x_left_matrix(i, j);
459 },
"SVD: USV*RHS Multiply");
465 #elif defined(COMPADRE_USE_LAPACK)
469 Kokkos::Profiling::pushRegion(
"SVD::Setup(Create)");
470 double rcond = 1e-14;
472 const int smlsiz = 25;
473 const int nlvl = std::max(0,
int( std::log2( std::min(M,N) / (smlsiz+1) ) + 1));
474 const int liwork = 3*std::min(M,N)*nlvl + 11*std::min(M,N);
476 std::vector<int> iwork(liwork);
477 std::vector<double> s(std::max(M,N));
484 if (num_matrices > 0) {
485 dgelsd_( &M, &N, &NRHS,
488 s.data(), &rcond, &rank,
489 &wkopt, &lwork, iwork.data(), &info);
493 Kokkos::Profiling::popRegion();
495 #ifdef LAPACK_DECLARED_THREADSAFE
497 int scratch_space_size = 0;
498 scratch_space_size += scratch_vector_type::shmem_size( lwork );
499 scratch_space_size += scratch_vector_type::shmem_size( std::max(M,N) );
500 scratch_space_size += scratch_local_index_type::shmem_size( liwork );
502 Kokkos::parallel_for(
504 .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
514 const int i = teamMember.league_rank();
520 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1;
521 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
522 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
524 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
525 dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
526 p_offset, const_cast<int*>(&lda),
527 rhs_offset, const_cast<int*>(&ldb),
528 scratch_s.data(),
const_cast<double*
>(&rcond), &i_rank,
529 scratch_work.data(),
const_cast<int*
>(&lwork), scratch_iwork.data(), &i_info);
538 std::vector<double> scratch_work(lwork);
539 std::vector<double> scratch_s(std::max(M,N));
540 std::vector<int> scratch_iwork(liwork);
542 for (
int i=0; i<num_matrices; ++i) {
551 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1;
552 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
553 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
555 dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
556 p_offset, const_cast<int*>(&lda),
557 rhs_offset, const_cast<int*>(&ldb),
558 scratch_s.data(),
const_cast<double*
>(&rcond), &i_rank,
559 scratch_work.data(),
const_cast<int*
>(&lwork), scratch_iwork.data(), &i_info);
565 #endif // LAPACK is not threadsafe
570 if (swap_layout_RHS) {
572 int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
581 void batchLUFactorize(
ParallelManager pm,
double *P,
int lda,
int nda,
double *RHS,
int ldb,
int ndb,
int M,
int N,
int NRHS,
const int num_matrices,
const size_t max_neighbors,
const int initial_index_of_batch,
int * neighbor_list_sizes) {
589 #ifdef COMPADRE_USE_CUDA
591 Kokkos::Profiling::pushRegion(
"LU::Setup(Pointers)");
592 Kokkos::View<size_t*> array_P_RHS(
"P and RHS matrix pointers on device", 2*num_matrices);
593 Kokkos::View<int*> ipiv_device(
"ipiv space on device", num_matrices*M);
596 Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(
const int i) {
601 Kokkos::View<int*> devInfo(
"devInfo", num_matrices);
602 cudaDeviceSynchronize();
603 Kokkos::Profiling::popRegion();
605 Kokkos::Profiling::pushRegion(
"LU::Setup(Handle)");
607 cublasHandle_t cublas_handle;
608 cublasStatus_t cublas_stat;
609 cudaDeviceSynchronize();
610 Kokkos::Profiling::popRegion();
612 Kokkos::Profiling::pushRegion(
"LU::Setup(Create)");
613 cublasCreate(&cublas_handle);
614 cudaDeviceSynchronize();
615 Kokkos::Profiling::popRegion();
619 Kokkos::Profiling::pushRegion(
"LU::Execution");
622 cublas_stat=cublasDgetrfBatched(cublas_handle,
624 reinterpret_cast<double**>(array_P_RHS.data()), lda,
625 reinterpret_cast<int*>(ipiv_device.data()),
626 devInfo.data(), num_matrices );
630 cublas_stat=cublasDgetrsBatched(cublas_handle, CUBLAS_OP_N,
632 reinterpret_cast<const double**>(array_P_RHS.data()), lda,
633 reinterpret_cast<int*>(ipiv_device.data()),
634 reinterpret_cast<double**>(array_P_RHS.data() +
TO_GLOBAL(num_matrices)), ldb,
635 &info, num_matrices );
638 cudaDeviceSynchronize();
639 Kokkos::Profiling::popRegion();
642 #elif defined(COMPADRE_USE_LAPACK)
683 Kokkos::Profiling::pushRegion(
"LU::Setup(Create)");
684 Kokkos::Profiling::popRegion();
686 std::string transpose_or_no =
"N";
688 #ifdef LAPACK_DECLARED_THREADSAFE
689 int scratch_space_size = 0;
690 scratch_space_size += scratch_local_index_type::shmem_size( std::min(M, N) );
692 Kokkos::parallel_for(
694 .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
700 const int i = teamMember.league_rank();
704 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
705 dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
706 p_offset, const_cast<int*>(&lda),
710 teamMember.team_barrier();
714 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
715 dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
716 const_cast<int*>(&N),
const_cast<int*
>(&NRHS),
717 p_offset, const_cast<int*>(&lda),
719 rhs_offset,
const_cast<int*
>(&ldb),
729 Kokkos::View<int*> scratch_ipiv(
"scratch_ipiv", std::min(M, N));
731 for (
int i=0; i<num_matrices; ++i) {
738 dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
739 p_offset, const_cast<int*>(&lda),
743 dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
744 const_cast<int*>(&N),
const_cast<int*
>(&NRHS),
745 p_offset, const_cast<int*>(&lda),
747 rhs_offset,
const_cast<int*
>(&ldb),
754 #endif // LAPACK is not threadsafe
760 int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
team_policy::member_type member_type
void batchLUFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of LU problems.
Kokkos::TeamPolicy< device_execution_space > team_policy
#define TO_GLOBAL(variable)
Kokkos::View< double *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_vector_type
Kokkos::View< double **, layout_left, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_matrix_left_type
void CallFunctorWithTeamThreads(const global_index_type batch_size, C functor) const
Calls a parallel_for parallel_for will break out over loops over teams with each thread executing cod...
void setTeamScratchSize(const int level, const int value)
void batchSVDFactorize(ParallelManager pm, bool swap_layout_P, double *P, int lda, int nda, bool swap_layout_RHS, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of SVD problems.
void batchQRFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of QR problems.
#define compadre_assert_release(condition)
compadre_assert_release is used for assertions that should always be checked, but generally are not e...
Kokkos::View< int *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_local_index_type