12 namespace GMLS_LinearAlgebra {
 
   14 void batchQRFactorize(
ParallelManager pm, 
double *P, 
int lda, 
int nda, 
double *RHS, 
int ldb, 
int ndb, 
int M, 
int N, 
int NRHS, 
const int num_matrices, 
const size_t max_neighbors, 
const int initial_index_of_batch, 
int * neighbor_list_sizes) {
 
   20     int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
 
   26 #ifdef COMPADRE_USE_CUDA 
   28     Kokkos::Profiling::pushRegion(
"QR::Setup(Pointers)");
 
   29     Kokkos::View<size_t*> array_P_RHS(
"P and RHS matrix pointers on device", 2*num_matrices);
 
   32     Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(
const int i) {
 
   37     Kokkos::View<int*> devInfo(
"devInfo", num_matrices);
 
   38     cudaDeviceSynchronize();
 
   39     Kokkos::Profiling::popRegion();
 
   41     Kokkos::Profiling::pushRegion(
"QR::Setup(Handle)");
 
   43     cublasHandle_t cublas_handle;
 
   44     cublasStatus_t cublas_stat;
 
   45     cudaDeviceSynchronize();
 
   46     Kokkos::Profiling::popRegion();
 
   48     Kokkos::Profiling::pushRegion(
"QR::Setup(Create)");
 
   49     cublasCreate(&cublas_handle);
 
   50     cudaDeviceSynchronize();
 
   51     Kokkos::Profiling::popRegion();
 
   53     Kokkos::Profiling::pushRegion(
"QR::Execution");
 
   57     cublas_stat=cublasDgelsBatched(cublas_handle, CUBLAS_OP_N, 
 
   59                                    reinterpret_cast<double**>(array_P_RHS.data()), lda,
 
   60                                    reinterpret_cast<double**>(array_P_RHS.data() + 
TO_GLOBAL(num_matrices)), ldb,
 
   61                                    &info, devInfo.data(), num_matrices );
 
   64     cudaDeviceSynchronize();
 
   65     Kokkos::Profiling::popRegion();
 
   68 #elif defined(COMPADRE_USE_LAPACK) 
   72     Kokkos::Profiling::pushRegion(
"QR::Setup(Create)");
 
   75     int lwork = -1, info = 0; 
double wkopt = 0;
 
   77     if (num_matrices > 0) {
 
   78         dgels_( (
char *)
"N", &M, &N, &NRHS, 
 
   81                 &wkopt, &lwork, &info );
 
   87     Kokkos::Profiling::popRegion();
 
   89     std::string transpose_or_no = 
"N";
 
   91     #ifdef LAPACK_DECLARED_THREADSAFE 
   93         int scratch_space_size = scratch_vector_type::shmem_size( lwork );  
 
   97             .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
 
  104             const int i = teamMember.league_rank();
 
  110             const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; 
 
  111             int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
 
  112             int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
 
  114             Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
 
  115             dgels_( const_cast<char *>(transpose_or_no.c_str()), 
 
  116                     const_cast<int*>(&my_num_rows), 
const_cast<int*
>(&N), const_cast<int*>(&my_num_rhs),
 
  117                     p_offset, 
const_cast<int*
>(&lda),
 
  118                     rhs_offset, const_cast<int*>(&ldb),
 
  119                     scratch_work.data(), 
const_cast<int*
>(&lwork), &i_info);
 
  128         Kokkos::View<double*> scratch_work(
"scratch_work", lwork);
 
  129         for (
int i=0; i<num_matrices; ++i) {
 
  137             const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; 
 
  138             int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
 
  139             int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
 
  141             dgels_( const_cast<char *>(transpose_or_no.c_str()), 
 
  142                     const_cast<int*>(&my_num_rows), 
const_cast<int*
>(&N), const_cast<int*>(&my_num_rhs),
 
  143                     p_offset, 
const_cast<int*
>(&lda),
 
  144                     rhs_offset, const_cast<int*>(&ldb),
 
  145                     scratch_work.data(), 
const_cast<int*
>(&lwork), &i_info);
 
  150     #endif // LAPACK is not threadsafe 
  156     scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
 
  164 void batchSVDFactorize(
ParallelManager pm, 
bool swap_layout_P, 
double *P, 
int lda, 
int nda, 
bool swap_layout_RHS, 
double *RHS, 
int ldb, 
int ndb, 
int M, 
int N, 
int NRHS, 
const int num_matrices, 
const size_t max_neighbors, 
const int initial_index_of_batch, 
int * neighbor_list_sizes) {
 
  171         int scratch_size = scratch_matrix_left_type::shmem_size(lda, nda);
 
  178 #ifdef COMPADRE_USE_CUDA 
  180     const int NUM_STREAMS = 64;
 
  182     Kokkos::Profiling::pushRegion(
"SVD::Setup(Allocate)");
 
  186       int min_mn = (M<N) ? M : N;
 
  187       int max_mn = (M>N) ? M : N;
 
  192       Kokkos::View<int*> devInfo(
"device info", num_matrices);
 
  194     Kokkos::Profiling::popRegion();
 
  196     Kokkos::Profiling::pushRegion(
"SVD::Setup(Handle)");
 
  197       cudaError_t cudaStat1 = cudaSuccess;
 
  198       std::vector<cusolverDnHandle_t> cusolver_handles(NUM_STREAMS);
 
  199       cusolverStatus_t cusolver_stat = CUSOLVER_STATUS_SUCCESS;
 
  200       std::vector<cudaStream_t> streams(NUM_STREAMS);
 
  201       gesvdjInfo_t gesvdj_params = NULL;
 
  202     Kokkos::Profiling::popRegion();
 
  204     Kokkos::Profiling::pushRegion(
"SVD::Setup(Create)");
 
  206       for (
int i=0; i<NUM_STREAMS; ++i) {
 
  207           cusolver_stat = cusolverDnCreate(&cusolver_handles[i]);
 
  209           cudaStat1 = cudaStreamCreateWithFlags(&streams[i], cudaStreamNonBlocking);
 
  211           cusolver_stat = cusolverDnSetStream(cusolver_handles[i], streams[i]);
 
  216       cusolver_stat = cusolverDnCreateGesvdjInfo(&gesvdj_params);
 
  219       const double tol = 1.e-14;
 
  220       const int max_sweeps = 15;
 
  221       const int sort_svd  = 1;   
 
  223       cusolver_stat = cusolverDnXgesvdjSetTolerance(gesvdj_params, tol);
 
  226       cusolver_stat = cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps);
 
  229       cusolver_stat = cusolverDnXgesvdjSetSortEig(gesvdj_params, sort_svd);
 
  232       const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; 
 
  234     Kokkos::Profiling::popRegion();
 
  238         Kokkos::Profiling::pushRegion(
"SVD::Workspace");
 
  242           cusolver_stat = cusolverDnDgesvdjBatched_bufferSize(
 
  243               cusolver_handles[0], jobz,
 
  249               &lwork, gesvdj_params, num_matrices
 
  253           Kokkos::View<double*> work(
"work for cusolver", lwork);
 
  254           cudaStat1 = cudaDeviceSynchronize();
 
  256         Kokkos::Profiling::popRegion();
 
  258         Kokkos::Profiling::pushRegion(
"SVD::Execution");
 
  259           cusolver_stat = cusolverDnDgesvdjBatched(
 
  260               cusolver_handles[0], jobz,
 
  267               devInfo.data(), gesvdj_params, num_matrices
 
  269           cudaStat1 = cudaDeviceSynchronize();
 
  272         Kokkos::Profiling::popRegion();
 
  277         Kokkos::Profiling::pushRegion(
"SVD::Workspace");
 
  286           cusolver_stat = cusolverDnDgesvdj_bufferSize(
 
  287               cusolver_handles[0], jobz, econ,
 
  293               &lwork, gesvdj_params
 
  297           Kokkos::View<double*> work(
"CUDA work space", lwork*num_matrices);
 
  299           cudaDeviceSynchronize();
 
  300         Kokkos::Profiling::popRegion();
 
  305         Kokkos::Profiling::pushRegion(
"SVD::Execution");
 
  306           for (
int i=0; i<num_matrices; ++i) {
 
  307               const int my_stream = i%NUM_STREAMS;
 
  310                   cusolver_handles[my_stream], jobz, econ,
 
  317                   devInfo.data() + 
TO_GLOBAL(i), gesvdj_params
 
  339         Kokkos::Profiling::popRegion();
 
  342     cudaDeviceSynchronize();
 
  343     for (
int i=0; i<NUM_STREAMS; ++i) {
 
  344         cusolverDnDestroy(cusolver_handles[i]);
 
  348     Kokkos::Profiling::pushRegion(
"SVD::S Copy");
 
  350     Kokkos::View<int*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_neighbor_list_sizes(neighbor_list_sizes + initial_index_of_batch, num_matrices);
 
  351     Kokkos::View<int*> d_neighbor_list_sizes(
"neighbor list sizes on device", num_matrices);
 
  352     Kokkos::deep_copy(d_neighbor_list_sizes, h_neighbor_list_sizes);
 
  354     int scratch_space_level;
 
  355     int scratch_space_size = 0;
 
  356     scratch_space_size += scratch_vector_type::shmem_size( min_mn );  
 
  359         scratch_space_level = 0;
 
  362         scratch_space_size += scratch_matrix_left_type::shmem_size(N, M);
 
  363         scratch_space_size += scratch_matrix_left_type::shmem_size(N, NRHS);
 
  364         scratch_space_level = 1;
 
  366     Kokkos::Profiling::popRegion();
 
  368     Kokkos::parallel_for(
 
  370         .set_scratch_size(scratch_space_level, Kokkos::PerTeam(scratch_space_size)), 
 
  373         const int target_index = teamMember.league_rank();
 
  376         const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; 
 
  377         int my_num_rows = d_neighbor_list_sizes(target_index)*multiplier;
 
  395         double abs_threshold = eps*S0;        
 
  398         Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, min_mn), [=] (
const int i) {
 
  399             s(i) = ( std::abs(S_(i)) > abs_threshold ) ? 1./S_(i) : 0;
 
  401         teamMember.team_barrier();
 
  407             for (
int i=0; i<my_num_rows; ++i) {
 
  408                 double sqrt_w_i_m = RHS_(i,i);
 
  409                 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,N),
 
  413                     for (
int k=0; k<min_mn; ++k) {
 
  414                         bdataj += V_(j,k)*s(k)*U_(i,k);
 
  417                     bdataj *= sqrt_w_i_m;
 
  420                 teamMember.team_barrier();
 
  427             Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
 
  429                 for (
int j=0; j<M; j++) {
 
  431                     for (
int k=0; k<min_mn; k++) {
 
  432                         temp += V_(i,k)*s(k)*U_(j,k);
 
  434                     temp_VSU_left_matrix(i, j) = temp;
 
  437             teamMember.team_barrier();
 
  440             Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
 
  442                 for (
int j=0; j<NRHS; j++) {
 
  444                     for (
int k=0; k<M; k++) {
 
  445                         temp += temp_VSU_left_matrix(i, k)*RHS_(k, j);
 
  447                     temp_x_left_matrix(i, j) = temp;
 
  450             teamMember.team_barrier();
 
  452             Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, N),
 
  454                 for (
int j=0; j<NRHS; j++) {
 
  455                     RHS_(i, j) = temp_x_left_matrix(i, j);
 
  459     }, 
"SVD: USV*RHS Multiply");
 
  465 #elif defined(COMPADRE_USE_LAPACK) 
  469     Kokkos::Profiling::pushRegion(
"SVD::Setup(Create)");
 
  470     double rcond = 1e-14; 
 
  472     const int smlsiz = 25;
 
  473     const int nlvl = std::max(0, 
int( std::log2( std::min(M,N) / (smlsiz+1) ) + 1));
 
  474     const int liwork = 3*std::min(M,N)*nlvl + 11*std::min(M,N);
 
  476     std::vector<int> iwork(liwork);
 
  477     std::vector<double> s(std::max(M,N));
 
  484     if (num_matrices > 0) {
 
  485         dgelsd_( &M, &N, &NRHS, 
 
  488                  s.data(), &rcond, &rank,
 
  489                  &wkopt, &lwork, iwork.data(), &info);
 
  493     Kokkos::Profiling::popRegion();
 
  495     #ifdef LAPACK_DECLARED_THREADSAFE 
  497         int scratch_space_size = 0;
 
  498         scratch_space_size += scratch_vector_type::shmem_size( lwork );  
 
  499         scratch_space_size += scratch_vector_type::shmem_size( std::max(M,N) );  
 
  500         scratch_space_size += scratch_local_index_type::shmem_size( liwork ); 
 
  502         Kokkos::parallel_for(
 
  504             .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
 
  514                 const int i = teamMember.league_rank();
 
  520                 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; 
 
  521                 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
 
  522                 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
 
  524                 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
 
  525                 dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
 
  526                          p_offset, const_cast<int*>(&lda),
 
  527                          rhs_offset, const_cast<int*>(&ldb),
 
  528                          scratch_s.data(), 
const_cast<double*
>(&rcond), &i_rank,
 
  529                          scratch_work.data(), 
const_cast<int*
>(&lwork), scratch_iwork.data(), &i_info);
 
  538         std::vector<double> scratch_work(lwork);
 
  539         std::vector<double> scratch_s(std::max(M,N));
 
  540         std::vector<int>    scratch_iwork(liwork);
 
  542         for (
int i=0; i<num_matrices; ++i) {
 
  551                 const int multiplier = (max_neighbors > 0) ? M/max_neighbors : 1; 
 
  552                 int my_num_rows = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : M;
 
  553                 int my_num_rhs = (neighbor_list_sizes) ? (*(neighbor_list_sizes + i + initial_index_of_batch))*multiplier : NRHS;
 
  555                 dgelsd_( const_cast<int*>(&my_num_rows), const_cast<int*>(&N), const_cast<int*>(&my_num_rhs),
 
  556                          p_offset, const_cast<int*>(&lda),
 
  557                          rhs_offset, const_cast<int*>(&ldb),
 
  558                          scratch_s.data(), 
const_cast<double*
>(&rcond), &i_rank,
 
  559                          scratch_work.data(), 
const_cast<int*
>(&lwork), scratch_iwork.data(), &i_info);
 
  565     #endif // LAPACK is not threadsafe 
  570     if (swap_layout_RHS) {
 
  572         int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
 
  581 void batchLUFactorize(
ParallelManager pm, 
double *P, 
int lda, 
int nda, 
double *RHS, 
int ldb, 
int ndb, 
int M, 
int N, 
int NRHS, 
const int num_matrices, 
const size_t max_neighbors, 
const int initial_index_of_batch, 
int * neighbor_list_sizes) {
 
  589 #ifdef COMPADRE_USE_CUDA 
  591     Kokkos::Profiling::pushRegion(
"LU::Setup(Pointers)");
 
  592     Kokkos::View<size_t*> array_P_RHS(
"P and RHS matrix pointers on device", 2*num_matrices);
 
  593     Kokkos::View<int*> ipiv_device(
"ipiv space on device", num_matrices*M);
 
  596     Kokkos::parallel_for(Kokkos::RangePolicy<device_execution_space>(0,num_matrices), KOKKOS_LAMBDA(
const int i) {
 
  601     Kokkos::View<int*> devInfo(
"devInfo", num_matrices);
 
  602     cudaDeviceSynchronize();
 
  603     Kokkos::Profiling::popRegion();
 
  605     Kokkos::Profiling::pushRegion(
"LU::Setup(Handle)");
 
  607     cublasHandle_t cublas_handle;
 
  608     cublasStatus_t cublas_stat;
 
  609     cudaDeviceSynchronize();
 
  610     Kokkos::Profiling::popRegion();
 
  612     Kokkos::Profiling::pushRegion(
"LU::Setup(Create)");
 
  613     cublasCreate(&cublas_handle);
 
  614     cudaDeviceSynchronize();
 
  615     Kokkos::Profiling::popRegion();
 
  619     Kokkos::Profiling::pushRegion(
"LU::Execution");
 
  622     cublas_stat=cublasDgetrfBatched(cublas_handle, 
 
  624                                    reinterpret_cast<double**>(array_P_RHS.data()), lda,
 
  625                                    reinterpret_cast<int*>(ipiv_device.data()),
 
  626                                    devInfo.data(), num_matrices );
 
  630     cublas_stat=cublasDgetrsBatched(cublas_handle, CUBLAS_OP_N,
 
  632                                    reinterpret_cast<const double**>(array_P_RHS.data()), lda,
 
  633                                    reinterpret_cast<int*>(ipiv_device.data()),
 
  634                                    reinterpret_cast<double**>(array_P_RHS.data() + 
TO_GLOBAL(num_matrices)), ldb,
 
  635                                    &info, num_matrices );
 
  638     cudaDeviceSynchronize();
 
  639     Kokkos::Profiling::popRegion();
 
  642 #elif defined(COMPADRE_USE_LAPACK) 
  683     Kokkos::Profiling::pushRegion(
"LU::Setup(Create)");
 
  684     Kokkos::Profiling::popRegion();
 
  686     std::string transpose_or_no = 
"N";
 
  688     #ifdef LAPACK_DECLARED_THREADSAFE 
  689         int scratch_space_size = 0;
 
  690         scratch_space_size += scratch_local_index_type::shmem_size( std::min(M, N) );  
 
  692         Kokkos::parallel_for(
 
  694             .set_scratch_size(0, Kokkos::PerTeam(scratch_space_size)),
 
  700                 const int i = teamMember.league_rank();
 
  704                 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
 
  705                 dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
 
  706                          p_offset, const_cast<int*>(&lda),
 
  710                 teamMember.team_barrier();
 
  714                 Kokkos::single(Kokkos::PerTeam(teamMember), [&] () {
 
  715                 dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
 
  716                          const_cast<int*>(&N), 
const_cast<int*
>(&NRHS),
 
  717                          p_offset, const_cast<int*>(&lda),
 
  719                          rhs_offset, 
const_cast<int*
>(&ldb),
 
  729         Kokkos::View<int*> scratch_ipiv(
"scratch_ipiv", std::min(M, N));
 
  731         for (
int i=0; i<num_matrices; ++i) {
 
  738                 dgetrf_( const_cast<int*>(&M), const_cast<int*>(&N),
 
  739                          p_offset, const_cast<int*>(&lda),
 
  743                 dgetrs_( const_cast<char *>(transpose_or_no.c_str()),
 
  744                          const_cast<int*>(&N), 
const_cast<int*
>(&NRHS),
 
  745                          p_offset, const_cast<int*>(&lda),
 
  747                          rhs_offset, 
const_cast<int*
>(&ldb),
 
  754     #endif // LAPACK is not threadsafe 
  760     int scratch_size = scratch_matrix_left_type::shmem_size(ldb, ndb);
 
team_policy::member_type member_type
 
void batchLUFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of LU problems. 
 
Kokkos::TeamPolicy< device_execution_space > team_policy
 
#define TO_GLOBAL(variable)
 
Kokkos::View< double *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_vector_type
 
Kokkos::View< double **, layout_left, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_matrix_left_type
 
void CallFunctorWithTeamThreads(const global_index_type batch_size, C functor) const 
Calls a parallel_for parallel_for will break out over loops over teams with each thread executing cod...
 
void setTeamScratchSize(const int level, const int value)
 
void batchSVDFactorize(ParallelManager pm, bool swap_layout_P, double *P, int lda, int nda, bool swap_layout_RHS, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of SVD problems. 
 
void batchQRFactorize(ParallelManager pm, double *P, int lda, int nda, double *RHS, int ldb, int ndb, int M, int N, int NRHS, const int num_matrices, const size_t max_neighbors, const int initial_index_of_batch, int *neighbor_list_sizes)
Calls LAPACK or CUBLAS to solve a batch of QR problems. 
 
#define compadre_assert_release(condition)
compadre_assert_release is used for assertions that should always be checked, but generally are not e...
 
Kokkos::View< int *, Kokkos::MemoryTraits< Kokkos::Unmanaged > > scratch_local_index_type