48 #include "Kokkos_Core.hpp"
53 #ifdef KOKKOS_ENABLE_CUDA
54 #include "cuda_runtime_api.h"
58 #include <sys/types.h>
// Entry point of the performance-test driver.
// NOTE(review): this listing is partial -- braces, several declarations
// (e.g. CLP, nGrid, print, check, threads, openmp, cuda, device_id, vtune)
// and some statements are not visible here, so the comments below describe
// only the lines that are shown.
61 int main(
int argc,
char *argv[])
// Query hwloc for the available NUMA domains, cores per NUMA domain and
// hardware threads per core; these seed the default core / hyper-thread
// counts further down.
67 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
68 const size_t num_cores_per_socket =
69 Kokkos::hwloc::get_available_cores_per_numa();
70 const size_t num_threads_per_core =
71 Kokkos::hwloc::get_available_threads_per_core();
// Tail of a description string; the call that receives it is not visible in
// this excerpt (presumably the command-line processor's doc string -- confirm).
// NOTE(review): the wording "This test performance" looks like a typo.
76 "This test performance of MP::Vector FEM assembly.\n");
// Mesh-size options: a single size "n", or (per the help text, when n is
// zero) a range described by n-begin / n-end / n-step.
78 CLP.
setOption(
"n", &nGrid,
"Number of mesh points in each direction. Set to zero to use a range");
80 CLP.
setOption(
"n-begin", &nGridBegin,
"Beginning number of mesh points in each direction.");
82 CLP.
setOption(
"n-end", &nGridEnd,
"Ending number of mesh points in each direction.");
84 CLP.
setOption(
"n-step", &nGridStep,
"Increment in number of mesh points in each direction.");
// Number of assembly iterations to run.
86 CLP.
setOption(
"ni", &nIter,
"Number of assembly iterations");
// Boolean options are registered as true/false flag pairs
// (e.g. --print / --no-print).
88 CLP.
setOption(
"print",
"no-print", &print,
"Print debugging output");
90 CLP.
setOption(
"check",
"no-check", &check,
"Check correctness");
// Basis-function order: defaults to linear (quadratic == false).
91 bool quadratic =
false;
92 CLP.
setOption(
"quadratic",
"linear", &quadratic,
"Use quadratic basis functions");
// Default core count = cores per NUMA domain * number of NUMA domains.
93 int num_cores = num_cores_per_socket * num_sockets;
// Help-text tail of the option that overrides num_cores; the head of that
// call is not visible in this excerpt.
95 "Number of CPU cores to use (defaults to all)");
// Default hyper-thread count = all hardware threads per core.
96 int num_hyper_threads = num_threads_per_core;
97 CLP.
setOption(
"hyperthreads", &num_hyper_threads,
98 "Number of hyper threads per core to use (defaults to all)");
// Per-backend enable flags, registered only when the corresponding Kokkos
// backend macro is defined. The matching #endif lines are not visible here.
99 #ifdef KOKKOS_ENABLE_THREADS
101 CLP.
setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
103 #ifdef KOKKOS_ENABLE_OPENMP
105 CLP.
setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
107 #ifdef KOKKOS_ENABLE_CUDA
109 CLP.
setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
// CUDA device ordinal; used below for cudaGetDeviceProperties and printing.
111 CLP.
setOption(
"device", &device_id,
"CUDA device ID.");
// Flag for connecting a VTune collector to this process.
114 CLP.
setOption(
"vtune",
"no-vtune", &vtune,
"connect to vtune");
// Parse the command line into the variables registered above.
115 CLP.
parse( argc, argv );
// Build a shell command that attaches a VTune hotspots collection to this
// process by PID and launch it in the background (trailing " &") via
// system(). The condition guarding this section, the value of vtune_loc and
// the start of the stream expression are not visible in this excerpt.
// NOTE(review): the return value of system() is not checked on the line shown.
124 std::stringstream cmd;
125 pid_t my_os_pid=getpid();
126 const std::string vtune_loc =
128 const std::string output_dir =
"./vtune/vtune.0";
130 <<
" -collect hotspots -result-dir " << output_dir
131 <<
" -target-pid " << my_os_pid <<
" &";
// Echo the command before executing it.
132 std::cout << cmd.str() << std::endl;
133 system(cmd.str().c_str());
// Initialize Kokkos with the (already parsed) command-line arguments.
137 Kokkos::initialize(argc,argv);
// Threads backend: print the total thread count (cores * hyper-threads) and
// run the templated performance driver on Kokkos::Threads.
138 #ifdef KOKKOS_ENABLE_THREADS
140 typedef Kokkos::Threads Device;
142 std::cout << std::endl
143 <<
"Threads performance with " << num_cores*num_hyper_threads
144 <<
" threads:" << std::endl;
146 performance_test_driver<Device>(
147 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic, check);
// OpenMP backend: same banner and driver call, on Kokkos::OpenMP.
151 #ifdef KOKKOS_ENABLE_OPENMP
153 typedef Kokkos::OpenMP Device;
155 std::cout << std::endl
156 <<
"OpenMP performance with " << num_cores*num_hyper_threads
157 <<
" threads:" << std::endl;
159 performance_test_driver<Device>(
160 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic, check);
// CUDA backend: look up the properties of device_id so the device name can
// be printed, then run the driver on Kokkos::Cuda.
// NOTE(review): the banner text repeats the word "performance"; the result
// of cudaGetDeviceProperties is not checked on the line shown.
164 #ifdef KOKKOS_ENABLE_CUDA
166 typedef Kokkos::Cuda Device;
168 cudaDeviceProp deviceProp;
169 cudaGetDeviceProperties(&deviceProp, device_id);
170 std::cout << std::endl
171 <<
"CUDA performance performance with device " << device_id
173 << deviceProp.name <<
"):"
176 performance_test_driver<Device>(
177 print, nIter, nGridBegin, nGridEnd, nGridStep, quadratic, check);
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
void setDocString(const char doc_string[])