Merge pull request #653 from streeve/fft_benchmark
Update Cajita benchmarks
streeve authored Aug 18, 2023
2 parents 092aa5d + 5a15d4c commit fe7d01a
Showing 3 changed files with 220 additions and 161 deletions.
benchmark/cajita/Cajita_FastFourierTransformPerformance.cpp: 49 additions & 31 deletions
@@ -31,8 +31,9 @@ using namespace Cajita;
 // Performance test.
 template <class Device>
 void performanceTest( std::ostream& stream,
-                      std::vector<double> grid_sizes_per_dim, MPI_Comm comm,
-                      const std::string& test_prefix )
+                      const Cajita::DimBlockPartitioner<3> partitioner,
+                      std::vector<double> grid_sizes_per_dim_per_rank,
+                      MPI_Comm comm, const std::string& test_prefix )
 {
     using exec_space = typename Device::execution_space;
     using memory_space = typename Device::memory_space;
@@ -42,35 +43,37 @@ void performanceTest( std::ostream& stream,
     std::array<double, 3> global_high_corner = { 1.0, 1.0, 1.0 };
     std::array<bool, 3> is_dim_periodic = { true, true, true };

-    int num_grid_size_per_dim = grid_sizes_per_dim.size();
+    int num_grid_size = grid_sizes_per_dim_per_rank.size();

     // number of runs in test loops
     int num_runs = 10;

-    // get MPI rank
-    int comm_rank;
-    MPI_Comm_rank( comm, &comm_rank );
-
     // create timers
     Cabana::Benchmark::Timer setup_timer( test_prefix + "setup",
-                                          num_grid_size_per_dim );
+                                          num_grid_size );

     Cabana::Benchmark::Timer transforms_timer( test_prefix + "transforms",
-                                               num_grid_size_per_dim );
+                                               num_grid_size );
     // loop over the grid sizes
-    for ( int p = 0; p < num_grid_size_per_dim; ++p )
+    for ( int p = 0; p < num_grid_size; ++p )
     {
-        double cell_size = 1.0 / grid_sizes_per_dim[p];
+        auto ranks_per_dim = partitioner.ranksPerDimension( comm, { 0, 0, 0 } );
+
+        std::array<int, 3> num_cell;
+        for ( int d = 0; d < 3; ++d )
+        {
+            num_cell[d] = grid_sizes_per_dim_per_rank[p] * ranks_per_dim[d];
+        }
         auto global_mesh = createUniformGlobalMesh(
-            global_low_corner, global_high_corner, cell_size );
+            global_low_corner, global_high_corner, num_cell );

         // Create the global grid
-        DimBlockPartitioner<3> partitioner;
         auto global_grid =
             createGlobalGrid( comm, global_mesh, is_dim_periodic, partitioner );

         // Create a local grid
-        auto local_grid = createLocalGrid( global_grid, 0 );
+        int halo_width = 0;
+        auto local_grid = createLocalGrid( global_grid, halo_width );
         auto owned_space = local_grid->indexSpace( Own(), Cell(), Local() );
         auto ghosted_space = local_grid->indexSpace( Ghost(), Cell(), Local() );

@@ -106,21 +109,19 @@ void performanceTest( std::ostream& stream,

         setup_timer.stop( p );

-        transforms_timer.start( p );
-
         // Loop over number of runs
         for ( int t = 0; t < num_runs; ++t )
         {
+            transforms_timer.start( p );
             fft->forward( *lhs, Experimental::FFTScaleFull() );
             fft->reverse( *lhs, Experimental::FFTScaleNone() );
+            transforms_timer.stop( p );
         }
-
-        transforms_timer.stop( p );
     }

-    outputResults( stream, "grid_size_per_dim", grid_sizes_per_dim, setup_timer,
-                   comm );
-    outputResults( stream, "grid_size_per_dim", grid_sizes_per_dim,
+    outputResults( stream, "grid_size_per_dim", grid_sizes_per_dim_per_rank,
+                   setup_timer, comm );
+    outputResults( stream, "grid_size_per_dim", grid_sizes_per_dim_per_rank,
                    transforms_timer, comm );

     stream << std::flush;
@@ -152,10 +153,10 @@ int main( int argc, char* argv[] )

     // Declare the grid size per dimension
     // currently, testing 3dims+symmetric
-    std::vector<double> grid_sizes_per_dim = { 16, 32 };
+    std::vector<double> grid_sizes_per_dim_per_rank = { 16, 32 };
     if ( run_type == "large" )
     {
-        grid_sizes_per_dim = { 16, 32, 64, 128 };
+        grid_sizes_per_dim_per_rank = { 16, 32, 64, 128 };
     }

     // Get the name of the output file.
@@ -164,23 +165,38 @@ int main( int argc, char* argv[] )
     // Barrier before continuing
     MPI_Barrier( MPI_COMM_WORLD );

-    // Get comm rank;
+    // Get comm rank and size;
     int comm_rank;
     MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank );
+    int comm_size;
+    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );

+    // Get partitioner
+    Cajita::DimBlockPartitioner<3> partitioner;
+    // Get ranks per dimension
+    std::array<int, 3> ranks_per_dimension =
+        partitioner.ranksPerDimension( MPI_COMM_WORLD, { 0, 0, 0 } );
+
     // Open the output file on rank 0.
     std::fstream file;
-    if ( 0 == comm_rank )
-        file.open( filename, std::fstream::out );

-    // Output file header
+    // Output problem details.
     if ( 0 == comm_rank )
     {
+        file.open( filename + "_" + std::to_string( comm_size ),
+                   std::fstream::out );
         file << "\n";
         file << "Cajita FFT Performance Benchmark"
              << "\n";
         file << "----------------------------------------------"
              << "\n";
+        file << "MPI Ranks: " << comm_size << "\n";
+        file << "MPI Cartesian Dim Ranks: (" << ranks_per_dimension[0] << ", "
+             << ranks_per_dimension[1] << ", " << ranks_per_dimension[2]
+             << ")\n";
+        file << "----------------------------------------------"
+             << "\n";
         file << "\n";
+        file << std::flush;
     }

@@ -194,11 +210,13 @@ int main( int argc, char* argv[] )
     // Don't run twice on the CPU if only host enabled.
     if ( !std::is_same<device_type, host_device_type>{} )
     {
-        performanceTest<device_type>( file, grid_sizes_per_dim, MPI_COMM_WORLD,
-                                      "device_default_" );
+        performanceTest<device_type>( file, partitioner,
+                                      grid_sizes_per_dim_per_rank,
+                                      MPI_COMM_WORLD, "device_default_" );
     }
-    performanceTest<host_device_type>( file, grid_sizes_per_dim, MPI_COMM_WORLD,
-                                       "host_default_" );
+    performanceTest<host_device_type>( file, partitioner,
+                                       grid_sizes_per_dim_per_rank,
+                                       MPI_COMM_WORLD, "host_default_" );

     // Close the output file on rank 0.
     file.close();
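The benchmark now sizes the mesh per rank: each entry of grid_sizes_per_dim_per_rank is multiplied by the partitioner's rank decomposition, so the global problem grows with the number of ranks (weak scaling). Below is a minimal standalone sketch of that arithmetic; it is not part of this commit. The Cajita.hpp header name, the main()/MPI boilerplate, and the 32-cells-per-rank value are assumptions for illustration, while DimBlockPartitioner, ranksPerDimension, and the num_cell computation follow the diff above.

#include <Cajita.hpp>

#include <mpi.h>

#include <array>
#include <iostream>

int main( int argc, char* argv[] )
{
    MPI_Init( &argc, &argv );
    {
        // Same partitioner type the benchmark now receives as an argument.
        Cajita::DimBlockPartitioner<3> partitioner;

        // Let MPI pick the Cartesian rank decomposition (0 = no constraint).
        std::array<int, 3> ranks_per_dim =
            partitioner.ranksPerDimension( MPI_COMM_WORLD, { 0, 0, 0 } );

        // Hypothetical per-rank grid size; the benchmark loops over 16 and 32
        // (plus 64 and 128 for the "large" run type).
        int grid_size_per_dim_per_rank = 32;

        // Global cell count grows with the ranks in each dimension, so the
        // per-rank work stays roughly constant as more ranks are added.
        std::array<int, 3> num_cell;
        for ( int d = 0; d < 3; ++d )
            num_cell[d] = grid_size_per_dim_per_rank * ranks_per_dim[d];

        int comm_rank;
        MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank );
        if ( 0 == comm_rank )
            std::cout << "Global cells: " << num_cell[0] << " x " << num_cell[1]
                      << " x " << num_cell[2] << "\n";
    }
    MPI_Finalize();
    return 0;
}

On 4 ranks, MPI typically factors the decomposition as 2 x 2 x 1, so the 32-cells-per-rank case above becomes a 64 x 64 x 32 global grid.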
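The timed kernel is a forward transform with Experimental::FFTScaleFull followed by a reverse transform with Experimental::FFTScaleNone. Assuming the usual convention that full scaling applies a 1/N factor on the forward transform while no scaling leaves the reverse transform unnormalized, the pair is a round trip, so each timed repetition presumably leaves lhs unchanged up to round-off. A plain 1-D DFT sketch of that identity, with no Cajita, heFFTe, or MPI dependencies:

#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const double pi = 3.14159265358979323846;
    std::vector<std::complex<double>> x = { 1.0, 2.0, 3.0, 4.0 };
    const std::size_t n = x.size();

    // Forward DFT with "full" (1/N) scaling.
    std::vector<std::complex<double>> X( n );
    for ( std::size_t k = 0; k < n; ++k )
        for ( std::size_t j = 0; j < n; ++j )
            X[k] += x[j] * std::polar( 1.0, -2.0 * pi * k * j / n ) /
                    static_cast<double>( n );

    // Reverse DFT with no scaling.
    std::vector<std::complex<double>> y( n );
    for ( std::size_t j = 0; j < n; ++j )
        for ( std::size_t k = 0; k < n; ++k )
            y[j] += X[k] * std::polar( 1.0, 2.0 * pi * k * j / n );

    // Prints values close to the original input 1 2 3 4.
    for ( std::size_t j = 0; j < n; ++j )
        std::cout << y[j].real() << " ";
    std::cout << "\n";
    return 0;
}

Any C++11 compiler will do; the output is 1 2 3 4 up to floating-point round-off, which is the same reason the benchmark can apply the forward/reverse pair repeatedly to the same array.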
