diff --git a/configure.ac b/configure.ac index 9640ed009..86207c3a3 100644 --- a/configure.ac +++ b/configure.ac @@ -272,6 +272,7 @@ AC_CONFIG_FILES([bin/sstcc], [chmod +x bin/sstcc]) AC_CONFIG_FILES([bin/sstccvars.py]) AC_CONFIG_FILES([tests/runtest], [chmod +x tests/runtest]) AC_CONFIG_FILES([tests/checktest], [chmod +x tests/checktest]) +AC_CONFIG_FILES([tests/checkdiff], [chmod +x tests/checkdiff]) AC_OUTPUT diff --git a/sstmac/skeletons/Makefile.am b/sstmac/skeletons/Makefile.am index ef62ab1a0..3eb3ed946 100644 --- a/sstmac/skeletons/Makefile.am +++ b/sstmac/skeletons/Makefile.am @@ -23,6 +23,9 @@ nobase_library_include_HEADERS = \ libsstmac_skeletons_la_LDFLAGS = libsstmac_skeletons_la_SOURCES = \ + fft/fft.cc \ + halo3d-26/halo3d-26.cc \ + sweep3d/sweep3d.cc \ traffic_matrix/main.cc \ undumpi/parsedumpi.cc \ undumpi/parsedumpi_callbacks.cc diff --git a/sstmac/skeletons/fft/fft.cc b/sstmac/skeletons/fft/fft.cc new file mode 100644 index 000000000..69dbe9346 --- /dev/null +++ b/sstmac/skeletons/fft/fft.cc @@ -0,0 +1,178 @@ +/** +Copyright 2009-2020 National Technology and Engineering Solutions of Sandia, +LLC (NTESS). Under the terms of Contract DE-NA-0003525, the U.S. Government +retains certain rights in this software. + +Sandia National Laboratories is a multimission laboratory managed and operated +by National Technology and Engineering Solutions of Sandia, LLC., a wholly +owned subsidiary of Honeywell International, Inc., for the U.S. Department of +Energy's National Nuclear Security Administration under contract DE-NA0003525. + +Copyright (c) 2009-2020, NTESS + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Questions? Contact sst-macro-help@sandia.gov +*/ +#include +#include +#include +#include + +#define MP_X 0 +#define MP_Y 1 +#define MP_Z 2 + +#define calc_pe(a,b,c) ((a)+(b)*dims[MP_X]+(c)*dims[MP_X]*dims[MP_Y]) + +#define sstmac_app_name fft +int USER_MAIN(int argc, char **argv) +{ + int world_rank, numranks; + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&world_rank); + MPI_Comm_size(MPI_COMM_WORLD,&numranks); + + int myrank = world_rank; + MPI_Comm comm = MPI_COMM_WORLD; + + int dims[3] = {0, 0, 0}; + + int msg_size_x = 0; + int msg_size_y = 0; + int msg_size_z = 0; + int MAX_ITER = 10; + int print = 0; + + for (int i = 0; i < argc; ++i) { + if (strcmp("-pex", argv[i]) == 0) { + dims[MP_X] = atoi(argv[i + 1]); + i++; + } else if (strcmp("-pey", argv[i]) == 0) { + dims[MP_Y] = atoi(argv[i + 1]); + i++; + } else if (strcmp("-pez", argv[i]) == 0) { + dims[MP_Z] = atoi(argv[i + 1]); + i++; + } else if (strcmp("-iterations", argv[i]) == 0) { + MAX_ITER = atoi(argv[i + 1]); + i++; + } else if (strcmp("-nx", argv[i]) == 0) { + msg_size_x = atoi(argv[i + 1]); + i++; + } else if (strcmp("-ny", argv[i]) == 0) { + msg_size_y = atoi(argv[i + 1]); + i++; + } else if (strcmp("-nz", argv[i]) == 0) { + msg_size_z = atoi(argv[i + 1]); + i++; + } else if (strcmp(argv[i], "-print") == 0){ + print = atol(argv[i+1]); + ++i; + } + } + + if(dims[MP_X] * dims[MP_Y] * dims[MP_Z] != numranks) { + fprintf(stderr, "\n nx * ny * nz does not equal number of ranks\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + //figure out my coordinates + int myXcoord = myrank % dims[MP_X]; + int myYcoord = (myrank % (dims[MP_X] * dims[MP_Y])) / dims[MP_X]; + int myZcoord = (myrank % (dims[MP_X] * dims[MP_Y] * dims[MP_Z])) / (dims[MP_X] * dims[MP_Y]); + + bool skip[3]; + + //which a2as to skip + skip[MP_X] = msg_size_x == 0; + skip[MP_Y] = msg_size_y == 0; + skip[MP_Z] = msg_size_z == 0; + + //all a2a share the buffer + int largestMsg = (msg_size_x * dims[MP_X] > msg_size_y * dims[MP_Y]) ? msg_size_x * dims[MP_X] : msg_size_y * dims[MP_Y]; + largestMsg = (largestMsg > msg_size_z * dims[MP_Z]) ? largestMsg : msg_size_z * dims[MP_Z]; + + char *sendbuf = nullptr; + char *recvbuf = nullptr; + + //create subcommunicators + MPI_Comm X_comm, Y_comm, Z_comm; + if(!skip[MP_X]) { + MPI_Comm_split(comm, myZcoord * dims[MP_Y] + myYcoord, myXcoord, &X_comm); + } + if(!skip[MP_Y]) { + MPI_Comm_split(comm, myZcoord * dims[MP_X] + myXcoord, myYcoord, &Y_comm); + } + if(!skip[MP_Z]) { + MPI_Comm_split(comm, myYcoord * dims[MP_X] + myXcoord, myZcoord, &Z_comm); + } + + double startTime, stopTime; + MPI_Barrier(MPI_COMM_WORLD); + + startTime = MPI_Wtime(); + for (int i = 0; i < MAX_ITER; i++) { + double start = MPI_Wtime(); + if(!skip[MP_X]) { + MPI_Alltoall(sendbuf, msg_size_x, MPI_CHAR, recvbuf, msg_size_x, MPI_CHAR, X_comm); + } + + if(!skip[MP_Y]) { + MPI_Alltoall(sendbuf, msg_size_y, MPI_CHAR, recvbuf, msg_size_y, MPI_CHAR, Y_comm); + } + + if(!skip[MP_Z]) { + MPI_Alltoall(sendbuf, msg_size_z, MPI_CHAR, recvbuf, msg_size_z, MPI_CHAR, Z_comm); + } + + double stop = MPI_Wtime(); + if (print){ + printf("Rank %d = [%d,%d,%d] iteration %d: %12.8fs\n", + myrank, myXcoord, myYcoord, myZcoord, i, (stop-start)); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + stopTime = MPI_Wtime(); + + + //finalized summary output + if(myrank == 0 && MAX_ITER != 0 && print) { + printf("Finished %d iterations\n",MAX_ITER); + printf("Time elapsed per iteration for grid size (%d,%d,%d) with message sizes (%d,%d,%d) : %f s\n", + dims[MP_X], dims[MP_Y], dims[MP_Z], msg_size_x, msg_size_y, msg_size_z, (stopTime - startTime)/MAX_ITER); + } + + MPI_Finalize(); + return 0; +} + + diff --git a/sstmac/skeletons/fft/parameters.ini b/sstmac/skeletons/fft/parameters.ini new file mode 100644 index 000000000..2c111725d --- /dev/null +++ b/sstmac/skeletons/fft/parameters.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = fft + launch_cmd = aprun -n 64 -N 1 + argv = -pex 4 -pey 4 -pez 4 -nx 3200 -ny 3200 -nz 3200 -iterations 3 -print 1 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +} diff --git a/sstmac/skeletons/halo3d-26/halo3d-26.cc b/sstmac/skeletons/halo3d-26/halo3d-26.cc new file mode 100644 index 000000000..fb9f7b8a7 --- /dev/null +++ b/sstmac/skeletons/halo3d-26/halo3d-26.cc @@ -0,0 +1,440 @@ +/** +Copyright 2009-2020 National Technology and Engineering Solutions of Sandia, +LLC (NTESS). Under the terms of Contract DE-NA-0003525, the U.S. Government +retains certain rights in this software. + +Sandia National Laboratories is a multimission laboratory managed and operated +by National Technology and Engineering Solutions of Sandia, LLC., a wholly +owned subsidiary of Honeywell International, Inc., for the U.S. Department of +Energy's National Nuclear Security Administration under contract DE-NA0003525. + +Copyright (c) 2009-2020, NTESS + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Questions? Contact sst-macro-help@sandia.gov +*/ +#include +#include +#include +#include +#include +#include + +void get_position(int rank, int pex, int pey, int pez, + int* myX, int* myY, int* myZ) +{ + const int plane = rank % (pex * pey); + *myY = plane / pex; + *myX = (plane % pex) != 0 ? (plane % pex) : 0; + *myZ = rank / (pex * pey); +} + +int convert_position_to_rank(int pX, int pY, int pZ, + int myX, int myY, int myZ) +{ + myX = (myX + pX) % pX; + myY = (myY + pY) % pY; + myZ = (myZ + pZ) % pZ; + return (myZ * (pX * pY)) + (myY * pX) + myX; +} + +#define sstmac_app_name halo3d-26 +int USER_MAIN(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + + int world_me = -1; + int world_size = -1; + + + MPI_Comm_rank(MPI_COMM_WORLD, &world_me); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + int size = world_size; + int me = world_me; + + MPI_Comm halo_comm = MPI_COMM_WORLD; + + int pex = size; + int pey = 1; + int pez = 1; + + int nx = 10; + int ny = 10; + int nz = 10; + + int repeats = 100; + int vars = 1; + + long sleep = 1000; + + int print = 0; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "-nx") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -nx without a value.\n"); + } + + exit(-1); + } + + nx = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-ny") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -ny without a value.\n"); + } + + exit(-1); + } + + ny = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-nz") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -nz without a value.\n"); + } + + exit(-1); + } + + nz = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-pex") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -pex without a value.\n"); + } + + exit(-1); + } + + pex = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-pey") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -pey without a value.\n"); + } + + exit(-1); + } + + pey = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-pez") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -pez without a value.\n"); + } + + exit(-1); + } + + pez = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-iterations") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -iterations without a value.\n"); + } + + exit(-1); + } + + repeats = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-vars") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -vars without a value.\n"); + } + + exit(-1); + } + + vars = atoi(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-sleep") == 0) { + if (i == argc) { + if (me == 0) { + fprintf(stderr, "Error: specified -sleep without a value.\n"); + } + + exit(-1); + } + + sleep = atol(argv[i + 1]); + ++i; + } else if (strcmp(argv[i], "-print") == 0){ + print = atoi(argv[i + 1]); + ++i; + } else { + if (0 == me) { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + } + + exit(-1); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if ((pex * pey * pez) != size) { + fprintf(stderr, "Error: rank grid does not equal number of ranks.\n"); + fprintf(stderr, "%7d x %7d x %7d != %7d\n", pex, pey, pez, size); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (me == 0 && print) { + printf("# MPI Nearest Neighbor Communication\n"); + printf("# Info:\n"); + printf("# Processor Grid: %7d x %7d x %7d\n", pex, pey, pez); + printf("# Data Grid (per rank): %7d x %7d x %7d\n", nx, ny, nz); + printf("# Iterations: %7d\n", repeats); + printf("# Variables: %7d\n", vars); + printf("# Sleep: %7ld\n", sleep); + } + + int posX, posY, posZ; + get_position(me, pex, pey, pez, &posX, &posY, &posZ); + + const int xFaceUp = + convert_position_to_rank(pex, pey, pez, posX + 1, posY, posZ); + const int xFaceDown = + convert_position_to_rank(pex, pey, pez, posX - 1, posY, posZ); + const int yFaceUp = + convert_position_to_rank(pex, pey, pez, posX, posY + 1, posZ); + const int yFaceDown = + convert_position_to_rank(pex, pey, pez, posX, posY - 1, posZ); + const int zFaceUp = + convert_position_to_rank(pex, pey, pez, posX, posY, posZ + 1); + const int zFaceDown = + convert_position_to_rank(pex, pey, pez, posX, posY, posZ - 1); + + const int vertexA = + convert_position_to_rank(pex, pey, pez, posX - 1, posY - 1, posZ - 1); + const int vertexB = + convert_position_to_rank(pex, pey, pez, posX - 1, posY - 1, posZ + 1); + const int vertexC = + convert_position_to_rank(pex, pey, pez, posX - 1, posY + 1, posZ - 1); + const int vertexD = + convert_position_to_rank(pex, pey, pez, posX - 1, posY + 1, posZ + 1); + const int vertexE = + convert_position_to_rank(pex, pey, pez, posX + 1, posY - 1, posZ - 1); + const int vertexF = + convert_position_to_rank(pex, pey, pez, posX + 1, posY - 1, posZ + 1); + const int vertexG = + convert_position_to_rank(pex, pey, pez, posX + 1, posY + 1, posZ - 1); + const int vertexH = + convert_position_to_rank(pex, pey, pez, posX + 1, posY + 1, posZ + 1); + + const int edgeA = + convert_position_to_rank(pex, pey, pez, posX - 1, posY - 1, posZ); + const int edgeB = + convert_position_to_rank(pex, pey, pez, posX, posY - 1, posZ - 1); + const int edgeC = + convert_position_to_rank(pex, pey, pez, posX + 1, posY - 1, posZ); + const int edgeD = + convert_position_to_rank(pex, pey, pez, posX, posY - 1, posZ + 1); + const int edgeE = + convert_position_to_rank(pex, pey, pez, posX - 1, posY, posZ + 1); + const int edgeF = + convert_position_to_rank(pex, pey, pez, posX + 1, posY, posZ + 1); + const int edgeG = + convert_position_to_rank(pex, pey, pez, posX - 1, posY, posZ - 1); + const int edgeH = + convert_position_to_rank(pex, pey, pez, posX + 1, posY, posZ - 1); + const int edgeI = + convert_position_to_rank(pex, pey, pez, posX - 1, posY + 1, posZ); + const int edgeJ = + convert_position_to_rank(pex, pey, pez, posX, posY + 1, posZ + 1); + const int edgeK = + convert_position_to_rank(pex, pey, pez, posX + 1, posY + 1, posZ); + const int edgeL = + convert_position_to_rank(pex, pey, pez, posX, posY + 1, posZ - 1); + + int requestcount = 0; + MPI_Status* status; + status = (MPI_Status*)malloc(sizeof(MPI_Status) * 52); + + MPI_Request* requests; + requests = (MPI_Request*)malloc(sizeof(MPI_Request) * 52); + + double* sendBuffer = nullptr; + double* recvBuffer = nullptr; + + struct timeval start; + struct timeval end; + + struct timespec sleepTS; + sleepTS.tv_sec = 0; + sleepTS.tv_nsec = sleep; + + struct timespec remainTS; + + gettimeofday(&start, NULL); + + for (int i = 0; i < repeats; ++i) { + requestcount = 0; + struct timeval iter_start; + struct timeval iter_end; + gettimeofday(&iter_start, NULL); + + if (nanosleep(&sleepTS, &remainTS) == EINTR) { + while (nanosleep(&remainTS, &remainTS) == EINTR) + ; + } + + MPI_Irecv(recvBuffer, ny * nz * vars, MPI_DOUBLE, xFaceUp, 1000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * nz * vars, MPI_DOUBLE, xFaceUp, 1000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, ny * nz * vars, MPI_DOUBLE, xFaceDown, + 1000, halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * nz * vars, MPI_DOUBLE, xFaceDown, + 1000, halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * nz * vars, MPI_DOUBLE, yFaceUp, 2000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * nz * vars, MPI_DOUBLE, yFaceUp, 2000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * nz * vars, MPI_DOUBLE, yFaceDown, + 2000, halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * nz * vars, MPI_DOUBLE, yFaceDown, + 2000, halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * ny * vars, MPI_DOUBLE, zFaceUp, 4000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * ny * vars, MPI_DOUBLE, zFaceUp, 4000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * ny * vars, MPI_DOUBLE, zFaceDown, + 4000, halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * ny * vars, MPI_DOUBLE, zFaceDown, + 4000, halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nz * vars, MPI_DOUBLE, edgeA, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nz * vars, MPI_DOUBLE, edgeA, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * vars, MPI_DOUBLE, edgeB, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * vars, MPI_DOUBLE, edgeB, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nz * vars, MPI_DOUBLE, edgeC, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nz * vars, MPI_DOUBLE, edgeC, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * vars, MPI_DOUBLE, edgeD, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * vars, MPI_DOUBLE, edgeD, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, ny * vars, MPI_DOUBLE, edgeE, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * vars, MPI_DOUBLE, edgeE, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, ny * vars, MPI_DOUBLE, edgeF, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * vars, MPI_DOUBLE, edgeF, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, ny * vars, MPI_DOUBLE, edgeG, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * vars, MPI_DOUBLE, edgeG, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, ny * vars, MPI_DOUBLE, edgeH, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, ny * vars, MPI_DOUBLE, edgeH, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nz * vars, MPI_DOUBLE, edgeI, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nz * vars, MPI_DOUBLE, edgeI, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * vars, MPI_DOUBLE, edgeJ, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * vars, MPI_DOUBLE, edgeJ, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nz * vars, MPI_DOUBLE, edgeK, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nz * vars, MPI_DOUBLE, edgeK, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Irecv(recvBuffer, nx * vars, MPI_DOUBLE, edgeL, 8000, + halo_comm, &requests[requestcount++]); + MPI_Isend(sendBuffer, nx * vars, MPI_DOUBLE, edgeL, 8000, + halo_comm, &requests[requestcount++]); + + MPI_Waitall(requestcount, requests, status); + requestcount = 0; + gettimeofday(&iter_end, NULL); + const double timeTaken = (iter_end.tv_sec-iter_start.tv_sec) + (iter_end.tv_usec-iter_start.tv_usec)*1e-6; + if (print){ + printf("Rank %d = [%d,%d,%d] iteration %d: %12.8fs\n", me, posX, posY, posZ, i, timeTaken); + } + } + + gettimeofday(&end, NULL); + + MPI_Barrier(MPI_COMM_WORLD); + + if (convert_position_to_rank(pex, pey, pez, pex / 2, pey / 2, pez / 2) == + me) { + + if (print){ + printf("# Results from rank: %d\n", me); + + const double timeTaken = + (((double)end.tv_sec) + ((double)end.tv_usec) * 1.0e-6) - + (((double)start.tv_sec) + ((double)start.tv_usec) * 1.0e-6); + + printf("Total time = %20.6f\n", timeTaken); + } + } + + MPI_Finalize(); + return 0; +} diff --git a/sstmac/skeletons/halo3d-26/parameters.ini b/sstmac/skeletons/halo3d-26/parameters.ini new file mode 100644 index 000000000..1d0e6101b --- /dev/null +++ b/sstmac/skeletons/halo3d-26/parameters.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = halo3d-26 + launch_cmd = aprun -n 64 -N 1 + argv = -pex 4 -pey 4 -pez 4 -nx 100 -ny 100 -nz 100 -iterations 3 -print 1 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +} diff --git a/sstmac/skeletons/sweep3d/parameters.ini b/sstmac/skeletons/sweep3d/parameters.ini new file mode 100644 index 000000000..0aa03e934 --- /dev/null +++ b/sstmac/skeletons/sweep3d/parameters.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = sweep3d + launch_cmd = aprun -n 64 -N 1 + argv = -pex 8 -pey 8 -nx 64 -ny 64 -nz 100 -kba 10 -vars 10 -iterations 2 -print 1 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +} diff --git a/sstmac/skeletons/sweep3d/sweep3d.cc b/sstmac/skeletons/sweep3d/sweep3d.cc new file mode 100644 index 000000000..a4dff893c --- /dev/null +++ b/sstmac/skeletons/sweep3d/sweep3d.cc @@ -0,0 +1,323 @@ +/** +Copyright 2009-2020 National Technology and Engineering Solutions of Sandia, +LLC (NTESS). Under the terms of Contract DE-NA-0003525, the U.S. Government +retains certain rights in this software. + +Sandia National Laboratories is a multimission laboratory managed and operated +by National Technology and Engineering Solutions of Sandia, LLC., a wholly +owned subsidiary of Honeywell International, Inc., for the U.S. Department of +Energy's National Nuclear Security Administration under contract DE-NA0003525. + +Copyright (c) 2009-2020, NTESS + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Questions? Contact sst-macro-help@sandia.gov +*/ +#include +#include +#include +#include +#include +#include +#include + +void get_position(const int rank, const int pex, const int pey, int* myX, + int* myY) { + *myX = rank % pex; + *myY = rank / pex; +} + +void compute(long sleep) { + struct timespec sleepTS; + sleepTS.tv_sec = 0; + sleepTS.tv_nsec = sleep; + + struct timespec remainTS; + + if (nanosleep(&sleepTS, &remainTS) == EINTR) { + while (nanosleep(&remainTS, &remainTS) == EINTR) + ; + } +} + +#define sstmac_app_name sweep3d +int USER_MAIN(int argc, char* argv[]) +{ + MPI_Init(&argc, &argv); + + int rank = -1; + int size = -1; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm sweep_comm = MPI_COMM_WORLD; + + int pex = -1; + int pey = -1; + int nx = 50; + int ny = 50; + int nz = 100; + int kba = 10; + int repeats = 1; + + int vars = 1; + long sleep = 1000; + int print = 0; + + for (int i = 0; i < argc; ++i) { + if (strcmp("-pex", argv[i]) == 0) { + pex = atoi(argv[i + 1]); + i++; + } else if (strcmp("-pey", argv[i]) == 0) { + pey = atoi(argv[i + 1]); + i++; + } else if (strcmp("-iterations", argv[i]) == 0) { + repeats = atoi(argv[i + 1]); + i++; + } else if (strcmp("-nx", argv[i]) == 0) { + nx = atoi(argv[i + 1]); + i++; + } else if (strcmp("-ny", argv[i]) == 0) { + ny = atoi(argv[i + 1]); + i++; + } else if (strcmp("-nz", argv[i]) == 0) { + nz = atoi(argv[i + 1]); + i++; + } else if (strcmp("-sleep", argv[i]) == 0) { + sleep = atol(argv[i + 1]); + i++; + } else if (strcmp("-vars", argv[i]) == 0) { + vars = atoi(argv[i + 1]); + i++; + } else if (strcmp("-kba", argv[i]) == 0) { + kba = atoi(argv[i + 1]); + i++; + } else if (strcmp(argv[i], "-print") == 0){ + print = atoi(argv[i + 1]); + i++; + } + } + + if (kba == 0) { + if (rank == 0) { + fprintf(stderr, + "K-Blocking Factor must not be zero. Please specify -kba 0>\n"); + } + MPI_Barrier(MPI_COMM_WORLD); //needed to force correct printing + exit(-1); + } + + if (nz % kba != 0) { + if (rank == 0) { + fprintf(stderr, + "KBA must evenly divide NZ, KBA=%d, NZ=%d, remainder=%d (must be " + "zero)\n", + kba, nz, (nz % kba)); + } + MPI_Barrier(MPI_COMM_WORLD); //needed to force correct printing + exit(-1); + } + + if ((pex * pey) != size) { + if (0 == rank) { + fprintf( + stderr, + "Error: processor decomposition (%d x %d) != number of ranks (%d)\n", + pex, pey, size); + } + MPI_Barrier(MPI_COMM_WORLD); //needed to force correct printing + exit(-1); + } + + if (rank == 0 && print) { + printf("# Sweep3D Communication Pattern\n"); + printf("# Info:\n"); + printf("# Px: %8d\n", pex); + printf("# Py: %8d\n", pey); + printf("# Nx x Ny x Nz: %8d x %8d x %8d\n", nx, ny, nz); + printf("# KBA: %8d\n", kba); + printf("# Variables: %8d\n", vars); + printf("# Iterations: %8d\n", repeats); + } + + int myX = -1; + int myY = -1; + + get_position(rank, pex, pey, &myX, &myY); + + const int xUp = (myX != (pex - 1)) ? rank + 1 : -1; + const int xDown = (myX != 0) ? rank - 1 : -1; + + const int yUp = (myY != (pey - 1)) ? rank + pex : -1; + const int yDown = (myY != 0) ? rank - pex : -1; + + MPI_Status status; + + double* xRecvBuffer = nullptr; + double* xSendBuffer = nullptr; + + double* yRecvBuffer = nullptr; + double* ySendBuffer = nullptr; + + struct timeval start; + struct timeval end; + + gettimeofday(&start, NULL); + + // We repeat this sequence twice because there are really 8 vertices in the 3D + // data domain and we sweep from each of them, processing the top four first + // and then the bottom four vertices next. + for (int i = 0; i < (repeats * 2); ++i) { + // Recreate communication pattern of sweep from (0,0) towards (Px,Py) + struct timeval iter_start; + struct timeval iter_end; + gettimeofday(&iter_start, NULL); + for (int k = 0; k < nz; k += kba) { + if (xDown > -1) { + MPI_Recv(xRecvBuffer, (nx * kba * vars), MPI_DOUBLE, xDown, 1000, + sweep_comm, &status); + } + + if (yDown > -1) { + MPI_Recv(yRecvBuffer, (ny * kba * vars), MPI_DOUBLE, yDown, 1000, + sweep_comm, &status); + } + + compute(sleep); + + if (xUp > -1) { + MPI_Send(xSendBuffer, (nx * kba * vars), MPI_DOUBLE, xUp, 1000, + sweep_comm); + } + + if (yUp > -1) { + MPI_Send(ySendBuffer, (nx * kba * vars), MPI_DOUBLE, yUp, 1000, + sweep_comm); + } + } + + // Recreate communication pattern of sweep from (Px,0) towards (0,Py) + for (int k = 0; k < nz; k += kba) { + if (xUp > -1) { + MPI_Recv(xRecvBuffer, (nx * kba * vars), MPI_DOUBLE, xUp, 2000, + sweep_comm, &status); + } + + if (yDown > -1) { + MPI_Recv(yRecvBuffer, (ny * kba * vars), MPI_DOUBLE, yDown, 2000, + sweep_comm, &status); + } + + compute(sleep); + + if (xDown > -1) { + MPI_Send(xSendBuffer, (nx * kba * vars), MPI_DOUBLE, xDown, 2000, + sweep_comm); + } + + if (yUp > -1) { + MPI_Send(ySendBuffer, (nx * kba * vars), MPI_DOUBLE, yUp, 2000, + sweep_comm); + } + } + + // Recreate communication pattern of sweep from (Px,Py) towards (0,0) + for (int k = 0; k < nz; k += kba) { + if (xUp > -1) { + MPI_Recv(xRecvBuffer, (nx * kba * vars), MPI_DOUBLE, xUp, 3000, + sweep_comm, &status); + } + + if (yUp > -1) { + MPI_Recv(yRecvBuffer, (ny * kba * vars), MPI_DOUBLE, yUp, 3000, + sweep_comm, &status); + } + + compute(sleep); + + if (xDown > -1) { + MPI_Send(xSendBuffer, (nx * kba * vars), MPI_DOUBLE, xDown, 3000, + sweep_comm); + } + + if (yDown > -1) { + MPI_Send(ySendBuffer, (nx * kba * vars), MPI_DOUBLE, yDown, 3000, + sweep_comm); + } + } + + // Recreate communication pattern of sweep from (0,Py) towards (Px,0) + for (int k = 0; k < nz; k += kba) { + if (xDown > -1) { + MPI_Recv(xRecvBuffer, (nx * kba * vars), MPI_DOUBLE, xDown, 4000, + sweep_comm, &status); + } + + if (yUp > -1) { + MPI_Recv(yRecvBuffer, (ny * kba * vars), MPI_DOUBLE, yUp, 4000, + sweep_comm, &status); + } + + compute(sleep); + + if (xUp > -1) { + MPI_Send(xSendBuffer, (nx * kba * vars), MPI_DOUBLE, xUp, 4000, + sweep_comm); + } + + if (yDown > -1) { + MPI_Send(ySendBuffer, (nx * kba * vars), MPI_DOUBLE, yDown, 4000, + sweep_comm); + } + } + gettimeofday(&iter_end, NULL); + const double timeTaken = (iter_end.tv_sec-iter_start.tv_sec) + (iter_end.tv_usec-iter_start.tv_usec)*1e-6; + if (print){ + printf("Rank %d = [%d,%d] iteration %d: %12.8fs\n", rank, myX, myY, i, timeTaken); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + gettimeofday(&end, NULL); + + const double timeTaken = + (((double)end.tv_sec) + ((double)end.tv_usec) * 1.0e-6) - + (((double)start.tv_sec) + ((double)start.tv_usec) * 1.0e-6); + + if (rank == 0){ + if (print){ + printf("Total time = %20.6f\n", timeTaken); + } + } + MPI_Finalize(); + return 0; +} diff --git a/tests/Makefile.clang_tests b/tests/Makefile.clang_tests index 17606ad5c..9461f4404 100644 --- a/tests/Makefile.clang_tests +++ b/tests/Makefile.clang_tests @@ -67,7 +67,7 @@ CLANGTEMP=$(CLANGTESTS:%=test_clang_%.tmp-out) .PRECIOUS: $(CLANGTEMP) test_clang_%.$(CHKSUF): test_clang_%.tmp-out - $(top_srcdir)/tests/checkdiff $< $(top_srcdir) + $(top_builddir)/tests/checkdiff $< $(top_srcdir) test_clang_%_cpp.tmp-out: sst.pp.%.cc $(SSTMAC_DEGLOBAL) -$(CXX) -std=c++11 -c $< -o tmp.o \ diff --git a/tests/Makefile.core_tests b/tests/Makefile.core_tests index fb3a41011..0eb70c64c 100644 --- a/tests/Makefile.core_tests +++ b/tests/Makefile.core_tests @@ -5,6 +5,9 @@ CORETESTS+= \ test_sumi_collective \ + test_core_apps_fft \ + test_core_apps_halo3d \ + test_core_apps_sweep3d \ test_core_apps_ping_pong_snappr \ test_core_apps_ping_pong_mem_thrash \ test_core_apps_ping_all_dfly_snappr \ diff --git a/tests/checkdiff b/tests/checkdiff.in similarity index 95% rename from tests/checkdiff rename to tests/checkdiff.in index 7b8fe9718..38a3f3be2 100755 --- a/tests/checkdiff +++ b/tests/checkdiff.in @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! @pyexe@ import sys import signal @@ -6,6 +6,8 @@ import time import os import subprocess import re + +sys.path.append("@abs_top_srcdir@/bin") from configlib import getstatusoutput class bcolors: diff --git a/tests/reference/test_core_apps_fft.ref-out b/tests/reference/test_core_apps_fft.ref-out new file mode 100644 index 000000000..0c379e318 --- /dev/null +++ b/tests/reference/test_core_apps_fft.ref-out @@ -0,0 +1 @@ +Estimated total runtime of 0.00007262 seconds diff --git a/tests/reference/test_core_apps_halo3d.ref-out b/tests/reference/test_core_apps_halo3d.ref-out new file mode 100644 index 000000000..2ba4979b5 --- /dev/null +++ b/tests/reference/test_core_apps_halo3d.ref-out @@ -0,0 +1 @@ +Estimated total runtime of 0.00070607 seconds diff --git a/tests/reference/test_core_apps_sweep3d.ref-out b/tests/reference/test_core_apps_sweep3d.ref-out new file mode 100644 index 000000000..e4f98a449 --- /dev/null +++ b/tests/reference/test_core_apps_sweep3d.ref-out @@ -0,0 +1 @@ +Estimated total runtime of 0.00254201 seconds diff --git a/tests/test_configs/test_fft.ini b/tests/test_configs/test_fft.ini new file mode 100644 index 000000000..07978eb02 --- /dev/null +++ b/tests/test_configs/test_fft.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = fft + launch_cmd = aprun -n 64 -N 1 + argv = -pex 4 -pey 4 -pez 4 -nx 100 -ny 100 -nz 100 -iterations 2 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +} diff --git a/tests/test_configs/test_halo3d.ini b/tests/test_configs/test_halo3d.ini new file mode 100644 index 000000000..aca89db66 --- /dev/null +++ b/tests/test_configs/test_halo3d.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = halo3d-26 + launch_cmd = aprun -n 64 -N 1 + argv = -pex 4 -pey 4 -pez 4 -nx 100 -ny 100 -nz 100 -iterations 3 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +} diff --git a/tests/test_configs/test_sweep3d.ini b/tests/test_configs/test_sweep3d.ini new file mode 100644 index 000000000..99cd3783b --- /dev/null +++ b/tests/test_configs/test_sweep3d.ini @@ -0,0 +1,59 @@ + +node { + app1 { + indexing = block + allocation = first_available + name = sweep3d + launch_cmd = aprun -n 64 -N 1 + argv = -pex 8 -pey 8 -nx 32 -ny 32 -nz 20 -kba 10 -vars 10 -iterations 1 -print 0 + } + nic { + name = snappr + injection { + mtu = 1KB + bandwidth = 10GB/s + latency = 1us + credits = 64KB + } + } + memory { + name = snappr + channel_bandwidth = 10GB/s + num_channels = 8 + mtu = 1MB + latency = 15ns + } + proc { + ncores = 4 + frequency = 2.1Ghz + } + name = simple +} + +switch { + name = snappr + arbitrator = fifo + mtu = 1KB + link { + bandwidth = 2.5GB/s + latency = 100ns + credits = 64KB + } + logp { + bandwidth = 2.5GB/s + out_in_latency = 1us + hop_latency = 100ns + } + router { + name = torus_minimal + } +} + + +topology { + name = torus + seed = 14 + geometry = [4,4,4] + redundant = [8,4,8] + concentration = 1 +}