Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MVAPICH AllGather test deadlocks #668

Open
jpkenny opened this issue Feb 16, 2022 · 0 comments
Open

MVAPICH AllGather test deadlocks #668

jpkenny opened this issue Feb 16, 2022 · 0 comments
Assignees
Labels

Comments

@jpkenny
Copy link
Contributor

jpkenny commented Feb 16, 2022

The MPI all gather app in skeletons/tests:

#include <mpi.h>
#include <stddef.h>
#include <cstdio>
#include <iostream>

/**
 * Minimal MPI_Allgather test.
 *
 * Every rank contributes `nelems` ints and receives `nelems` ints from each
 * rank in MPI_COMM_WORLD. Rank 0 prints progress markers before the
 * collective, after the collective, and after MPI_Finalize so a hang can be
 * localized from the output.
 *
 * When VALIDATE_BUFFERS is defined, real buffers are allocated, the send
 * buffer is filled with the caller's rank, and the gathered data is checked
 * so that slot i holds the value i everywhere. Otherwise the
 * sstmac_nullptr_send / sstmac_nullptr_recv placeholders are passed
 * (NOTE(review): presumably payload-free buffers supplied by the SST/macro
 * toolchain -- confirm against its headers).
 *
 * @param argc forwarded to MPI_Init
 * @param argv forwarded to MPI_Init
 * @return 0 always
 */
int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0;
  int size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (rank == 0){
    std::cout << "Starting collective" << std::endl;
  }

  // Single source of truth for the per-rank element count: it sizes the
  // buffers and is the count passed to MPI_Allgather below.
  const int nelems = 100;
//#define VALIDATE_BUFFERS
#ifdef VALIDATE_BUFFERS
  int* send_buf = new int[nelems];
  int* recv_buf = new int[nelems * size];
  for (int i=0; i < nelems; ++i){
    send_buf[i] = rank;
  }
  // Poison the receive buffer so untouched slots are detectable afterwards.
  for (int i=0; i < size; ++i){
    for (int j=0; j < nelems; ++j){
      recv_buf[i*nelems + j] = -1;
    }
  }
#else
  void* send_buf = sstmac_nullptr_send;
  void* recv_buf = sstmac_nullptr_recv;
#endif

  MPI_Allgather(send_buf, nelems, MPI_INT,
                recv_buf, nelems, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0){
    std::cout << "Cleared collective" << std::endl;
  }
#ifdef VALIDATE_BUFFERS
  // Block i of the receive buffer came from rank i, which sent its own rank.
  for (int i=0; i < size; ++i){
    const int* values = recv_buf + i*nelems;
    for (int j=0; j < nelems; ++j){
      if (values[j] != i){
        std::printf("V[%d][%d] = %d != %d\n", i, j, values[j], i);
      }
    }
  }
  // Release the validation buffers; the placeholder path owns nothing.
  delete[] send_buf;
  delete[] recv_buf;
#endif

  MPI_Finalize();

  if (rank == 0){
    std::cout << "Cleared finalize" << std::endl;
  }

  return 0;
}

Deadlocks with the following param file when run over MVAPICH:

# Per-node parameters: OS, the launched application (app1), NIC, memory and
# processor sub-blocks.
node {
 os {
  stack_size = 1MB
 }
 # Application under test: the ./run binary, launched with 32 processes.
 # SLURM_NPROCS in the app environment is set to the same count as `-n`.
 app1 { 
  exe = ./run
  argv = 
  launch_cmd = aprun -n 32 -N 1
  apis = [libfabric, pmi:libfabric]
  env {
    SLURM_NPROCS = 32
  }
 }
 nic {
  name = snappr
  credits = 64KB
  mtu = 4096
  bandwidth = 10.0GB/s
  injection {
   bandwidth = 10.0GB/s
   latency = 50ns
   # NOTE(review): injection mtu (1024) differs from the NIC-level mtu (4096)
   # above -- confirm this is intentional.
   mtu = 1024
   credits = 64KB
   # send_state / recv_state: ftq_calendar statistics written to output "ftq"
   # with a 1us epoch.
   send_state {
     group = state
     type = ftq_calendar
     output = ftq
     epoch_length = 1us
    }
   recv_state {
     group = state
     type = ftq_calendar
     output = ftq
     epoch_length = 1us
    }
  }
  ejection {
   latency = 50ns
  }
 }
 memory {
  name = snappr
  channel_bandwidth = 7GB/s
  num_channels = 10
  latency = 10ns
 }
 proc {
  ncores = 4
  frequency = 2GHz
 }
 name = simple
}


# Switch parameters: model name, credits, per-link settings with three
# accumulator statistics (xmit_active / xmit_idle / xmit_stall, all in group
# "test"), and a separate logp sub-block.
switch {
 name = snappr
 credits = 64KB
 link {
  bandwidth = 10.0GB/s
  latency = 100ns
  credits = 64KB
  xmit_active {
   group = test
   type = accumulator
  }
  xmit_idle {
   group = test
   type = accumulator
  }
  xmit_stall {
   group = test
   type = accumulator
  }
 }
 # NOTE(review): logp bandwidth (1GB/s) is lower than the link bandwidth
 # (10.0GB/s) above -- presumably intentional; confirm.
 logp {
  bandwidth = 1GB/s
  out_in_latency = 100ns
  hop_latency = 100ns
 }
}

# Network topology: dragonfly with geometry [32,9], h = 16, circulant
# inter-group wiring and a concentration of 16.
topology {
  name = dragonfly
  geometry = [32,9]
  h = 16
  inter_group = circulant
  concentration = 16
}

# Second `switch` block: only selects the router (dragonfly_minimal).
# NOTE(review): presumably merged with the earlier `switch` block by the
# parameter reader -- confirm.
switch {
  router { 
    name = dragonfly_minimal
  }
}

I used the following Makefile:

# Output binary and its single source file.
TARGET := run
SRC := allgather.cc

# Compiler drivers (presumably the SST/macro wrappers -- confirm) and flags.
CXX := sst++
CC := sstcc
CXXFLAGS := --disable-mpi -fPIC -O0 -g
CPPFLAGS := -I. -I/home/jpkenny/install/mv2-ofi-netmod/include
LIBDIR :=
# PREFIX is empty by default, so $(PREFIX)/lib and $(PREFIX)/bin resolve to
# /lib and /bin unless PREFIX is overridden on the command line.
PREFIX :=
# Link against the MVAPICH build and hwloc, with rpaths for the runtime
# libraries. The last rpath is an absolute path: a stray `$` in front of it
# would make `$/` expand as the (undefined) make variable `/` and turn the
# path into a relative one.
LDFLAGS := /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.so /home/jpkenny/install/mv2-ofi-netmod/lib/libmpi.a /usr/lib64/libhwloc.so -Wl,-rpath,$(PREFIX)/lib -Wl,-rpath,/home/jpkenny/install/mv2-ofi-netmod/lib -Wl,-rpath,/home/jpkenny/install/sst-transports/lib

# Derive object file names from SRC by replacing .cc, .cpp and .c suffixes
# with .o in turn.
OBJ := $(SRC:.cc=.o)
OBJ := $(OBJ:.cpp=.o)
OBJ := $(OBJ:.c=.o)

.PHONY: clean install

# Default goal: build the test binary.
all: $(TARGET)

# Link step. `$+` lists all prerequisites (duplicates preserved); LIBS is not
# set in this Makefile and may be supplied from the environment/command line.
$(TARGET): $(OBJ)
	$(CXX) -o $@ $+ $(LDFLAGS) $(LIBS)  $(CXXFLAGS)

# Compile C++ sources (.cc) with the C++ driver.
%.o: %.cc
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

# Compile C sources with the C driver. CFLAGS is not defined in this Makefile.
%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

# Remove the binary and all derived object files.
clean:
	rm -f $(TARGET) $(OBJ)

# Copy the built binary into $(PREFIX)/bin.
install: $(TARGET)
	cp $< $(PREFIX)/bin
@jpkenny jpkenny self-assigned this Feb 16, 2022
@jpkenny jpkenny added the bug label Apr 19, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

1 participant